In [None]:
import json
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import download
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Module-level side effect: ask NLTK's downloader for the 'vader_lexicon'
# resource before any SentimentIntensityAnalyzer is used below.
download('vader_lexicon')  # Make sure the VADER lexicon is downloaded

def preprocess_text(text):
    """Normalize a piece of text by converting it to lowercase.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

# Lazily created analyzer shared by every get_vader_sentiment() call.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def get_vader_sentiment(text, threshold=0.05):
    """Classify text as positive (1) or not (0) from VADER's compound score.

    Args:
        text: The text to score.
        threshold: Minimum compound score for a positive label. Defaults to
            0.05, the cutoff this function has always used.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    # Reuse one analyzer instead of constructing a new one for every text;
    # this function is called once per sample during evaluation.
    compound = _get_vader_analyzer().polarity_scores(text)['compound']
    return 1 if compound >= threshold else 0

# Load the dataset: the file is read line by line, each line holding one JSON
# object with a 'reviewText' string and an 'overall' rating.
X_data, y_data = [], []
file_path = '/Users/aqibullah/Downloads/15000.json'  # Update this path
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline at end of file); json.loads
        # would raise JSONDecodeError on them.
        if not line.strip():
            continue
        json_data = json.loads(line)
        # `or ''` covers both a missing key and an explicit JSON null, which
        # .get() would hand back as None and break preprocess_text().
        text = json_data.get('reviewText') or ''
        processed_text = preprocess_text(text)
        # Same idea for the rating: a missing or null value is treated as 0.
        # NOTE(review): such records end up labeled negative — confirm that
        # is intended rather than dropping them from the evaluation.
        overall = json_data.get('overall') or 0
        sentiment = 1 if overall > 3 else 0  # Example binary sentiment based on rating
        X_data.append(processed_text)
        y_data.append(sentiment)

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Label every held-out text with VADER.
y_pred = list(map(get_vader_sentiment, X_test))

# Score the predictions against the reference labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report the results, one metric per line.
print("VADER Model Performance:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(label, value)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aqibullah/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER Model Performance:
Accuracy: 0.7466666666666667
Precision: 0.8242967794537301
Recall: 0.8600595491280306
F1-Score: 0.8417985012489593
