In [None]:
import json
import joblib
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fetch the NLTK resources used below: 'punkt' and 'stopwords'.
import nltk
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# English stop words, kept in a set so membership checks are cheap
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return *text* reduced to its alphanumeric, non-stop-word tokens.

    The text is split with ``word_tokenize``. A token is kept only when its
    lowercase form is not in ``stop_words`` and it consists solely of
    alphanumeric characters. Kept tokens keep their original casing and are
    joined with single spaces.

    NOTE(review): if ``stop_words`` contains negations (NLTK's English list
    is believed to include words such as "not" and "no"), they are dropped
    here, which may change the sentiment of the text later scored by
    TextBlob. Confirm this is intended.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        if not token.isalnum():
            # Drops punctuation and tokens containing non-alphanumerics
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def get_textblob_sentiment(text):
    """Classify *text* as positive (1) or non-positive (0) using TextBlob.

    Returns 1 when TextBlob's polarity for the text is strictly greater
    than zero, and 0 otherwise, so a polarity of exactly zero counts as 0.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1
    return 0

# Load the reviews (one JSON object per line) and build the evaluation data.
# X_data receives the preprocessed review text; y_data receives the label
# derived from the rating (1 when the rating is above 3, otherwise 0).
X_data, y_data = [], []
file_path = '/Users/aqibullah/Downloads/15000.json'
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; json.loads would raise on them.
            continue
        json_data = json.loads(line)
        overall = json_data.get('overall')
        if overall is None:
            # Without a rating there is no ground-truth label. Skip the
            # record instead of silently counting it as negative.
            continue
        # 'reviewText' may be absent or explicitly null; treat both as empty
        # so the tokenizer always receives a string.
        text = json_data.get('reviewText') or ''
        processed_text = preprocess_text(text)
        sentiment = 1 if overall > 3 else 0  # Using rating to determine sentiment (for comparison)
        X_data.append(processed_text)
        y_data.append(sentiment)

# Hold out 20% of the samples as the test set; the seed is fixed so the
# split is the same on every run.
split_parts = train_test_split(X_data, y_data, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# TextBlob is applied to the processed text as-is (no vectorization step),
# producing one 0/1 prediction per test sample.
y_pred = list(map(get_textblob_sentiment, X_test))

# Score the TextBlob predictions against the rating-derived labels
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line
print("TextBlob Model Performance:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(label, value)

# Persist the test texts, their labels and the TextBlob predictions so the
# run can be reviewed later without recomputing.
saved_results = {'X_test': X_test, 'y_test': y_test, 'y_pred': y_pred}
joblib.dump(saved_results, 'textblob_performance.pkl')

[nltk_data] Downloading package punkt to /Users/aqibullah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aqibullah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TextBlob Model Performance:
Accuracy: 0.7766666666666666
Precision: 0.8128023818384815
Recall: 0.9289663972777541
F1-Score: 0.8670107185391028


['textblob_performance.pkl']