In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# Load the preprocessed datasets
sentiment140_data = pd.read_csv('cleaned_sentiment140_data.csv')
trustpilot_reviews_data = pd.read_csv('cleaned_trustpilot_reviews_data.csv')
twitter_data = pd.read_csv('cleaned_twitter_data.csv')
reviews_data = pd.read_csv('cleaned_reviews_data.csv')
ratings_beauty_data = pd.read_csv('cleaned_ratings_beauty_data.csv')

In [2]:
# Combine all datasets into one
combined_data = pd.concat([sentiment140_data, trustpilot_reviews_data, twitter_data, reviews_data, ratings_beauty_data], ignore_index=True)

# Display first few rows of combined data to verify
print("Combined Data:")
print(combined_data.head())

Combined Data:
   Polarity            Id                          Date     Query      User  \
0       0.0  1.467811e+09  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY  mattycus   
1       0.0  1.467811e+09  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   ElleCTF   
2       0.0  1.467811e+09  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY    Karoli   
3       0.0  1.467811e+09  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY  joy_wolf   
4       0.0  1.467812e+09  Mon Apr 06 22:20:03 PDT 2009  NO_QUERY   mybirch   

                                                Text sentiment name  \
0  @Kenichan I dived many times for the ball. Man...   neutral  NaN   
1    my whole body feels itchy and like its on fire    neutral  NaN   
2  @nationwideclass no, it's not behaving at all....   neutral  NaN   
3                      @Kwesidei not the whole crew    neutral  NaN   
4                                        Need a hug    neutral  NaN   

  company_url trustpilot_url  ... ProductId UserId ProfileName  \
0         NaN    

In [3]:
# Check for missing values in the 'Text' column
missing_text_count = combined_data['Text'].isnull().sum()
print(f"Missing values in 'Text': {missing_text_count}")
# Drop rows where 'Text' is NaN
combined_data.dropna(subset=['Text'], inplace=True)
# Fill missing values with an empty string
combined_data['Text'].fillna('', inplace=True)
# Verify the number of missing values again
print(f"Missing values in 'Text' after filling: {combined_data['Text'].isnull().sum()}")


Missing values in 'Text': 2027268
Missing values in 'Text' after filling: 0


# Feature Extraction Using TF-IDF

In [4]:
# Convert text into numerical data
tfidf = TfidfVectorizer(max_features=5000) 
X = tfidf.fit_transform(combined_data['Text'])
y = combined_data['sentiment']

# Model Building

In [5]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Model Evaluation

In [7]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Display evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


In [8]:
# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.65      0.74     11715
     neutral       0.99      0.99      0.99    321979
    positive       0.95      0.98      0.96     99997

    accuracy                           0.98    433691
   macro avg       0.94      0.87      0.90    433691
weighted avg       0.98      0.98      0.98    433691



# Save the trained model

In [9]:
joblib.dump(model, 'sentiment_analysis_model.pkl')
print("Model saved as 'sentiment_analysis_model.pkl'")

Model saved as 'sentiment_analysis_model.pkl'
