# Data Loading and Exploration

In [1]:
import pandas as pd

# Load datasets
sentiment140_data = pd.read_csv('new_train_data_s140.csv') 
trustpilot_reviews_data = pd.read_csv('trust_pilot_reviews_data_2022_06.csv') 
twitter_data = pd.read_csv('Twitter Scraping Tweets Dataset.csv')
reviews_data = pd.read_csv('Reviews.csv') 
ratings_beauty_data = pd.read_csv('ratings_beauty.csv')

In [2]:
# Display first few rows of each dataset to check
print("Sentiment140 Data:")
print(sentiment140_data.head())
print("\nTrustpilot Reviews Data:")
print(trustpilot_reviews_data.head())
print("\nTwitter Data:")
print(twitter_data.head())
print("\nReviews Data:")
print(reviews_data.head())
print("\nRatings Beauty Data:")
print(ratings_beauty_data.head())

Sentiment140 Data:
   Polarity          Id                          Date     Query      User  \
0         0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY  mattycus   
1         0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   ElleCTF   
2         0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY    Karoli   
3         0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY  joy_wolf   
4         0  1467811592  Mon Apr 06 22:20:03 PDT 2009  NO_QUERY   mybirch   

                                                Text  
0  @Kenichan I dived many times for the ball. Man...  
1    my whole body feels itchy and like its on fire   
2  @nationwideclass no, it's not behaving at all....  
3                      @Kwesidei not the whole crew   
4                                        Need a hug   

Trustpilot Reviews Data:
        name                 company_url  \
0  Poundshop  https://www.poundshop.com/   
1  Poundshop  https://www.poundshop.com/   
2  Poundshop  https://www.poun

# Data Preprocessing
**Cleaning Text Data**

In [3]:
def preprocess_sentiment140(data):
    data = data[['Polarity', 'Text']]
    return data

sentiment140_preprocessed = preprocess_sentiment140(sentiment140_data)

In [4]:
def preprocess_trustpilot(data):
    data = data[['review_text', 'rating']]
    data.rename(columns={'review_text': 'Text', 'rating': 'Polarity'}, inplace=True)
    data['Polarity'] = data['Polarity'].apply(lambda x: 1 if x > 3 else 0)
    return data

trustpilot_preprocessed = preprocess_trustpilot(trustpilot_reviews_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'review_text': 'Text', 'rating': 'Polarity'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Polarity'] = data['Polarity'].apply(lambda x: 1 if x > 3 else 0)


In [5]:
def preprocess_twitter(data):
    data = data[['text', 'label']]
    data.rename(columns={'text': 'Text', 'label': 'Polarity'}, inplace=True)
    return data

twitter_preprocessed = preprocess_twitter(twitter_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'text': 'Text', 'label': 'Polarity'}, inplace=True)


In [6]:
def preprocess_reviews(data):
    data = data[['Text', 'Score']]
    data['Polarity'] = data['Score'].apply(lambda x: 1 if x > 3 else 0)
    return data[['Polarity', 'Text']] 
reviews_preprocessed = preprocess_reviews(reviews_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Polarity'] = data['Score'].apply(lambda x: 1 if x > 3 else 0)


In [7]:
def preprocess_ratings_beauty(data):
    data['Polarity'] = data['Rating'].apply(lambda x: 1 if x > 3 else 0)
    return data[['Polarity']]

ratings_beauty_preprocessed = preprocess_ratings_beauty(ratings_beauty_data)

**Save the Preprocessed Datasets**

In [8]:
sentiment140_preprocessed.to_csv('sentiment140_preprocessed.csv', index=False)
trustpilot_preprocessed.to_csv('trustpilot_preprocessed.csv', index=False)
twitter_preprocessed.to_csv('twitter_preprocessed.csv', index=False)
reviews_preprocessed.to_csv('reviews_preprocessed.csv', index=False)
ratings_beauty_preprocessed.to_csv('ratings_beauty_preprocessed.csv', index=False)

#  Data Combination and Shuffling

In [9]:
# Combine datasets (only those with text)
combined_data = pd.concat([
    sentiment140_preprocessed, 
    trustpilot_preprocessed, 
    twitter_preprocessed, 
    reviews_preprocessed
], ignore_index=True)

# Shuffle combined data
combined_data = combined_data.sample(frac=1).reset_index(drop=True)

# Check the columns of the combined dataset
print(combined_data.columns)

Index(['Polarity', 'Text'], dtype='object')


In [10]:
# Check for and handle missing values
print("Missing values before handling:")
print(sentiment140_data.isnull().sum())

# Drop rows with NaN values
sentiment140_data.dropna(inplace=True)


Missing values before handling:
Polarity    0
Id          0
Date        0
Query       0
User        0
Text        0
dtype: int64


# Feature Extraction

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(combined_data['Text'])

# Labels for sentiment
y = combined_data['Polarity']

In [12]:
# Ensure y is numeric
y = pd.to_numeric(y, errors='coerce')

# Drop rows with missing target values (y)
X = X[y.notnull()]
y = y[y.notnull()]

# Train-Test Split and Model Training

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Train the Logistic Regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Evaluate the model
lr_accuracy = lr.score(X_test, y_test)
print(f"Model Accuracy: {lr_accuracy:.2f}")

Model Accuracy: 0.80


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Save the Model and Vectorizer

In [15]:
import joblib

# Save the trained model and vectorizer
joblib.dump(lr, 'sentiment_analysis_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']