<a href="https://colab.research.google.com/github/Asoub2001/saas-review-model/blob/main/SaaS_Review_Sentiment_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title Default title text
import pandas as pd

# Load Amazon reviews dataset
amazon = pd.read_csv('/Reviews.csv')

# Show available columns to inspect
print("Amazon columns:", amazon.columns)
amazon[['Score', 'Text']].head()

Amazon columns: Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [3]:
# Keep only reviews with text and score
amazon = amazon[['Score', 'Text']].dropna()

# Filter out weird scores (just keep 1 to 5)
amazon = amazon[amazon['Score'].isin([1, 2, 3, 4, 5])]

# Create sentiment column
amazon['sentiment'] = amazon['Score'].apply(
    lambda x: 'Negative' if x <= 2 else ('Neutral' if x == 3 else 'Positive')
)

# Rename for consistency
amazon = amazon.rename(columns={'Text': 'review_text'})
amazon['source'] = 'Amazon'

# Final clean Amazon
amazon_final = amazon[['review_text', 'sentiment', 'source']]

# Show a preview
amazon_final.sample(5)

Unnamed: 0,review_text,sentiment,source
30466,"I am quite the chocolate snob, and while this ...",Neutral,Amazon
463467,I can buy twice the size of this container at ...,Negative,Amazon
387776,Panettone is an Italian Christmas tradition. A...,Neutral,Amazon
273869,"This product, as well as the beef broth, taste...",Positive,Amazon
22295,"If you are a tea lover, this is the one for yo...",Positive,Amazon


In [4]:
# Load Trustpilot reviews
trust = pd.read_csv('/trust_pilot_reviews_data_2022_06.csv')

# Inspect available columns
print("Trustpilot columns:", trust.columns)
trust[['review_text', 'rating']].head()

Trustpilot columns: Index(['name', 'company_url', 'trustpilot_url', 'description', 'author_name',
       'review_title', 'review_text', 'rating', 'reviewed_at', 'uniq_id',
       'scraped_at'],
      dtype='object')


Unnamed: 0,review_text,rating
0,Lowest price in the country for what we wanted...,5
1,got it on time they took notice that I let the...,5
2,"Delivered on time. Products as described, pack...",5
3,Quick and efficient service..Pleased,5
4,Excellent service packed well. Really pleased.,5


In [5]:
# Clean Trustpilot dataset
trust = trust[['review_text', 'rating']].dropna()

# Assign sentiment
trust['sentiment'] = trust['rating'].apply(
    lambda x: 'Negative' if x <= 2 else ('Neutral' if x == 3 else 'Positive')
)
trust['source'] = 'Trustpilot'

# Final clean
trust_final = trust[['review_text', 'sentiment', 'source']]

# Show a preview
trust_final.sample(5)

Unnamed: 0,review_text,sentiment,source
840,"Speaking with Ella on Thursday, to review poli...",Positive,Trustpilot
3471,Excellent quality and looks amazing.,Positive,Trustpilot
2341,"What can I add? Quick, correct and keenly pric...",Positive,Trustpilot
2414,My piccasso silver earrings arrived very quick...,Positive,Trustpilot
1237,If you are thinking of buying this fence or an...,Negative,Trustpilot


In [6]:
# Combine both
combined = pd.concat([amazon_final, trust_final], ignore_index=True)

# Save to file
combined.to_csv('combined_reviews.csv', index=False)


# Show a preview
combined.sample(5)


Unnamed: 0,review_text,sentiment,source
125403,When my next door neighbor moved she left a bo...,Positive,Amazon
156569,Oh yeah! My puppy (3 y/o burmese/border collie...,Positive,Amazon
115669,"Shelled, clean, excellent quality, convenient ...",Positive,Amazon
493204,I am not a fan of flavored coffee and I though...,Negative,Amazon
545859,I was a bit hesitant that the dog would stain ...,Positive,Amazon


In [7]:
import re

# Define a function to clean the review text
def clean_text(text):
    text = text.lower()  # Convert all text to lowercase to standardize
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove numbers and punctuation, keep letters and spaces only
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space and strip leading/trailing spaces
    return text

# Apply the text cleaning function to the review_text column
# First, convert to string type to avoid issues with missing data
combined['clean_text'] = combined['review_text'].astype(str).apply(clean_text)

In [8]:
import nltk

# Download stopwords (common words like "the", "and", etc.)
nltk.download('stopwords')

# Import the list of English stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Define a function to remove stopwords from a sentence
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

# Apply the stopword removal to the 'clean_text' column
combined['clean_text'] = combined['clean_text'].apply(remove_stopwords)

# Save the cleaned and preprocessed dataset to a CSV file
combined.to_csv('cleaned_combined_reviews.csv', index=False)

# Download the cleaned CSV file
from google.colab import files
files.download('cleaned_combined_reviews.csv')

# Show a preview
combined.sample(5)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,review_text,sentiment,source,clean_text
209601,I recently purchased these treats because they...,Neutral,Amazon,recently purchased treats sale however waited ...
337419,"I like a strong, bold, dark roast coffee and D...",Positive,Amazon,like strong bold dark roast coffee double blac...
224624,I orderd and received the cocoa in very good t...,Positive,Amazon,orderd received cocoa good time excellent prod...
195785,One of my favorites.....hopefully this continu...,Positive,Amazon,one favoriteshopefully continues reasonable pr...
381426,I thought that this was a very tasty treat. V...,Positive,Amazon,thought tasty treat comparable nongluten free ...


In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Encode sentiment labels to numbers
label_encoder = LabelEncoder()
combined['label'] = label_encoder.fit_transform(combined['sentiment'])

# Show encoding mapping
label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_map)

# TF-IDF vectorizer to convert text to numerical features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(combined['clean_text'])
y = combined['label']

# Split data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

# Display the evaluation results
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Label Mapping: {'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}

Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.67      0.71     16556
     Neutral       0.51      0.18      0.27      8643
    Positive       0.90      0.97      0.93     89232

    accuracy                           0.87    114431
   macro avg       0.72      0.61      0.63    114431
weighted avg       0.84      0.87      0.85    114431



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Apply class weights to handle imbalance (Neutral class gets more weight)
model_weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
model_weighted.fit(X_train, y_train)

# Predict and evaluate
y_pred_weighted = model_weighted.predict(X_test)

# Show updated performance
print("\nWeighted Logistic Regression:")
print(classification_report(y_test, y_pred_weighted, target_names=label_encoder.classes_))



Weighted Logistic Regression:
              precision    recall  f1-score   support

    Negative       0.63      0.74      0.68     16556
     Neutral       0.26      0.63      0.37      8643
    Positive       0.97      0.81      0.88     89232

    accuracy                           0.78    114431
   macro avg       0.62      0.73      0.64    114431
weighted avg       0.87      0.78      0.81    114431



In [11]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict
y_pred_nb = nb_model.predict(X_test)

# Evaluate
print("\nNaive Bayes Model:")
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))



Naive Bayes Model:
              precision    recall  f1-score   support

    Negative       0.83      0.27      0.41     16556
     Neutral       0.53      0.00      0.01      8643
    Positive       0.81      1.00      0.90     89232

    accuracy                           0.82    114431
   macro avg       0.73      0.42      0.44    114431
weighted avg       0.80      0.82      0.76    114431



In [12]:
import json

results = {
    "Logistic Regression (Weighted)": {
        "report": classification_report(y_test, y_pred_weighted, target_names=label_encoder.classes_),
        "conf_matrix": confusion_matrix(y_test, y_pred_weighted).tolist()
    },
    "Naive Bayes": {
        "report": classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_),
        "conf_matrix": confusion_matrix(y_test, y_pred_nb).tolist()
    }
}

# Save to file
with open("model_results.json", "w") as f:
    json.dump(results, f)

# Download
from google.colab import files
files.download("model_results.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>