<a href="https://colab.research.google.com/github/Asoub2001/saas-review-dashboard/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# @title Default title text
import pandas as pd

# Load Amazon reviews dataset
amazon = pd.read_csv('/Reviews.csv')

# Show available columns to inspect
print("Amazon columns:", amazon.columns)
amazon[['Score', 'Text']].head()




Amazon columns: Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


Unnamed: 0,Score,Text
0,5.0,I have bought several of the Vitality canned d...
1,1.0,Product arrived labeled as Jumbo Salted Peanut...
2,4.0,This is a confection that has been around a fe...
3,2.0,If you are looking for the secret ingredient i...
4,5.0,Great taffy at a great price. There was a wid...


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [8]:
# Keep only reviews with text and score
amazon = amazon[['Score', 'Text']].dropna()

# Filter out weird scores (just keep 1 to 5)
amazon = amazon[amazon['Score'].isin([1, 2, 3, 4, 5])]

# Create sentiment column
amazon['sentiment'] = amazon['Score'].apply(
    lambda x: 'Negative' if x <= 2 else ('Neutral' if x == 3 else 'Positive')
)

# Rename for consistency
amazon = amazon.rename(columns={'Text': 'review_text'})
amazon['source'] = 'Amazon'

# Final clean Amazon
amazon_final = amazon[['review_text', 'sentiment', 'source']]

# Show a preview
amazon_final.sample(5)


Unnamed: 0,review_text,sentiment,source
104480,"The texture of this chip is firm, solid, subst...",Positive,Amazon
103014,We use it at night to lower diabetes sugar and...,Positive,Amazon
73107,We love Steaz green tea with peach at our hous...,Negative,Amazon
93497,Sooo bland. Was excited to try this but sadly...,Negative,Amazon
47353,It was supposed to be whole cashews and I assu...,Negative,Amazon


In [11]:
# Load Trustpilot reviews
trust = pd.read_csv('/trust_pilot_reviews_data_2022_06.csv')

# Inspect available columns
print("Trustpilot columns:", trust.columns)
trust[['review_text', 'rating']].head()


Trustpilot columns: Index(['name', 'company_url', 'trustpilot_url', 'description', 'author_name',
       'review_title', 'review_text', 'rating', 'reviewed_at', 'uniq_id',
       'scraped_at'],
      dtype='object')


Unnamed: 0,review_text,rating
0,Lowest price in the country for what we wanted...,5
1,got it on time they took notice that I let the...,5
2,"Delivered on time. Products as described, pack...",5
3,Quick and efficient service..Pleased,5
4,Excellent service packed well. Really pleased.,5


In [14]:
# Clean Trustpilot dataset
trust = trust[['review_text', 'rating']].dropna()

# Assign sentiment
trust['sentiment'] = trust['rating'].apply(
    lambda x: 'Negative' if x <= 2 else ('Neutral' if x == 3 else 'Positive')
)
trust['source'] = 'Trustpilot'

# Final clean
trust_final = trust[['review_text', 'sentiment', 'source']]

# Show a preview
trust_final.sample(5)


Unnamed: 0,review_text,sentiment,source
2555,No follow up or actual customer service.If you...,Negative,Trustpilot
1461,Gave another assingment and very happy with th...,Positive,Trustpilot
352,Very easy to buy from and quick delivery,Positive,Trustpilot
2625,"My 4 year old thoroughly enjoys his class, alt...",Positive,Trustpilot
905,Wonder support and service throughout my exper...,Positive,Trustpilot


In [15]:
# Combine both
combined = pd.concat([amazon_final, trust_final], ignore_index=True)

# Save to file
combined.to_csv('combined_reviews.csv', index=False)


# Show a preview
combined.sample(5)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,review_text,sentiment,source
113839,I feel this product has great packaging to hel...,Positive,Amazon
38034,"The spice tastes like it's 80% coriander, and ...",Negative,Amazon
34188,I have been using this product -- Amore Sun-dr...,Positive,Amazon
125123,I have to agree with the majority here. The n...,Negative,Amazon
79718,An overwhelming smell and taste of preservativ...,Negative,Amazon


In [16]:
import re

# Define a function to clean the review text
def clean_text(text):
    text = text.lower()  # Convert all text to lowercase to standardize
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove numbers and punctuation, keep letters and spaces only
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space and strip leading/trailing spaces
    return text

# Apply the text cleaning function to the review_text column
# First, convert to string type to avoid issues with missing data
combined['clean_text'] = combined['review_text'].astype(str).apply(clean_text)



In [18]:
# Import necessary library for text preprocessing
import nltk

# Download stopwords (common words like "the", "and", etc.)
nltk.download('stopwords')

# Import the list of English stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Define a function to remove stopwords from a sentence
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

# Apply the stopword removal to the 'clean_text' column
combined['clean_text'] = combined['clean_text'].apply(remove_stopwords)

# Save the cleaned and preprocessed dataset to a CSV file
combined.to_csv('cleaned_combined_reviews.csv', index=False)

# Download the cleaned CSV file
from google.colab import files
files.download('cleaned_combined_reviews.csv')

# Show a preview
combined.sample(5)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,review_text,sentiment,source,clean_text
126841,Demi-Glace is hard to find around where I live...,Positive,Amazon,demiglace hard find around live usually costs ...
113938,I'm eating this right now and I don't think I ...,Negative,Amazon,im eating right dont think finish tastes nothi...
102447,Our dog loves it. Easy for her to pick up this...,Positive,Amazon,dog loves easy pick food allergic reaction pre...
34064,This is decent tasting. The cottage cheese le...,Neutral,Amazon,decent tasting cottage cheese leaves lot desir...
130101,Part of the order arrived smashed. Needed it f...,Negative,Trustpilot,part order arrived smashed needed following da...


In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Encode sentiment labels to numbers
label_encoder = LabelEncoder()
combined['label'] = label_encoder.fit_transform(combined['sentiment'])

# Show encoding mapping
label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_map)

# TF-IDF vectorizer to convert text to numerical features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(combined['clean_text'])
y = combined['label']

# Split data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

# Display the evaluation results
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Label Mapping: {'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}

Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.63      0.68      3980
     Neutral       0.49      0.17      0.25      2073
    Positive       0.89      0.97      0.92     20704

    accuracy                           0.86     26757
   macro avg       0.71      0.59      0.62     26757
weighted avg       0.83      0.86      0.84     26757



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Apply class weights to handle imbalance (Neutral class gets more weight)
model_weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
model_weighted.fit(X_train, y_train)

# Predict and evaluate
y_pred_weighted = model_weighted.predict(X_test)

# Show updated performance
print("\nWeighted Logistic Regression:")
print(classification_report(y_test, y_pred_weighted, target_names=label_encoder.classes_))



Weighted Logistic Regression:
              precision    recall  f1-score   support

    Negative       0.61      0.72      0.66      3980
     Neutral       0.25      0.59      0.35      2073
    Positive       0.96      0.80      0.87     20704

    accuracy                           0.77     26757
   macro avg       0.61      0.70      0.63     26757
weighted avg       0.85      0.77      0.80     26757



In [21]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict
y_pred_nb = nb_model.predict(X_test)

# Evaluate
print("\nNaive Bayes Model:")
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))



Naive Bayes Model:
              precision    recall  f1-score   support

    Negative       0.85      0.26      0.40      3980
     Neutral       0.43      0.01      0.01      2073
    Positive       0.81      1.00      0.89     20704

    accuracy                           0.81     26757
   macro avg       0.70      0.42      0.43     26757
weighted avg       0.78      0.81      0.75     26757

