In [68]:
#Import Libraries

import pandas as pd
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE

## Load and Inspect Dataset
df = pd.read_csv("sentiment_cleaned_customer_reviews.csv")

# Display basic information
print(df.info())
print(df.head())

#Clean the Review Text
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = word_tokenize(text)  # Tokenize words
    text = ' '.join([word for word in tokens if word not in stop_words])  # Remove stopwords
    return text

# Apply function to cleaned_review column
df['cleaned_review'] = df['cleaned_review'].astype(str).apply(clean_text)

# Check results
print(df[['cleaned_review', 'sentiment_category']].head(10))

#Encode Sentiment Labels

label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment_category'])

# Display mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Encoding Mapping:", label_mapping)

##TF-IDF Vectorization for Logistic Regression
vectorizer = TfidfVectorizer(max_features=5000)  # Keep top 5000 words
X_tfidf = vectorizer.fit_transform(df['cleaned_review']).toarray()
y = df['sentiment_encoded']


# Define keyword-based classification rules
positive_keywords = {'excellent', 'great', 'amazing', 'awesome', 'good', 'love', 'best', 'fantastic', 'wonderful', 'clean', 'warm', 'friendly', 
                    'delight', 'smile', 'authentic', 'awesome', 'fabulous'}
negative_keywords = {'bad', 'horrible', 'awful', 'worst', 'terrible', 'poor', 'hate', 'disappointed', 'wrong', 'watery', 'over priced', 'unhealthy'}
neutral_keywords = {'okay', 'average', 'fine', 'decent', 'satisfactory', 'neutral', 'understandable', 'mild', 'subtle', 'acceptable', 'lukewarm'}

def keyword_classification(text):
    words = set(text.split())  # Convert text into a set of words
    if words & positive_keywords:
        return 'positive'
    elif words & negative_keywords:
        return 'negative'
    elif words & neutral_keywords:
        return 'neutral'
    else:
        return 'unknown'  # If no matching keywords are found

# Apply the keyword classification function
df['keyword_sentiment'] = df['cleaned_review'].apply(keyword_classification)

# Encode the keyword-based sentiment labels
df['keyword_sentiment_encoded'] = label_encoder.fit_transform(df['keyword_sentiment'])

# Compare keyword-based sentiment classification with ML classification
print(df[['cleaned_review', 'sentiment_category', 'keyword_sentiment']].head(10))

# Ensure `keyword_sentiment_encoded` is a NumPy array
keyword_sentiment_encoded = df['keyword_sentiment_encoded'].values.reshape(-1, 1)

# Combine TF-IDF features with keyword sentiment encoding
X_combined = np.hstack((X_tfidf, keyword_sentiment_encoded))

# Train-test split (make sure X_train and X_test have the same number of features)
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE (only to training data)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train)


# Initialize and Train Logistic Regression Model
log_model = LogisticRegression()
log_model.fit(X_train_resampled, y_train_resampled)

# Predictions (Ensure X_test_combined is used, not X_test)
y_pred_log_combined = log_model.predict(X_test_combined)

# Evaluation
print("Logistic Regression Performance with Keyword Feature:")
print(classification_report(y_test, y_pred_log_combined))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_combined))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                705 non-null    object 
 1   location            705 non-null    object 
 2   date                705 non-null    object 
 3   rating              705 non-null    float64
 4   review              705 non-null    object 
 5   image_links         705 non-null    object 
 6   cleaned_review      705 non-null    object 
 7   word_count          705 non-null    int64  
 8   char_count          705 non-null    int64  
 9   sentiment_category  705 non-null    object 
 10  normalized_rating   705 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 60.7+ KB
None
       name           location        date  rating  \
0     Helen  Wichita Falls, TX  2023-09-13     5.0   
1  Courtney         Apopka, FL  2023-07-16     5.0   
2  Daynelle  Cranberry Twp, 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\idehe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\idehe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                      cleaned_review sentiment_category
0  amber ladonna starbucks southwest parkway alwa...           Positive
1  starbucks fire station altamonte springs fl ma...           Positive
2  wanted go way recognize starbucks employee bil...           Positive
3  friend starbucks card didnt work thankful work...           Positive
4  im kick drinking cups warm water work instacar...           Positive
5  correct order times never got right manager ca...           Negative
6  tried starbucks several different times differ...           Negative
7  starbucks near launched new fall foods beverag...           Negative
8  ordered online reisterstown rd st thomas sc ga...           Negative
9  staff smythe st superstore location fredericto...           Negative
Label Encoding Mapping: {'Negative': 0, 'Neutral': 1, 'Positive': 2}
                                      cleaned_review sentiment_category  \
0  amber ladonna starbucks southwest parkway alwa...           P

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
import joblib

# Save the trained logistic regression model
joblib.dump(log_model, 'sentiment_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Save the Label Encoder (for keyword_sentiment_encoded)
joblib.dump(label_encoder, 'label_encoder.pkl')

print("Model, vectorizer, and label encoder saved successfully!")


Model, vectorizer, and label encoder saved successfully!


In [72]:

# Load the saved model, vectorizer, and label encoder
loaded_model = joblib.load('sentiment_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')

# Define the keyword-based classification function
positive_keywords = {'excellent', 'great', 'amazing', 'awesome', 'good', 'love', 'best', 'fantastic', 'wonderful', 'clean', 'warm', 'friendly', 
                    'delight', 'smile', 'authentic', 'awesome', 'fabulous'}
negative_keywords = {'bad', 'horrible', 'awful', 'worst', 'terrible', 'poor', 'hate', 'disappointed', 'wrong', 'watery', 'over priced', 'unhealthy'}
neutral_keywords = {'okay', 'average', 'fine', 'decent', 'satisfactory', 'neutral', 'understandable', 'mild', 'subtle', 'acceptable', 'lukewarm'}

def keyword_classification(text):
    words = set(text.lower().split())  # Convert text into a set of words
    if words & positive_keywords:
        return 'positive'
    elif words & negative_keywords:
        return 'negative'
    elif words & neutral_keywords:
        return 'neutral'
    else:
        return 'unknown'  # If no matching keywords are found

# Example prediction
sample_review = ["The product quality is excellent and I love it!"]

# Vectorize the input text
sample_vectorized = loaded_vectorizer.transform(sample_review).toarray()

# Apply keyword classification
sample_keyword_sentiment = keyword_classification(sample_review[0])

# Encode the keyword-based classification
sample_keyword_encoded = loaded_label_encoder.transform([sample_keyword_sentiment])[0]

# Combine both features
sample_combined = np.hstack((sample_vectorized, np.array([[sample_keyword_encoded]])))

# Make a prediction
prediction = loaded_model.predict(sample_combined)

print("Predicted Sentiment:", prediction[0])


Predicted Sentiment: 0


In [74]:
!python sentiment_app.py

^C


In [78]:
import requests

url = "http://127.0.0.1:5000/predict"
data = {"review": "The coffee is amazing, and the customer service is excellent. I love it."}

response = requests.post(url, json=data)
print(response.json())  # Should return {"sentiment": "2"}


{'sentiment': 2}
