In [None]:
!pip install nltk==3.8.1 scikit-learn==1.4.2

Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikit-learn-1.4.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re
# Fetch the NLTK resources this notebook relies on (no progress log)
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Read the tab-separated review dataset
data = pd.read_csv('amazon_alexa.tsv', sep='\t')

# Helpers shared by preprocess_text: a lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, remove every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop tokens found in
    ``stop_words`` and lemmatize the remaining ones with ``lemmatizer``.

    Args:
        text: Raw review text. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as an empty
            review.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` for a missing
        review or one whose tokens are all filtered out.
    """
    # A missing review must not turn into the literal token "nan"/"none",
    # which str() would otherwise produce and the vectorizer would count.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Add a cleaned copy of each review next to the raw text
data['processed_reviews'] = [preprocess_text(raw_review) for raw_review in data['verified_reviews']]

# Model inputs (cleaned text + product variation) and the feedback target
X = data.loc[:, ['processed_reviews', 'variation']]
y = data['feedback']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Feature extraction: token counts for the cleaned text, one-hot codes for the variation.
# The text column is named with a bare string (1-D input for the vectorizer),
# the categorical column with a list (2-D input for the encoder).
feature_transformers = [
    ('count', CountVectorizer(max_features=2000), 'processed_reviews'),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['variation']),
]
feature_extraction = ColumnTransformer(transformers=feature_transformers, remainder='passthrough')

# Create pipelines for different models with SMOTE
def build_pipeline(classifier):
    """Wrap ``classifier`` in the common feature-extraction + SMOTE pipeline.

    Every pipeline receives its own unfitted copy of ``feature_extraction``.
    Without a ``memory`` cache a pipeline fits its step objects in place, so
    passing the same transformer object to every pipeline would leave all of
    them sharing whatever state the most recent ``fit`` produced.

    Args:
        classifier: Unfitted estimator used as the final ``'clf'`` step.

    Returns:
        ImbPipeline: Steps ``'features'`` -> ``'smote'`` -> ``'clf'``.
    """
    # Function-scope import so this cell does not depend on edits to the import cell.
    from sklearn.base import clone

    return ImbPipeline([
        ('features', clone(feature_extraction)),
        ('smote', SMOTE(random_state=42)),
        ('clf', classifier),
    ])


# One entry per candidate model; the keys are the names shown in the reports.
pipelines = {
    'Naive Bayes': build_pipeline(MultinomialNB()),
    'Logistic Regression': build_pipeline(LogisticRegression(random_state=42)),
    'SVM': build_pipeline(SVC(kernel='linear', random_state=42)),
    'Random Forest': build_pipeline(
        RandomForestClassifier(max_features=8, criterion='entropy', n_estimators=200, random_state=42)
    ),
    'XGBoost': build_pipeline(XGBClassifier(max_depth=3, n_estimators=100, random_state=42)),
}

# Fit each candidate on the training split and report its hold-out performance
results = {}
for model_name, model_pipeline in pipelines.items():
    # fit() returns the fitted pipeline, so the prediction can be chained
    y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)
    results[model_name] = accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    # Report followed by one blank separator line
    print(classification_report(y_test, y_pred), end='\n\n')

# Select the best model
# The winner's *name* gets its own variable: `best_model` is used further down
# for the fitted pipeline object loaded back from disk, and one name should not
# stand for both a string and an estimator.
# NOTE(review): the choice is made on hold-out accuracy only; given the class
# imbalance visible in the reports above, confirm accuracy is the intended criterion.
best_model_name = max(results, key=results.get)
print(f"Best model: {best_model_name} with accuracy: {results[best_model_name]:.4f}")


# Save the best model
import joblib
joblib.dump(pipelines[best_model_name], 'best_sentiment_model_with_variations.joblib')

# Inference function
def predict_sentiment(text, variation, rating, model):
    """Classify a single review as 'Positive' or 'Negative'.

    Args:
        text: Raw review text; cleaned with ``preprocess_text`` before prediction.
        variation: Product variation string for the review.
        rating: Accepted for backward compatibility but not used. The model is
            trained on the ``processed_reviews`` and ``variation`` columns only,
            so the rating is deliberately kept out of the frame given to ``model``.
        model: Fitted pipeline whose ``predict`` accepts a DataFrame with
            ``processed_reviews`` and ``variation`` columns.

    Returns:
        str: 'Positive' when the predicted label equals 1, otherwise 'Negative'.
    """
    # Build exactly the columns the pipeline was fitted on; an extra column is
    # at best ignored by the column transformer and at worst rejected.
    features = pd.DataFrame({
        'processed_reviews': [preprocess_text(text)],
        'variation': [variation],
    })
    sentiment = model.predict(features)[0]
    return 'Positive' if sentiment == 1 else 'Negative'

# Load the best model
print("\nLoading the best model...")
# NOTE: joblib.load unpickles the file; only load artifacts written by this notebook.
best_model = joblib.load('best_sentiment_model_with_variations.joblib')
# Evaluation on new data
new_texts = [
    "Exceptional items",
    "Incredible merchandise",
    "Top-notch offerings",
    "Subpar item",
    "Poorly made item",
    "Low-quality goods."
]
variations = ["Black  Dot", "White  Dot", "Black  Dot", "White  Dot", "Black  Dot", "White  Dot"]
ratings = [5, 5, 5, 2, 1, 1]
true_labels = [1, 1, 1, 0, 0, 0]

# The hand-written samples get their own name so the `data` DataFrame holding
# the training reviews is not overwritten by a dict (which would break any
# re-run of the earlier cells that index `data` by column).
new_data = {
    'text': new_texts,
    'variation': variations,
    'rating': ratings,
    'true_label': true_labels
}
df = pd.DataFrame(new_data)

print("\nMaking predictions on new data...")
# Predict one sample at a time, then translate the verdict strings to 0/1 labels
df['predicted_sentiment'] = [
    predict_sentiment(sample_text, sample_variation, sample_rating, best_model)
    for sample_text, sample_variation, sample_rating in zip(df['text'], df['variation'], df['rating'])
]
label_of = {'Positive': 1, 'Negative': 0}
df['predicted_label'] = df['predicted_sentiment'].map(label_of)

# Show the full table and the headline metrics
expected, predicted = df['true_label'], df['predicted_label']
print("\nResults:")
print(df)
print("\nAccuracy:", accuracy_score(expected, predicted))
print("\nClassification Report:")
print(classification_report(expected, predicted, target_names=['Negative', 'Positive']))

# List the rows where prediction and ground truth disagree
misclassified = df.loc[expected != predicted]
if misclassified.empty:
    print("\nAll samples were correctly classified!")
else:
    print("\nMisclassified samples:")
    print(misclassified[['text', 'variation', 'rating', 'true_label', 'predicted_sentiment']])

Naive Bayes Accuracy: 0.8810
              precision    recall  f1-score   support

           0       0.39      0.50      0.44        58
           1       0.95      0.92      0.93       572

    accuracy                           0.88       630
   macro avg       0.67      0.71      0.68       630
weighted avg       0.90      0.88      0.89       630


Logistic Regression Accuracy: 0.8905
              precision    recall  f1-score   support

           0       0.44      0.74      0.55        58
           1       0.97      0.91      0.94       572

    accuracy                           0.89       630
   macro avg       0.71      0.82      0.75       630
weighted avg       0.92      0.89      0.90       630


SVM Accuracy: 0.8889
              precision    recall  f1-score   support

           0       0.43      0.69      0.53        58
           1       0.97      0.91      0.94       572

    accuracy                           0.89       630
   macro avg       0.70      0.80      

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re
# Download required NLTK data
# Three named NLTK resources are fetched; presumably they back word_tokenize,
# the stopwords corpus and WordNetLemmatizer imported above (mapping inferred
# from the resource names). quiet=True suppresses the download log.
# The value of the final call is what this cell displays as its output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [None]:
# Read the tab-separated review dataset
data = pd.read_csv('amazon_alexa.tsv', sep='\t')

# Helpers shared by preprocess_text: a lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, remove every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop tokens found in
    ``stop_words`` and lemmatize the remaining ones with ``lemmatizer``.

    Args:
        text: Raw review text. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as an empty
            review.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` for a missing
        review or one whose tokens are all filtered out.
    """
    # A missing review must not turn into the literal token "nan"/"none",
    # which str() would otherwise produce and the vectorizer would count.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Add a cleaned copy of each review next to the raw text
data['processed_reviews'] = [preprocess_text(raw_review) for raw_review in data['verified_reviews']]

In [None]:
# Model inputs (cleaned text + product variation) and the feedback target
X = data.loc[:, ['processed_reviews', 'variation']]
y = data['feedback']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# Feature extraction: token counts for the cleaned text, one-hot codes for the variation.
# The text column is named with a bare string (1-D input for the vectorizer),
# the categorical column with a list (2-D input for the encoder).
feature_transformers = [
    ('count', CountVectorizer(max_features=2000), 'processed_reviews'),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['variation']),
]
feature_extraction = ColumnTransformer(transformers=feature_transformers, remainder='passthrough')


In [None]:
# Create pipelines for different models with SMOTE
def build_pipeline(classifier):
    """Wrap ``classifier`` in the common feature-extraction + SMOTE pipeline.

    Every pipeline receives its own unfitted copy of ``feature_extraction``.
    Without a ``memory`` cache a pipeline fits its step objects in place, so
    passing the same transformer object to every pipeline would leave all of
    them sharing whatever state the most recent ``fit`` produced.

    Args:
        classifier: Unfitted estimator used as the final ``'clf'`` step.

    Returns:
        ImbPipeline: Steps ``'features'`` -> ``'smote'`` -> ``'clf'``.
    """
    # Function-scope import so this cell does not depend on edits to the import cell.
    from sklearn.base import clone

    return ImbPipeline([
        ('features', clone(feature_extraction)),
        ('smote', SMOTE(random_state=42)),
        ('clf', classifier),
    ])


# One entry per candidate model; the keys are the names shown in the reports.
pipelines = {
    'Naive Bayes': build_pipeline(MultinomialNB()),
    'Logistic Regression': build_pipeline(LogisticRegression(random_state=42)),
    'SVM': build_pipeline(SVC(kernel='linear', random_state=42)),
    'Random Forest': build_pipeline(
        RandomForestClassifier(max_features=8, criterion='entropy', n_estimators=200, random_state=42)
    ),
    'XGBoost': build_pipeline(XGBClassifier(max_depth=3, n_estimators=100, random_state=42)),
}

In [None]:
# Fit each candidate on the training split and report its hold-out performance
results = {}
for model_name, model_pipeline in pipelines.items():
    # fit() returns the fitted pipeline, so the prediction can be chained
    y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)
    results[model_name] = accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    # Report followed by one blank separator line
    print(classification_report(y_test, y_pred), end='\n\n')

Naive Bayes Accuracy: 0.8810
              precision    recall  f1-score   support

           0       0.39      0.50      0.44        58
           1       0.95      0.92      0.93       572

    accuracy                           0.88       630
   macro avg       0.67      0.71      0.68       630
weighted avg       0.90      0.88      0.89       630


Logistic Regression Accuracy: 0.8905
              precision    recall  f1-score   support

           0       0.44      0.74      0.55        58
           1       0.97      0.91      0.94       572

    accuracy                           0.89       630
   macro avg       0.71      0.82      0.75       630
weighted avg       0.92      0.89      0.90       630


SVM Accuracy: 0.8889
              precision    recall  f1-score   support

           0       0.43      0.69      0.53        58
           1       0.97      0.91      0.94       572

    accuracy                           0.89       630
   macro avg       0.70      0.80      

In [None]:
# Select the best model
# The winner's *name* gets its own variable: `best_model` is used further down
# for the fitted pipeline object loaded back from disk, and one name should not
# stand for both a string and an estimator.
# NOTE(review): the choice is made on hold-out accuracy only; given the class
# imbalance visible in the reports above, confirm accuracy is the intended criterion.
best_model_name = max(results, key=results.get)
print(f"Best model: {best_model_name} with accuracy: {results[best_model_name]:.4f}")


# Save the best model
import joblib
joblib.dump(pipelines[best_model_name], 'best_sentiment_model_with_variations.joblib')


Best model: Random Forest with accuracy: 0.9302


['best_sentiment_model_with_variations.joblib']

In [None]:
# Inference function
def predict_sentiment(text, variation, rating, model):
    """Classify a single review as 'Positive' or 'Negative'.

    Args:
        text: Raw review text; cleaned with ``preprocess_text`` before prediction.
        variation: Product variation string for the review.
        rating: Accepted for backward compatibility but not used. The model is
            trained on the ``processed_reviews`` and ``variation`` columns only,
            so the rating is deliberately kept out of the frame given to ``model``.
        model: Fitted pipeline whose ``predict`` accepts a DataFrame with
            ``processed_reviews`` and ``variation`` columns.

    Returns:
        str: 'Positive' when the predicted label equals 1, otherwise 'Negative'.
    """
    # Build exactly the columns the pipeline was fitted on; an extra column is
    # at best ignored by the column transformer and at worst rejected.
    features = pd.DataFrame({
        'processed_reviews': [preprocess_text(text)],
        'variation': [variation],
    })
    sentiment = model.predict(features)[0]
    return 'Positive' if sentiment == 1 else 'Negative'


In [None]:
# Load the best model
print("\nLoading the best model...")
# NOTE: joblib.load unpickles the file; only load artifacts written by this notebook.
best_model = joblib.load('best_sentiment_model_with_variations.joblib')
# Evaluation on new data
new_texts = [
    "Exceptional items",
    "Incredible merchandise",
    "Top-notch offerings",
    "Subpar item",
    "Poorly made item",
    "Low-quality goods."
]
variations = ["Black  Dot", "White  Dot", "Black  Dot", "White  Dot", "Black  Dot", "White  Dot"]
ratings = [5, 5, 5, 2, 1, 1]
true_labels = [1, 1, 1, 0, 0, 0]

# The hand-written samples get their own name so the `data` DataFrame holding
# the training reviews is not overwritten by a dict (which would break any
# re-run of the earlier cells that index `data` by column).
new_data = {
    'text': new_texts,
    'variation': variations,
    'rating': ratings,
    'true_label': true_labels
}
df = pd.DataFrame(new_data)

print("\nMaking predictions on new data...")


Loading the best model...

Making predictions on new data...


In [None]:
# Predict one sample at a time, then translate the verdict strings to 0/1 labels
df['predicted_sentiment'] = [
    predict_sentiment(sample_text, sample_variation, sample_rating, best_model)
    for sample_text, sample_variation, sample_rating in zip(df['text'], df['variation'], df['rating'])
]
label_of = {'Positive': 1, 'Negative': 0}
df['predicted_label'] = df['predicted_sentiment'].map(label_of)

# Show the full table and the headline metrics
expected, predicted = df['true_label'], df['predicted_label']
print("\nResults:")
print(df)
print("\nAccuracy:", accuracy_score(expected, predicted))
print("\nClassification Report:")
print(classification_report(expected, predicted, target_names=['Negative', 'Positive']))

# List the rows where prediction and ground truth disagree
misclassified = df.loc[expected != predicted]
if misclassified.empty:
    print("\nAll samples were correctly classified!")
else:
    print("\nMisclassified samples:")
    print(misclassified[['text', 'variation', 'rating', 'true_label', 'predicted_sentiment']])


Results:
                     text   variation  rating  true_label predicted_sentiment  \
0       Exceptional items  Black  Dot       5           1            Positive   
1  Incredible merchandise  White  Dot       5           1            Negative   
2     Top-notch offerings  Black  Dot       5           1            Positive   
3             Subpar item  White  Dot       2           0            Negative   
4        Poorly made item  Black  Dot       1           0            Positive   
5      Low-quality goods.  White  Dot       1           0            Negative   

   predicted_label  
0                1  
1                0  
2                1  
3                0  
4                1  
5                0  

Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.67      0.67         3
    Positive       0.67      0.67      0.67         3

    accuracy                           0.67         6
   