In [1]:
# Import necessary libraries
import os
import warnings

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# NOTE(review): with no category or module argument this silences every
# warning for the rest of the session. That presumably includes scikit-learn's
# warnings about ill-defined precision for classes that are never predicted;
# confirm that a narrower filter is not preferable.
warnings.filterwarnings('ignore')


In [2]:
# Location of the annotated reviews CSV. It can be overridden with the
# UX_SPECTRUM_CSV environment variable so the notebook also runs on machines
# other than the author's; when the variable is unset the original path is used.
DATA_PATH = os.environ.get(
    'UX_SPECTRUM_CSV',
    '/Users/drey/Downloads/User Experience Spectrum Task - Annotated_data (1).csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the load.
# NOTE(review): the rendered preview includes reviewerID and reviewerName
# values; confirm these may be shared before publishing the notebook.
df.head(n=5)

Unnamed: 0,Randomizer,reviewerID,asin,reviewerName,helpful,overall,reviewTime,summary,reviewText,User Experience Spectrum
0,1,A1NAA1R38JSNHV,B0002F7IIK,Josh Leger,"[0, 0]",5,1/4/2013,"works wonders, very durable",Will hold any guitar that I've seen. It holds ...,Thrilled
1,2,ADH0O8UVJOT10,B007Q27BH0,StormJH1,"[3, 3]",5,12/31/2012,Another instant classic from Joyo,"The US Dream is supposed ""clone"" of the Suhr R...",Insightful Feedback
2,4,A2E3Q52SJS00K2,B004N0MKN8,Wendell Burnett,"[0, 0]",5,1/3/2014,Great Idea,Those of us who have leaned their guitars agai...,Thrilled
3,6,A4BTCECGQAIUI,B0002M6CVC,Amazon Customer,"[0, 2]",4,5/30/2013,Good,"Good Strings, I buy this Strings from a few ye...",Thrilled
4,7,A7IBOCJ0K4V8C,B003VWJ2K8,J. Walker,"[0, 0]",5,5/6/2013,It works.,This tuner can be adjusted to be very easy to ...,Insightful Feedback


### EDA (Exploratory Data Analysis)

In [4]:
# Drop the 'Randomizer' column. Rebinding df (rather than inplace=True) and
# passing errors='ignore' keep the cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
df = df.drop(columns=['Randomizer'], errors='ignore')

In [5]:
# Inspect the last ten rows of the frame.
df.tail(n=10)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,reviewTime,summary,reviewText,User Experience Spectrum
150,A22Z554ZQ8NFPC,B0002D0CKI,"AF ""Whigs""","[12, 12]",3,9/8/2011,Good & Bad,"First, I love these picks and used them exclus...",Mixed Emotions
151,A1B9Q3SNKI6T5V,B0002D0LKY,Dustin Kempton,"[0, 0]",1,5/2/2014,I really do hate it.,"It just randomly pops off my bass, it's so sli...",Opportunity Areas
152,A3NAA6BH9LWIH4,B001UJEKZ6,Paul Kacprzak,"[0, 2]",1,5/15/2013,I hate it.,The coversion cable didnt work. t is axlr to U...,Opportunity Areas
153,A1EQYR35KLTECN,B0002E1H9W,animulvr,"[0, 1]",3,1/6/2013,Hate the pushdown bottle heads!,I like Dunlop products but... The push down ap...,Opportunity Areas
154,A1QRF5KISDOKPA,B000SAC5PA,santos,"[6, 8]",1,12/23/2011,hate it,ok when i saw the bag i was impressed nice loo...,Opportunity Areas
155,A1VW19Y79DC0GF,B00186L9X2,"Andrew M. Ward ""My Stolen Life""","[13, 16]",3,8/31/2011,It's a Love / Hate Relationship...,It cannot be denied that the BOSS DD-7 is a ve...,Mixed Emotions
156,A15BHBF0L0HV1F,B0002D0E8S,"Quaestor ""Raoul Duke""","[0, 0]",3,11/30/2013,Nothing Special,Nothing special. Just like every other strap o...,Mixed Emotions
157,A3DDZ2SENG07MS,B003LTJ404,dashreeve,"[0, 0]",4,1/6/2012,Good stand - not the best for gigging / travel,"The stand seems sturdy, the base is like a tri...",Insightful Feedback
158,A1EFXXRDV40C4E,B003SZDFM4,Charles Casterline,"[2, 2]",3,12/16/2012,Nothing to brag about,I use to have a guitar that sounded horrible w...,Opportunity Areas
159,A1FCX548TD6DLP,B003QTM9O2,Cooper the Beagle,"[0, 0]",1,1/9/2014,"Poorly Made, Flimsy. Buy Another Product","At the time I bought, was $16. Mine arrived br...",Opportunity Areas


In [6]:
# Percentage of missing values per column, highest first. sort_values returns a
# new Series, so its result has to be assigned for the ordering to take effect.
missing_values = (df.isna().sum() * 100 / df.shape[0]).sort_values(ascending=False)
missing_values

reviewerID                  0.0
asin                        0.0
reviewerName                0.0
helpful                     0.0
overall                     0.0
reviewTime                  0.0
summary                     0.0
reviewText                  0.0
User Experience Spectrum    0.0
dtype: float64

# Feature Engineering

In [7]:
# Integer id for each 'User Experience Spectrum' category; the id is the
# category's position in the list below.
spectrum_labels = {
    category: label_id
    for label_id, category in enumerate(
        ['Thrilled', 'Insightful Feedback', 'Mixed Emotions', 'Opportunity Areas']
    )
}

# Encode each row's 'User Experience Spectrum' category as its integer id and
# store the result in a new 'user_experience_labels' column. Categories that
# are not keys of spectrum_labels come out of Series.map as NaN.
spectrum_column = df['User Experience Spectrum']
df['user_experience_labels'] = spectrum_column.map(spectrum_labels)


In [8]:
import nltk
# Fetch the NLTK resources used by this notebook; the 'stopwords' corpus backs
# the stopwords.words('english') calls in the following cells.
# NOTE(review): no visible cell calls word_tokenize (tokenisation is done with
# RegexpTokenizer), so the 'punkt' download may be unnecessary. Please confirm.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/drey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/drey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# English stopword list from NLTK, held as a set for fast membership tests.
en_stopwords = {word for word in stopwords.words('english')}

In [10]:
def _remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not in ``en_stopwords``."""
    return [word for word in tokens if word.lower() not in en_stopwords]


def _stem_tokens(tokens):
    """Return the Porter stem of every token, preserving order."""
    return [ps.stem(word) for word in tokens]


# Shared preprocessing objects, reused by the 'summary' preprocessing cell.
tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters, drops punctuation
ps = PorterStemmer()

# The stopword filter has always used en_stopwords. stop_words used to rebuild
# the same set from the corpus without ever being read, so it is now only an
# alias, kept in case other code refers to the name.
stop_words = en_stopwords

# Tokenization
df['tokenized_text'] = df['reviewText'].apply(tokenizer.tokenize)

# Stopword removal
df['filtered_tokens'] = df['tokenized_text'].apply(_remove_stopwords)

# Stemming
df['stemmed_tokens'] = df['filtered_tokens'].apply(_stem_tokens)

# Combine preprocessed tokens back into one space-separated string per review
df['processed_text'] = df['stemmed_tokens'].apply(' '.join)


In [11]:
def _summary_without_stopwords(tokens):
    """Keep the tokens whose lowercase form is not an English stopword."""
    return [word for word in tokens if word.lower() not in en_stopwords]


def _summary_stems(tokens):
    """Porter-stem each token, preserving order."""
    return [ps.stem(word) for word in tokens]


def _join_with_spaces(tokens):
    """Join the tokens into one space-separated string."""
    return ' '.join(tokens)


# Same pipeline as for 'reviewText', applied to the 'summary' column:
# tokenize -> drop stopwords -> stem -> re-join.
df['tokenized_summary'] = df['summary'].apply(tokenizer.tokenize)
df['filtered_summary_tokens'] = df['tokenized_summary'].apply(_summary_without_stopwords)
df['stemmed_summary_tokens'] = df['filtered_summary_tokens'].apply(_summary_stems)
df['processed_summary'] = df['stemmed_summary_tokens'].apply(_join_with_spaces)

# Model input: processed review text followed by the processed summary,
# separated by a single space.
df['combined_text'] = df['processed_text'] + ' ' + df['processed_summary']

# Model Training and Evaluation

In [12]:
# Split data into training and testing sets.
# Rows whose label is NaN (a 'User Experience Spectrum' value that is not a key
# of spectrum_labels) are excluded *before* splitting so they can never reach
# y_train / y_test; dropping them from df after the split would leave the
# already-created split untouched.
labelled_df = df.dropna(subset=['user_experience_labels'])
X_train, X_test, y_train, y_test = train_test_split(
    labelled_df['combined_text'],
    labelled_df['user_experience_labels'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Remove rows of df that have no numeric label.
# NOTE: this cell runs after the train/test split in the previous cell, so it
# only changes df; X_train, X_test, y_train and y_test are not affected by it.
df = df.dropna(subset=['user_experience_labels'])

In [14]:
# Bag of Words (BoW) features: the vocabulary is learned on the training texts
# only and then reused to encode the test texts.
# NOTE(review): X_train_bow / X_test_bow are not read by any later visible cell
# (the models are fit on the TF-IDF features).
bow_vectorizer = CountVectorizer()
X_train_bow, X_test_bow = (
    bow_vectorizer.fit_transform(X_train),
    bow_vectorizer.transform(X_test),
)

In [15]:
# TF-IDF features: fit on the training texts only, then encode the test texts
# with the same fitted vectorizer. These matrices feed every model below.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)


In [16]:
# Baseline comparison: fit each classifier on the TF-IDF training features and
# report accuracy plus a per-class report on the held-out test features.
# The tree ensembles are seeded (same seed as the train/test split) so that
# their scores are reproducible from one run to the next.
models = {
    'SVM': SVC(kernel='linear', C=1),
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    # zero_division=0 states explicitly that a class which is never predicted
    # gets precision 0, instead of relying on warnings being silenced globally.
    print(classification_report(y_test, y_pred, zero_division=0))


SVM Accuracy: 0.6875
              precision    recall  f1-score   support

           0       0.69      1.00      0.81        22
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2

    accuracy                           0.69        32
   macro avg       0.17      0.25      0.20        32
weighted avg       0.47      0.69      0.56        32


Logistic Regression Accuracy: 0.6875
              precision    recall  f1-score   support

           0       0.69      1.00      0.81        22
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2

    accuracy                           0.69        32
   macro avg       0.17      0.25      0.20        32
weighted avg       0.47      0.69      0.56        32


Naive Bayes Accuracy: 0.6875
              precision    recall  f1-sc

In [17]:
# Manually chosen per-class weights, keyed by the integer ids defined in
# spectrum_labels. The majority class is down-weighted and the rarer classes
# are up-weighted so that errors on them cost more during training.
# These are hand-set values, not scikit-learn's class_weight='balanced'
# option, which would instead derive the weights from inverse class frequencies.
class_weights = {
    0: 0.5,  # Thrilled
    1: 2,    # Insightful Feedback
    2: 4,    # Mixed Emotions
    3: 6,    # Opportunity Areas
}

In [18]:
# Initialize SVM model with the hand-set class weights.
# probability=True makes scikit-learn fit an additional cross-validated
# probability calibration that shuffles the data randomly; random_state pins
# that shuffling so predict_proba is reproducible from run to run.
svm_model = SVC(
    kernel='linear',
    C=4,
    class_weight=class_weights,
    probability=True,
    random_state=42,
)

# Train the SVM model
svm_model.fit(X_train_tfidf, y_train)

In [19]:
# Score the class-weighted SVM on the held-out TF-IDF features: build the
# per-class report and the overall accuracy first, then print both.
y_pred = svm_model.predict(X_test_tfidf)
classification_report_str = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', classification_report_str)

Accuracy: 0.6875
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.95      0.81        22
           1       0.50      0.25      0.33         4
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2

    accuracy                           0.69        32
   macro avg       0.30      0.30      0.29        32
weighted avg       0.54      0.69      0.60        32



In [20]:
!pip install joblib



In [21]:
import joblib

# joblib.dump(en_stopwords,'stopwords.pkl')

# Persist the fitted TF-IDF vectorizer together with the classifier: the SVM
# was trained on tfidf_vectorizer's feature space, so model.pkl alone cannot
# score new text after a kernel restart.
joblib.dump(tfidf_vectorizer, 'vectorizer.pkl', compress=('zlib', 3))

# Kept as the cell's last expression so its displayed output is still ['model.pkl'].
joblib.dump(svm_model, 'model.pkl', compress=('zlib', 3))

['model.pkl']

In [22]:
# NOTE(review): every non-blank line of this cell is commented out, so it
# executes nothing. It is an earlier copy of the pipeline from the cells above
# (label mapping through SVM evaluation) that splits on 'processed_text' alone
# instead of 'combined_text'. Consider deleting it once it is no longer needed.
# # Define numerical labels for User Experience Spectrum
# spectrum_labels = {'Thrilled': 0, 'Insightful Feedback': 1, 'Mixed Emotions': 2, 'Opportunity Areas': 3}

# # Apply labels to the 'user_experience_spectrum' column
# df['user_experience_labels'] = df['User Experience Spectrum'].map(spectrum_labels)

# en_stopwords=set(stopwords.words('english'))
# tokenizer = RegexpTokenizer(r'\w+')

# # Tokenization
# df['tokenized_text'] = df['reviewText'].apply(tokenizer.tokenize)

# # Stopword removal
# stop_words = set(stopwords.words('english'))
# df['filtered_tokens'] = df['tokenized_text'].apply(lambda tokens: [word for word in tokens if word.lower() not in en_stopwords])

# # Stemming
# ps = PorterStemmer()
# df['stemmed_tokens'] = df['filtered_tokens'].apply(lambda tokens: [ps.stem(word) for word in tokens])

# # Combine preprocessed tokens
# df['processed_text'] = df['stemmed_tokens'].apply(lambda tokens: ' '.join(tokens))

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['user_experience_labels'], test_size=0.2, random_state=42)

# #Exclude Nan values from training
# df.dropna(subset=['user_experience_labels'], inplace=True)


# # TF-IDF vectorization
# tfidf_vectorizer = TfidfVectorizer()
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)


# # Initialize and train different models
# models = {
#     'SVM': SVC(kernel='linear', C=1),
#     'Logistic Regression': LogisticRegression(),
#     'Naive Bayes': MultinomialNB(),
#     'Random Forest': RandomForestClassifier(),
#     'Gradient Boosting': GradientBoostingClassifier()
# }

# for name, model in models.items():
#     model.fit(X_train_tfidf, y_train)
#     y_pred = model.predict(X_test_tfidf)
    
#     accuracy = accuracy_score(y_test, y_pred)
#     print(f"\n{name} Accuracy: {accuracy:.4f}")
#     print(classification_report(y_test, y_pred))

# # Define class weights
# class_weights = {0: .5, 1: 2, 2: 4, 3: 6} 

# # Initialize SVM model with class weights
# svm_model = SVC(kernel='linear', C=4, class_weight=class_weights, probability=True)

# # Train the SVM model
# svm_model.fit(X_train_tfidf, y_train)

# # Evaluate model
# y_pred = svm_model.predict(X_test_tfidf)
# accuracy = accuracy_score(y_test, y_pred)
# classification_report_str = classification_report(y_test, y_pred)
# print(f'Accuracy: {accuracy}')
# print('Classification Report:\n', classification_report_str)

