In [None]:
# Mount Google Drive into the Colab runtime so the notebook can read and write files under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Install the vaderSentiment package with pip via a shell command.
# NOTE(review): no visible cell imports vaderSentiment -- confirm this install is still needed.
!pip install vaderSentiment



## Import necessary packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, f1_score, accuracy_score


## Reading in train and test files

In [None]:
# Read the training and test reviews into dataframes.
df = pd.read_csv('/content/drive/Shareddrives/CS74/Project3/amazon_train.csv')
test_df = pd.read_csv('/content/drive/Shareddrives/CS74/Project3/amazon_test.csv')

# Columns that are not used as model inputs and are dropped below.
excluded_col = ['reviewTime', 'reviewerID', 'summary', 'reviewerName', 'unixReviewTime', 'image', 'style', 'asin', 'vote', 'category', 'verified']
text_col = 'reviewText'


def _combine_review_text(frame):
    """Return one string per row: the review body followed by its summary.

    Missing values are replaced with '' *before* joining. `NaN + " " + text`
    evaluates to NaN, so filling only after the join would throw away the
    field that was actually present.
    """
    body = frame[text_col].fillna('')
    summary = frame['summary'].fillna('')
    # strip() removes the lone separator left behind when one side is empty.
    return (body + " " + summary).str.strip()


df[text_col] = _combine_review_text(df)
test_df[text_col] = _combine_review_text(test_df)

# Drop the excluded columns ('summary' has already been folded into the text column).
df = df.drop(columns=excluded_col)
test_df = test_df.drop(columns=excluded_col)

# Split the `overall` rating (the prediction target) off from the features.
y_train = df['overall']
df = df.drop('overall', axis=1)
print(df.columns)
print(df.head())
print(y_train)

Index(['reviewText', 'id'], dtype='object')
                                          reviewText   id
0  all of the reviews for this product are fake. ...  ab0
1                    wrong part. our fault. One Star  ab1
2          this wire set it really sucks!!! One Star  ab2
3  first use, it leaked instantly. even at 5 buck...  ab3
4                                didn't fit One Star  ab4
0        1
1        1
2        1
3        1
4        1
        ..
29184    5
29185    5
29186    5
29187    5
29188    5
Name: overall, Length: 29189, dtype: int64


## Text preprocessing

We use a TF-IDF vectorizer to turn the review text into numeric features. The text is split into individual words and two-word phrases (unigrams and bigrams), and each of the 1,500 terms kept by the vectorizer becomes a column in the dataframe holding that term's TF-IDF score.

In [None]:
# TF-IDF over unigrams and bigrams, capped at 1500 features.
vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))

# Learn the vocabulary and IDF weights from the training text only.
word_counts = vectorizer.fit_transform(df[text_col].tolist())
feature_names = vectorizer.get_feature_names_out()
word_counts_df = pd.DataFrame(word_counts.toarray(), columns=feature_names)
text_df = pd.concat([df, word_counts_df], axis=1)

# Use transform (not fit_transform) on the test text: refitting would build a
# different vocabulary and different IDF weights, so the test columns would no
# longer mean the same thing as the columns the models are trained on.
test_word_counts = vectorizer.transform(test_df[text_col].tolist())
test_word_counts_df = pd.DataFrame(test_word_counts.toarray(), columns=feature_names)
test_text_df = pd.concat([test_df, test_word_counts_df], axis=1)

## Cleaning up training and test data


In [None]:
# Keep only the TF-IDF feature columns; y_train was defined earlier.
X_train = text_df.drop(columns=[text_col, 'id'])
X_test = test_text_df.drop(columns=[text_col, 'id'])

# The training columns are the canonical feature set. Concatenating the test
# and train column lists would list every shared term twice and duplicate
# those features in both matrices.
all_col = X_train.columns.tolist()

# Align the test matrix to the training features: same columns, same order.
# Terms absent from the test frame are filled with 0; test-only terms are dropped.
X_test = X_test.reindex(columns=all_col, fill_value=0)

print(X_train.head(), X_test.head())

    10  100   11   12   15  16   20   30   50  70  ...  you need  you pay  \
0  0.0  0.0  0.0  0.0  0.0   0  0.0  0.0  0.0   0  ...       0.0      0.0   
1  0.0  0.0  0.0  0.0  0.0   0  0.0  0.0  0.0   0  ...       0.0      0.0   
2  0.0  0.0  0.0  0.0  0.0   0  0.0  0.0  0.0   0  ...       0.0      0.0   
3  0.0  0.0  0.0  0.0  0.0   0  0.0  0.0  0.0   0  ...       0.0      0.0   
4  0.0  0.0  0.0  0.0  0.0   0  0.0  0.0  0.0   0  ...       0.0      0.0   

   you re  you want  you will  young  your  your money  your phone  yourself  
0     0.0       0.0       0.0    0.0   0.0         0.0         0.0       0.0  
1     0.0       0.0       0.0    0.0   0.0         0.0         0.0       0.0  
2     0.0       0.0       0.0    0.0   0.0         0.0         0.0       0.0  
3     0.0       0.0       0.0    0.0   0.0         0.0         0.0       0.0  
4     0.0       0.0       0.0    0.0   0.0         0.0         0.0       0.0  

[5 rows x 3000 columns]     10  100   11   12   15   16   20  

## Cross validation of different models

### Logistic Regression

In [None]:
# this model was used for testing, trains the model in increments of data

model1 = LogisticRegression(C=.5, max_iter=900, multi_class='multinomial', warm_start=True, solver='saga')

# Visit the rows in a fixed shuffled order. The printed y_train starts with
# rating 1 and ends with rating 5, so contiguous slices would presumably hold
# only some of the classes; shuffling gives every chunk a mix of ratings.
shuffled_rows = y_train.sample(frac=1, random_state=0).index

chunk = 15000
# Ceiling division so the final, shorter chunk is trained on too
# (floor division silently skipped every row past the last full chunk).
num_chunks = -(-len(shuffled_rows) // chunk)
for i in range(num_chunks):
    start = i * chunk
    end = (i + 1) * chunk
    chunk_rows = shuffled_rows[start:end]

    X_chunk = X_train.loc[chunk_rows]
    y_chunk = y_train.loc[chunk_rows]

    # warm_start=True makes this fit start from the previous chunk's
    # coefficients; the optimisation itself only sees the current chunk.
    model1.fit(X_chunk, y_chunk)
predictions = model1.predict(X_test)

submission = pd.DataFrame({'pred': predictions, 'id': test_df['id']})

submission.to_csv('output.csv', index=False)

In [None]:
# Multinomial logistic regression (saga solver) fitted on the whole training set.
model1 = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    C=0.5,
    max_iter=900,
    warm_start=True,
)
model1.fit(X_train, y_train)

# Predict the test set and write the predictions out as the submission file.
predictions = model1.predict(X_test)
submission = pd.DataFrame({'pred': predictions, 'id': test_df['id']})
submission.to_csv('/content/drive/Shareddrives/CS74/Project3/output.csv', index=False)

# Out-of-fold predictions from 5-fold cross-validation, scored against the true labels.
predicted1 = cross_val_predict(model1, X_train, y_train, cv=5)
conf_matrix = confusion_matrix(y_train, predicted1)
macro_f1 = f1_score(y_train, predicted1, average='macro')
accuracy = accuracy_score(y_train, predicted1)

print(conf_matrix, macro_f1, accuracy)



[[4084 1169  361  164  179]
 [1493 2815 1056  397  198]
 [ 526 1121 2891 1037  287]
 [ 246  433  941 3064 1085]
 [ 217  210  246  942 4027]] 0.577291909045036 0.5783343040186372


In [None]:
# Out-of-fold class probabilities from 5-fold cross-validation.
predicted1 = cross_val_predict(
    model1, X_train, y_train, method='predict_proba', cv=5
)

# Macro-averaged one-vs-rest ROC AUC across the classes.
roc_auc = roc_auc_score(y_train, predicted1, average='macro', multi_class='ovr')
print(roc_auc)

In [None]:
from sklearn.model_selection import GridSearchCV

# One sub-grid per solver. model1 is built with multi_class='multinomial',
# which the liblinear solver does not support, so liblinear is paired with
# one-vs-rest instead; otherwise every liblinear candidate fails to fit.
#
# warm_start is not searched: GridSearchCV fits a fresh clone of the estimator
# for every candidate and fold, so the flag cannot change any score.
#
# A wider grid is kept here for reference:
#     solver in ['lbfgs', 'saga', 'liblinear'], C in [.0001, .001, .01, .1, 1, 10]
param_grid = [
    {
        'solver': ['lbfgs'],
        'multi_class': ['multinomial'],
        'C': [.1, 1],
        'max_iter': [800],
    },
    {
        'solver': ['liblinear'],
        'multi_class': ['ovr'],
        'C': [.1, 1],
        'max_iter': [800],
    },
]
grid_search = GridSearchCV(model1, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [None]:
# Run the cross-validated search over the logistic-regression grid.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Winning parameter combination and the estimator built with it.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

In [None]:
# Show the best parameters and the best estimator, separated by a divider line.
print(best_params, "-----", best_model, sep="\n")



### Naive Bayes

In [None]:
# Gaussian Naive Bayes fitted on the whole training set.
model2 = GaussianNB()
model2.fit(X_train, y_train)
predictions = model2.predict(X_test)

# Same submission layout as the logistic-regression cells: a 'pred' column,
# written to the Project3 folder the data is read from (this cell previously
# used 'preds' and a Project2 path).
submission = pd.DataFrame({'pred': predictions, 'id': test_df['id']})

submission.to_csv('/content/drive/Shareddrives/CS74/Project3/output1.csv', index=False)

# Out-of-fold predictions from 5-fold cross-validation.
predicted2 = cross_val_predict(model2, X_train, y_train, cv=5)
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_train, predicted2)

# Calculate macro F1 score
macro_f1 = f1_score(y_train, predicted2, average='macro')

# Calculate accuracy
accuracy = accuracy_score(y_train, predicted2)

print(conf_matrix, macro_f1, accuracy)

[[3549 1283  476  342  307]
 [1489 2470 1056  603  341]
 [ 630 1168 2409 1142  513]
 [ 337  529  874 2739 1290]
 [ 296  286  333 1036 3691]] 0.5072961369560705 0.5090273733255678


In [None]:
# Out-of-fold class probabilities from 5-fold cross-validation.
predicted2 = cross_val_predict(
    model2, X_train, y_train, method='predict_proba', cv=5
)

# Macro-averaged one-vs-rest ROC AUC across the classes.
roc_auc = roc_auc_score(y_train, predicted2, average='macro', multi_class='ovr')
print(roc_auc)

0.7828898522145361


### K-Nearest Neighbors

In [None]:
# 7-nearest-neighbours classifier with uniform vote weights.
model3 = KNeighborsClassifier(n_neighbors=7, weights='uniform', algorithm='auto')
model3.fit(X_train, y_train)

# Out-of-fold predictions from 5-fold cross-validation, scored against the true labels.
predicted3 = cross_val_predict(model3, X_train, y_train, cv=5)
conf_matrix = confusion_matrix(y_train, predicted3)
macro_f1 = f1_score(y_train, predicted3, average='macro')
accuracy = accuracy_score(y_train, predicted3)

print(conf_matrix, macro_f1, accuracy)

[[2156  169   72  165 3395]
 [1230  939  100  209 3481]
 [1064  206  928  376 3288]
 [ 866  211  121 1404 3167]
 [ 748   85   71  375 4363]] 0.31571633758748413 0.33540032203912434


In [None]:
# Out-of-fold class probabilities from 5-fold cross-validation.
predicted3 = cross_val_predict(
    model3, X_train, y_train, method='predict_proba', cv=5
)

# Macro-averaged one-vs-rest ROC AUC across the classes.
roc_auc = roc_auc_score(y_train, predicted3, average='macro', multi_class='ovr')
print(roc_auc)

0.6469053132375018


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the k-nearest-neighbours search: neighbourhood size,
# vote weighting, and the neighbour-lookup algorithm.
param_grid = dict(
    n_neighbors=list(range(4, 8)),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'kd_tree', 'ball_tree', 'brute'],
)

# 5-fold search scored on accuracy, using all available cores.
grid_search2 = GridSearchCV(
    estimator=model3,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search over the k-nearest-neighbours grid.
grid_search2.fit(X=X_train, y=y_train)

In [None]:
# Winning parameter combination and the estimator built with it.
best_params2, best_model2 = grid_search2.best_params_, grid_search2.best_estimator_

In [None]:
# Show the best parameters and the best estimator, separated by a divider line.
print(best_params2, "-----", best_model2, sep="\n")

{'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'uniform'}
-----
KNeighborsClassifier(n_neighbors=7)
