## importing libraries

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import string 
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import WordNetLemmatizer
import re
from gensim.models import KeyedVectors
# import os
import pathlib

In [117]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Loading datasets 

In [118]:
dataset = pd.read_csv("train_sentiment.csv")
dataset.drop(dataset.columns[0] , axis=1, inplace=True)
dataset.iloc[1200]["review"]

"Ok,Badhiya h..,The attachments could have been a bit longer,This beater is less than half the price of regular branded ones but seems to do the job well.Not sure about heavy duty use but extremely handy for whisking eggs, ice cream, meringue, or light cake batters.This is the exact same beater that seems to be available under many names on this site with prices ranging between 430 to 599 (as of Dec 2022) so if buying pick the cheapest available price like I did.It doesn't come with any warranty, but I took a chance and seems to have paid off.,Ko,Have been using it for months and still it works smoothly. It's very easy to use and clean and is effective. Go for it,It's one of the most Convenient product and multiple use. It's durable and easy to use. I really like this product. 10/10.Must buy it.,Great product! Value for money. Go for it!"

## preprocessing

In [119]:
ls = stopwords.words('english')
regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
url_pattern = re.compile(r'https?://\S+|www\.\S+') # Removing urls from texts
# print(emoji_pattern.sub(r'', text)) # no emoji
def transform(s):
    s= s.lower()
    s = regrex_pattern.sub(r'',s) # removing emojis from strings
    s = url_pattern.sub(' ' , s)
    for item in string.punctuation:
        s = s.replace(item , " ")
    # no emoji
    
    # s = list(map(lambda x : "" if x in ls else x , s) )
    s = s.split()
    
    s = list(filter(lambda x : x not in ls , s))
    s = " ".join(s)
    return s
dataset["review"] = dataset["review"].apply(transform)  
# dataset["review"]  
final_dataset = dataset

    
    

In [120]:
def transformation(s): # Lemmatizing strings
    lemmatizer = WordNetLemmatizer()
    s = s.split()
    s = [lemmatizer.lemmatize(w) for w in s]
    return " ".join(s)
final_dataset["review"] = final_dataset["review"].apply(transformation)
# final_dataset
final_dataset["review"].iloc[1200]


'ok badhiya h attachment could bit longer beater le half price regular branded one seems job well sure heavy duty use extremely handy whisking egg ice cream meringue light cake batter exact beater seems available many name site price ranging 430 599 dec 2022 buying pick cheapest available price like come warranty took chance seems paid ko using month still work smoothly easy use clean effective go one convenient product multiple use durable easy use really like product 10 10 must buy great product value money go'

In [121]:
y = pd.DataFrame(final_dataset["rating"])
y.iloc[1279]["rating"] = 0
# for i in range(len(y)):
#     print(i)
#     float(y.iloc[i])
y["rating"] = y["rating"].apply(lambda x : 0 if float(x)<3 else 1) # changing labels into two classes
# final_dataset.drop(columns=["rating"] , inplace=True)

# y.dtypes

## TF-IDF

In [122]:
X = final_dataset["review"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X.values)
np.unique(y.values)

array([0, 1], dtype=int64)

In [123]:

X_train , X_test , y_train , y_test = train_test_split(X ,  y ,random_state=42 , train_size=0.8)    


## with logistic regression

In [124]:
model = LogisticRegression()
params = {"penalty": ['l1', 'l2'] ,
          "dual": [True , False] , 
          "fit_intercept": [True , False] ,
          "n_jobs": [5,10,15]
          }
grid = GridSearchCV(model , param_grid=params  , cv = 5)
# model.fit(X_train , y_train)
# y_pred = model.predict(X_test)
grid.fit(X_train , y_train)
y_pred = grid.predict(X_test)


f1_score(y_test, y_pred , average="weighted")
# grid = GridSearchCV(estimator=model ,param_grid=[{'penalty':['l1','l2']}, 
#               {'C':[1, 10, 100, 1000]}] , scoring="accuracy" , cv= 5)
# grid.fit(X_train , y_train)
# y_pred_new = grid.predict(X_test)
# f1_score(y_test, y_pred_new , average="weighted")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
90 fits fa

0.9948834631428488

## with random forrest

In [125]:
model = RandomForestClassifier()
# params = {"n_estimators" : [20 , 50 , 100 , 150] , "criterion" :['gini', 'entropy', 'log_loss'] , "max_depth" : [2,3,4,5,] ,  }
params = {"n_estimators" : [50 ,100 ,150] , "criterion" :['gini', 'entropy', 'log_loss'] ,}
grid = GridSearchCV(estimator=model ,param_grid=params , cv = 5)
grid.fit(X_train , y_train)
y_pred =grid.predict(X_test)

# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

0.9948834631428488

## with knn

In [126]:
model = KNeighborsClassifier()
params = {"n_neighbors" : [2,3,4,],
          "weights": ['uniform', 'distance'],
          "algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid = GridSearchCV(estimator=model , param_grid=params , cv = 5)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
grid.fit(X_train , y_train)
y_pred =grid.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.9948834631428488

## Word2vec

In [127]:
'''
for running this block properly You have to download the GoogleNews-vectors-negative300.bin file .
due to its large size I didnt put it in the project folder !!!!


'''
# # # Path to the downloaded model
# # print(pathlib.Path().resolve())
curr = str(pathlib.Path().resolve()) +'\GoogleNews-vectors-negative300.bin'
# print(curr)
# model_path = 'C:\Users\ma-na\OneDrive\Desktop\data minding final project\GoogleNews-vectors-negative300.bin'
# # # Load the model
word2vec = KeyedVectors.load_word2vec_format(curr, binary=True)
def vectorizing(x):
    x = x.split()
    if x:
        vector = np.mean([word2vec[item] for item in x if item in word2vec])
    else:
        vector = np.zeros(word2vec.vector_size)
    return vector
X = dataset["review"].apply(vectorizing)

# # Example usage
# print(vector)
# TODO : complete this part


In [128]:
X = pd.DataFrame(X)

In [129]:
X_train , X_test , y_train , y_test = train_test_split(X ,  y ,random_state=42 , train_size=0.8)    
X.shape

# vector

(1465, 1)

## with logistic regression

In [130]:
model = LogisticRegression()
params = {"penalty": ['l1', 'l2' , None] ,
        #   "dual": [True , False] , 
        #   "fit_intercept": [True , False] ,
          "n_jobs": [-1]
          }
grid = GridSearchCV(model , param_grid=params  , cv = 5)
# model.fit(X_train , y_train)
# y_pred = model.predict(X_test)
grid.fit(X_train , y_train)
y_pred = grid.predict(X_test)
# model.fit(X_train , y_train)
# y_pred = model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ma-na\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ma-na\AppData\Local\Programs\Python\Python

0.9948834631428488

## with random forrest

In [131]:
model = RandomForestClassifier()
params = {"n_estimators" : [50 ,100 ,150] , "criterion" :['gini', 'entropy', 'log_loss'] ,}
grid = GridSearchCV(estimator=model ,param_grid=params , cv = 5)
grid.fit(X_train , y_train)
y_pred =grid.predict(X_test)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

0.9914587955672378

## with knn

In [132]:
model = KNeighborsClassifier()
params = {"n_neighbors" : [3,5,9,15],
          "weights": ['uniform', 'distance'],
        #   "algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute']
          }
grid = GridSearchCV(estimator=model , param_grid=params , cv = 5)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
grid.fit(X_train , y_train)
y_pred =grid.predict(X_test)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.9948834631428488

In [133]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
X = model.encode(final_dataset["review"])
X

array([[-0.66931844,  0.07005092, -0.03197148, ..., -0.5097832 ,
         0.26397008,  0.46642902],
       [-0.49374557,  0.03867428,  0.29656616, ..., -0.39415205,
        -0.25303924,  0.39240253],
       [-0.88811755,  0.269119  , -0.4385834 , ..., -0.7842308 ,
         1.1376895 ,  0.8294344 ],
       ...,
       [-0.26414293,  0.08282697, -0.09534235, ..., -0.45954412,
        -0.511578  ,  0.16804142],
       [ 0.04979387,  0.2047221 ,  0.29428047, ...,  0.45216683,
         0.15995842,  0.40302965],
       [-0.32677102, -0.08397904, -0.3565794 , ..., -0.4340489 ,
        -0.32576972,  0.5141788 ]], dtype=float32)

In [134]:
X_train , X_test , y_train , y_test = train_test_split(X ,  y ,random_state=42 , train_size=0.8)    
X.shape


(1465, 384)

## with logistic regression

In [135]:
model = LogisticRegression()
params = {"penalty": ['l1', 'l2' , None] ,
        #   "dual": [True , False] , 
        #   "fit_intercept": [True , False] ,
          "n_jobs": [-1]
          }
grid = GridSearchCV(model , param_grid=params  , cv = 5)
# model.fit(X_train , y_train)
# y_pred = model.predict(X_test)
grid.fit(X_train , y_train)
y_pred = grid.predict(X_test)
# model.fit(X_train , y_train)
# y_pred = model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ma-na\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ma-na\AppData\Local\Programs\Python\Python

0.9948834631428488

## with random forrest

In [136]:
model = RandomForestClassifier()
params = {"n_estimators" : [50 ,100 ,150] , "criterion" :['gini', 'entropy', 'log_loss'] ,}
grid = GridSearchCV(estimator=model ,param_grid=params , cv = 5)
grid.fit(X_train , y_train)
y_pred =grid.predict(X_test)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

0.9948834631428488

## with knn

In [137]:
model = KNeighborsClassifier()
params = {"n_neighbors" : [3,5,9,15],
          "weights": ['uniform', 'distance'],
        #   "algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute']
          }
grid = GridSearchCV(estimator=model , param_grid=params , cv = 5)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
grid.fit(X_train , y_train)
y_pred =grid.predict(X_test)
# model.fit(X_train , y_train)
# y_pred =model.predict(X_test)
f1_score(y_test, y_pred , average="weighted")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.9948834631428488