In [10]:
## Stopwords, punctuation marks, numerical values, special characters

In [4]:
import pandas as pd
import re
import string
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [5]:
df = pd.read_csv('C:/Users/Aaryan/Desktop/Random/Datasets/IMDB Dataset.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [9]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

---

## Preprocessing the text 

In [10]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    ## Converts the text to lowercase
    text = text.lower()

    ## Removing HTML tags from the text
    text = re.sub(r'<[^>]+>', '', text)

    ## Removing the special characters
    text = text.translate(str.maketrans('', '', string.punctuation))

    ## Removing the numerical characters
    text = re.sub(r'\d+', '', text)

    ## Removing the URLs
    text = re.sub(r'http\S+www\S+https\S+', '', text)

    # ## Removing non-alphanumeric characters
    # text = re.sub(r'^[a-z\s]', '', text)

    ## Removing the extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    ## Removing the stop words from the text
    text = ' '.join(word for word in text.split() if word not in stop_words)
    
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aaryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df['cleaned_reviews'] = df['review'].apply(clean_text)

In [12]:
df.head(10)

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
5,"Probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,Encouraged by the positive comments about this...,negative,encouraged positive comments film looking forw...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


---

## Pretrained Model

In [15]:
import gensim.downloader as api
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize

In [23]:
wv_pretrained = api.load("word2vec-google-news-300")



In [24]:
wv_pretrained.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]

---

## Morning - Sunrise + Sunset ~ Evening

In [25]:
wv_pretrained.most_similar(positive=["morning", "sunset"], negative=["sunrise"])

[('afternoon', 0.6661120653152466),
 ('evening', 0.5867564082145691),
 ('night', 0.5096079707145691),
 ('midday', 0.4837777018547058),
 ('Friday', 0.47944194078445435),
 ('moring', 0.4663405120372772),
 ('Monday', 0.46335235238075256),
 ('lunchtime', 0.4572029113769531),
 ('Thursday', 0.4479769468307495),
 ('week', 0.4470955729484558)]

---

## Cat - Meow + Bark ~ Dog

In [26]:
wv_pretrained.most_similar(positive=["cat", "bark"], negative=["meow"])

[('dog', 0.5506407618522644),
 ('cats', 0.5125732421875),
 ('pet', 0.49706852436065674),
 ('puppy', 0.49097704887390137),
 ('pup', 0.47735899686813354),
 ('fox', 0.476655513048172),
 ('snake', 0.46881598234176636),
 ('tabby', 0.46012791991233826),
 ('dogs', 0.458560049533844),
 ('Doberman', 0.4543886184692383)]

---

## Painter - Brush + Pen ~ Writer

In [27]:
wv_pretrained.most_similar(positive=["painter", "pen"], negative=["brush"])

[('printmaker', 0.5255544781684875),
 ('artist', 0.5227867364883423),
 ('illustrator', 0.5209603905677795),
 ('lithographer', 0.5072831511497498),
 ('impressionist_painter', 0.5058237910270691),
 ('typographer', 0.4958345890045166),
 ('painter_illustrator', 0.4778297245502472),
 ('calligrapher', 0.477349191904068),
 ('poet', 0.47586342692375183),
 ('expressionist_painter', 0.4719168543815613)]

---

## Summer - Hot + Cold ~ Winter

In [30]:
wv_pretrained.most_similar(positive=["summer", "cold"], negative=["hot"])

[('winter', 0.6970129013061523),
 ('autumn', 0.5658621191978455),
 ('spring', 0.5643627047538757),
 ('winters', 0.5315064191818237),
 ('wintertime', 0.511982262134552),
 ('summers', 0.5104738473892212),
 ('summertime', 0.510020911693573),
 ('midwinter', 0.5091511607170105),
 ('springtime', 0.493204802274704),
 ('warmer_weather', 0.4870986044406891)]

---

## Custom Skipgram Model

In [13]:
cleaned_reviews = df['cleaned_reviews'].apply(lambda x: x.split()).tolist()

In [17]:
skip_gram_model = Word2Vec(sentences = cleaned_reviews, vector_size=300, window=5, min_count=5, sg=1)

In [18]:
skip_gram_model.save("skipgram_word2vec.model")

In [19]:
# Convert reviews to vectors by averaging word vectors
def vectorize_reviews(reviews, model):
    vectors = []
    for review in reviews:
        vectors.append(np.mean([model.wv[word] for word in review if word in model.wv], axis=0))
    return np.array(vectors)

In [22]:
X = vectorize_reviews(df['cleaned_reviews'].apply(lambda x: x.split()).tolist(), skip_gram_model)
y = df['sentiment'].values

In [23]:
X = np.nan_to_num(X)

In [24]:
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(xtr, ytr)

In [26]:
ypr = rfc.predict(xts)
print(classification_report(yts, ypr))

              precision    recall  f1-score   support

    negative       0.86      0.83      0.84      4961
    positive       0.83      0.86      0.85      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



---

## Custom CBoW Model

In [28]:
cbow_model = Word2Vec(sentences = cleaned_reviews, vector_size=300, window=5, min_count=5, sg=0)

In [29]:
cbow_model.save("cbow_word2vec.model")

In [30]:
X = vectorize_reviews(df['cleaned_reviews'].apply(lambda x: x.split()).tolist(), cbow_model)

In [31]:
X = np.nan_to_num(X)

In [32]:
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(xtr, ytr)

In [36]:
ypr = rfc.predict(xts)
print(classification_report(yts, ypr))

              precision    recall  f1-score   support

    negative       0.85      0.82      0.84      4961
    positive       0.83      0.86      0.84      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



---

## Pretrained word2vec model

In [37]:
from gensim.models import KeyedVectors

In [38]:
pretrained_model = KeyedVectors.load_word2vec_format('path/to/GoogleNews-vectors-negative300.bin', binary=True)

FileNotFoundError: [Errno 2] No such file or directory: 'path/to/GoogleNews-vectors-negative300.bin'

In [None]:
X = vectorize_reviews(df['cleaned_reviews'].apply(lambda x: x.split()).tolist(), pretrained_model)

In [None]:
X = np.nan_to_num(X)

In [None]:
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(xtr, ytr)

In [None]:
ypr = clf.predict(xts)
print(classification_report(yts, ypr))

----

## Tuned Skipgram Model

In [39]:
skip_gram_model = Word2Vec(sentences=cleaned_reviews, vector_size=200, window=10, min_count=2, sg=1)
skip_gram_model.save("tuned_skipgram_word2vec.model")

In [40]:
X = vectorize_reviews(df['cleaned_reviews'].apply(lambda x: x.split()).tolist(), skip_gram_model)

In [41]:
X = np.nan_to_num(X)

In [42]:
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(xtr, ytr)

In [44]:
ypr = rfc.predict(xts)
print(classification_report(yts, ypr))

              precision    recall  f1-score   support

    negative       0.87      0.84      0.85      4961
    positive       0.84      0.88      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



---

## Tuned CBoW model

In [45]:
cbow_model = Word2Vec(sentences=cleaned_reviews, vector_size=200, window=10, min_count=2, sg=0)
cbow_model.save("tuned_cbow_word2vec.model")

In [46]:
X = vectorize_reviews(df['cleaned_reviews'].apply(lambda x: x.split()).tolist(), cbow_model)

In [47]:
X = np.nan_to_num(X)

In [48]:
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(xtr, ytr)

In [50]:
ypr = rfc.predict(xts)
print(classification_report(yts, ypr))

              precision    recall  f1-score   support

    negative       0.86      0.83      0.84      4961
    positive       0.84      0.87      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



----

## **Final Results**

| Model                    | Vector Size | Window Size | Min Count | Accuracy | Precision | Recall | F1-Score |
|:------------------------:|:-----------:|:-----------:|:---------:|:--------:|:---------:|:------:|:--------:|
| Custom Skip-gram   | 300         | 5           | 5         | 0.84     | 0.83       | 0.86    | 0.85      |
| Custom CBOW        | 300    | 5   | 5  | 0.84      | 0.83       | 0.86    | 0.84      |
| Pretrained Word2Vec| 300    | N/A | N/A | ...      | ...       | ...    | ...      |
| Tuned Skip-gram    | 200 | 10    |  2  | 0.86  | 0.84    | 0.88    | 0.86     |
| Tuned CBOW               | 200 | 10 | 2  | 0.85 | 0.84   | 0.87 | 0.85      |


---