In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import seaborn as sns

import zipfile
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , confusion_matrix

# data preprocess

In [36]:
! pip install emoji;

Collecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.1


In [2]:
# i have zip file should extract it
zip_path = '/content/drive/MyDrive/IMDB- sentiment analysis/IMDB Dataset.csv.zip'
extract_to = '/content/'

# Unzipping the file
with zipfile.ZipFile(zip_path , 'r') as zip_ref :
    zip_ref.extractall(extract_to)

# List the files in the directory to verify
os.listdir(extract_to)

['.config', 'IMDB Dataset.csv', 'drive', 'sample_data']

In [3]:
csv_files = [f for f in os.listdir(extract_to) if f.endswith('.csv')]
csv_files

['IMDB Dataset.csv']

In [4]:
df = pd.read_csv('/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.count()

review       50000
sentiment    50000
dtype: int64

In [6]:
# we hav3 50k samples. for do faster , use 10k samples.
df = df.iloc[0:10000]

In [7]:
df.sentiment.value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [8]:
# check has null or not
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
# maybe have some duplicate during crawl or...
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

17
0




```
# Basic Preprocessing
Remove tags
Remove url
lowercase
remove punctuation
remove chat word treatment
remove spelling correction
remove stopwords
handle emoji
stemming
lemmitization
```



In [10]:
# step 1 >> remove tags
df2 = df.copy()

In [11]:
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

df2['review'] = df['review'].apply(remove_tags)

print(df['review'].head())
print('\n')
print(df2['review'].head())
print('\n')

# Check  the original and modified reviews are the same or not
changes = df['review']!=df2['review']
print(changes)
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


0        True
1        True
2        True
3        True
4        True
        ...  
9995    False
9996     True
9997    False
9998    False
9999     True
Name: review, Length: 9983, dtype: bool
Number of rows changed: 5812


In [12]:
# step 2 >> remove url
df3 = df2.copy()

In [13]:
def remove_url(raw_text) :
    pattern = r'https?://\S+|www\.\S+'
    cleaned_text = re.sub(pattern, '', raw_text)
    return cleaned_text

df3['review'] = df2['review'].apply(remove_url)

print(df2['review'].head())
print('\n')
print(df3['review'].head())
print('\n')

# Check  the original and modified reviews are the same or not
changes = df2['review']!=df3['review']
print(changes)
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Name: review, Length: 9983, dtype: bool


Number of rows changed: 26


In [14]:
# step 3 >> convert to lowercase
df3['review'] = df3['review'].apply(lambda x:x.lower())

In [15]:
# step 4 >> remove punctuations
df4 = df3.copy()

In [16]:
df4['review'] = df3['review'].apply(lambda x:x.translate(str.maketrans('','',string.punctuation )) )

# Check  the original and modified reviews are the same or not
changes = df4['review']!=df3['review']
print(changes)
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

0       True
1       True
2       True
3       True
4       True
        ... 
9995    True
9996    True
9997    True
9998    True
9999    True
Name: review, Length: 9983, dtype: bool


Number of rows changed: 9981


In [17]:
# step 5 >> chat word treatment
df5 = df4.copy()

In [18]:
def preprocess_chat_text(text):
    ## chat word treatment
    abbreviation_mapping = {
        "lol": "laugh out loud",
        "brb": "be right back",
        "omg": "oh my god"
        ## Add more mappings as needed
    }

    for abbreviation, expansion in abbreviation_mapping.items():
        text = text.replace(abbreviation, expansion)

    #print(text)

    ## Normalize common misspellings
    misspelling_mapping = {
        "u": "you",
        "gr8": "great",
        ## Add more mappings as needed
    }

    for misspelling, correction in misspelling_mapping.items():
        text = re.sub(misspelling, correction, text)

    #print(text)


    return "".join(text)


## test the function
chat_text = "Hey, lol, brb in a sec :D"
processed_text = preprocess_chat_text(chat_text)
#print(processed_text)


df5['review'] = df4['review'].apply(preprocess_chat_text)
changes = df5['review']!=df4['review']
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")



Number of rows changed: 9977


In [19]:
# step 6 >> correct spelling
df6 = df5.copy()

In [None]:
from textblob import TextBlob

df6['review'] = df5['review'].apply(lambda x: str(TextBlob(x).correct()))


changes = df6['review']!=df5['review']
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

In [31]:
# net step >> stop word
df7 = df6.copy()

In [32]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopword_english = stopwords.words('english')

def remove_stopwords(text) :
    new_text = []
    for t in text.split() :
        if t in stopword_english:
            continue
        else :
            new_text.append(t)
    #print(new_text)
    return ' '.join(new_text)

## test
text = 'hi my name is arman , i live in iran!.'
print(remove_stopwords(text))

df7['review'] = df6['review'].apply(remove_stopwords)


changes = df7['review']!=df6['review']
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


hi name arman , live iran!.


Number of rows changed: 9983


In [34]:
# next step >>  handel emoji
df8 = df7.copy()

In [44]:
# solution 1
def get_emoji_regexp():
    # Sort emoji by length to make sure multi-character emojis
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = '(' + '|'.join(re.escape(u) for u in emojis) + ')'
    return re.compile(pattern)


## test it
exp = get_emoji_regexp()
print(exp.sub(repl='', string="Good morning! 😊🌞"))


df8['review'] = df7['review'].apply(lambda x:exp.sub(repl='', string=x))

changes = df8['review']!=df7['review']
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

Good morning! 


Number of rows changed: 0


In [54]:
# next step >>  stemming
df9 = df8.copy()

In [55]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def Steming(text) :
    new_text = [ps.stem(word) for word in text.split()]
    return ' '.join(new_text)


df9['review'] = df8['review'].apply(Steming)

changes = df8['review']!=df7['review']
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")



Number of rows changed: 0


In [58]:
# next step >> lemmitization
df10 = df9.copy()

In [59]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wl = WordNetLemmatizer()

def Lemitization(text) :
    new_text = [wl.lemmatize(word) for word in text.split()]
    return ' '.join(new_text)

## test
example_text = "The leaves on the tree are drying up and dogs are running faster than cats."
lemmatized_text = Lemitization(example_text)
print("Original Text:", example_text)
print("Lemmatized Text:", lemmatized_text)


df10['review'] = df9['review'].apply(Steming)

changes = df10['review']!=df9['review']
print('\n')
num_changes = changes.sum()
print(f"Number of rows changed: {num_changes}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text: The leaves on the tree are drying up and dogs are running faster than cats.
Lemmatized Text: The leaf on the tree are drying up and dog are running faster than cats.


Number of rows changed: 8793


In [64]:
# finish
df10.to_csv('process_imdb_text.csv')

# model

In [2]:
data = pd.read_csv('/content/process_imdb_text.csv')

In [3]:
data.drop('Unnamed: 0' , inplace=True , axis=1)
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch jyoust 1 oz episod yo...,positive
1,wonderfyoul littl prodyouct film techniqy youn...,positive
2,thoyought wonderfyoul way spend time hot syoum...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visyoual styoun ...,positive


In [4]:
# devide data to feature and target

X = data['review']
Y = data['sentiment']

In [5]:
## label encoding for target

from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
y_encode = lb.fit_transform(Y)
y_encode

array([1, 1, 1, ..., 0, 0, 1])

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_encode, test_size=0.2, random_state=42)

In [7]:
print(X_train.shape)
print(y_train.shape)

(7986,)
(7986,)




```
apply BOW

```



In [15]:
## bag-of-word
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [16]:
X_train_bow.shape

(7986, 54369)

In [17]:
## NAIVE-BAYES ML model
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [18]:
y_pred = gnb.predict(X_test_bow)

In [21]:
print( accuracy_score(y_test,y_pred) )
print( confusion_matrix(y_test,y_pred) )

0.6349524286429644
[[709 276]
 [453 559]]


In [22]:
## Random Forest ML algorithms
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
print( accuracy_score(y_test,y_pred) )
print( confusion_matrix(y_test,y_pred))

0.8477716574862293
[[835 150]
 [154 858]]


In [34]:
## as you see thr RandomForest result is better than NayveBayes
## now let check it the maximum feature of BOW and set a random number

print( len(cv.get_feature_names_out()) ) #54369

## now want to decrease these features to see the  result
cv_3000 = CountVectorizer(max_features=3000)
X_train_bow = cv_3000.fit_transform(X_train).toarray()
X_test_bow = cv_3000.transform(X_test).toarray()
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

54369
0.8367551326990486
[[833 152]
 [174 838]]




```
# N-gram
```



In [35]:
## lets see n-gram

cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.8367551326990486
[[833 152]
 [174 838]]




```
# lets try tf-idf to see result
```



In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tfidf = TfidfVectorizer()

In [10]:
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test)

In [13]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.8462694041061593
[[847 138]
 [169 843]]
