In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import nltk
from textblob import Word, TextBlob
from wordcloud import WordCloud

## Loading the dataset

In [2]:
df = pd.read_csv('IMDB_Dataset.csv')
df_copy = df.copy()
df_copy.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
print(f"The total row in dataset is: {len(df)} \nAnd column is: {len(df.columns)}")

The total row in dataset is: 50000 
And column is: 2


## Now data cleaning.
**Remove HTML tags if it has**

In [4]:
import re
def html_remove(text):
    tags_html = re.sub(r'<.*?>|\d+', '', text)
    return tags_html
colmn_names  = ['review']
for html in colmn_names:
    df[html] =  df[html].apply(html_remove)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


**if there is url it has to be remove**

In [5]:
def url_remove(text):
    #url_tag = re.sub(r'http\S+|www\S+|https\S+', '', text)
    url_tag = re.sub(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*", '', text)
    return url_tag

colmn_names
for url in colmn_names:
    df[url] = df[url].apply(url_remove)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


**Puncuation Removing**

In [7]:
def punc_remov(text):
    puncremove = re.sub(r'[^\w\s]', '', text)
    return puncremove

colmn_names
for col_name in colmn_names:
    df[col_name] = df[col_name].apply(punc_remov)
    
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [8]:
df['review'].iloc[0]

'One of the other reviewers has mentioned that after watching just  Oz episode youll be hooked They are right as this is exactly what happened with  first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the  is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to  Muslims gangstas Latinos Christians Italians Irish and  scuffles death stares dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare Forget pretty pictures painted for mainstream au

**check the words is lowercase or not if not then i will convert into lower.**

In [10]:
import string
def lowercase(df, col_names):
    for col_name in col_names:
        df[col_name] = df[col_name].str.lower()
    return df
colmn_names
df = lowercase(df, colmn_names)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# Tokenization

- it uses to convert sentence into words.

In [11]:
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
from spacy.lang.en import English # for word tokenize
from spacy.lang.en import English # for sentence tokenize

In [12]:
def token(text):
    word = word_tokenize(text)
    return word

for col in colmn_names:
    df[col] = df[col].apply(token)
    
df.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",positive
1,"[a, wonderful, little, production, the, filmin...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, theres, a, family, where, a, littl...",negative
4,"[petter, matteis, love, in, the, time, of, mon...",positive


# Remove stopword
- stop word -> a, an, the, etc

In [13]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ak352\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ak352\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
def remove_stopword(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words
colmn_names
for col in colmn_names:
    df[col] = df[col].apply(lambda x: remove_stopword(x))

In [15]:
df.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, oz, epis...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


## Stemming
- some tenchinique of stemming.
   - porterstemmer
   - Lancaster Stemmer
   - Snowball Stemmer

In [16]:
# from nltk.stem import PorterStemmer
# def stemmings(text):
#     port_stemmer = PorterStemmer()
#     stem_word = [port_stemmer.stem(txt) for txt in text]
#     return stem_word

# colmn_names
# for col in colmn_names:
#     df[col] = df[col].apply(stemmings)

In [17]:
# from nltk.stem import SnowballStemmer
# def perform_stemming(words):
#     snowball_stemmer = SnowballStemmer("english")
#     stemmed_words = [snowball_stemmer.stem(word) for word in words]
#     return stemmed_words
# for col in colmn_names:
#     df[col] = df[col].apply(perform_stemming)

# Lematization
- many techinique of lemmatization
   - WordNet Lemmatizer
   - Spacy Lemmatization
   - NLTK Lemmatization
   - Stanford Lemmatizer etc

In [18]:
# pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl

## WordNetLemmatizer 

In [19]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [20]:
def lemmatize(word):
    doc = nlp(str(word))  # Ensure the input is a string
    lemma_text = ' '.join([token.lemma_ for token in doc])
    return lemma_text

colmn_names
for col in colmn_names:
    df[col] = df[col].apply(lemmatize)

**Cleaned data**

In [23]:
df.head()

Unnamed: 0,review,sentiment
0,"[ ' one ' , ' reviewer ' , ' mention ' , ' wat...",positive
1,"[ ' wonderful ' , ' little ' , ' production ' ...",positive
2,"[ ' thought ' , ' wonderful ' , ' way ' , ' sp...",positive
3,"[ ' basically ' , ' there s ' , ' family ' , '...",negative
4,"[ ' petter ' , ' matteis ' , ' love ' , ' time...",positive


**Raw data**

In [24]:
df_copy.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
