# 1 Introduction to Natural Language Processing.

In [217]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
# import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# from textblob import TextBlob
# from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import math
import warnings
warnings.filterwarnings('ignore')


I used https://www.kaggle.com/code/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews this link to learn and implement this task.

#### In this lab, you will be working with the IMBD movie review dataset to perform various natural language processing tasks. Using the provided dataset, you will need to:

##### Loading the dataset

In [218]:
#importing the training data
imdb_data=pd.read_csv('imdb_dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [219]:
#Summary of the dataset
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [220]:
#sentiment count
# Count the number of positive and negative classes in the dataset
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

<H3> Due to extreme computational requirements, I am running this on a dataset of 1000 sentences.

In [221]:
imdb_data = imdb_data.iloc[:1000,:]

### 1. Perform tokenization on the review text.

In [222]:
#Tokenization of text
tokenizer=ToktokTokenizer()
text = tokenizer.tokenize(imdb_data['review'])
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')
print("Some stop words are:\n")
stopword_list[0:10]

Some stop words are:



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### 2. Remove stop words from the tokenized text.

In [223]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
    
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

### 3. Use regular expressions to clean the text, removing any HTML tags, emails, and other unnecessary information.

In [224]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text


In [225]:
#Apply functions on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

In [226]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

### 4. Convert the cleaned data into a TF-IDF and BOW representation from scratch.

 BoW and TF-IDF are techniques that help us convert text sentences into numeric vectors.

<h2> BoW model

In [227]:
df = pd.read_csv('imdb_dataset.csv')['review'][:1000]
df.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [228]:
def tokenize(sentences):
    words = []
    for sentence in sentences:
        w = word_extraction(sentence)
        words.extend(w)
        
    words = sorted(list(set(words)))
    return words

def word_extraction(sentence):
    ignore = ['a', "the", "is"]
    words = re.sub("[^\w]", " ",  sentence).split()
    cleaned_text = [w.lower() for w in words if w not in ignore]
    return cleaned_text    
    
def generate_bow(allsentences):    
    words = word_extraction(allsentences)
    bag_vector = np.zeros(len(words))
    for w in words:
        for i,word in enumerate(words):
            if word == w: 
                bag_vector[i] += 1
                    
    print("\n{0}\n".format(np.array(bag_vector)))

In [229]:
df[100]

"This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story."

In [230]:
generate_bow(df[100])


[4. 2. 4. 3. 1. 1. 2. 1. 1. 1. 2. 2. 1. 1. 1. 3. 1. 2. 1. 1. 1. 1. 1. 2.
 2. 4. 1. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 1. 4. 1. 1. 3. 2. 1.
 1. 1. 1. 2. 1. 1. 3. 1. 1. 2. 4. 4. 2. 1. 1. 1. 2. 1. 1. 2. 1. 1. 1. 2.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 5. 2. 2. 3. 1. 1. 1. 2. 1. 1. 2. 2. 1. 2. 5.
 2. 1. 1. 5. 2. 1. 1. 1. 1. 1. 1. 1. 4. 4. 1. 1. 1. 1. 1. 1. 1. 5. 3. 1.
 5. 2. 1. 2. 1. 1. 1.]



<h2> TF-IDF model

In [231]:
sentences = []
word_set = []
# Finding all the unique words from the 
for sent in imdb_data.review:
    words = [word.lower() for word in word_tokenize(sent) if word.isalpha()]
    sentences.append(words)
    for word in words:
        if word not in word_set:
            word_set.append(word)

In [232]:
# Creating an index for each word in our vocab.
index_dict = {} #Dictionary to store index for each word
i = 0
for word in word_set:
    index_dict[word] = i
    i += 1

In [233]:
#Create a count dictionary
def count_dict(sentences):
    word_count = {}
    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count
 
word_count = count_dict(sentences)

### Term Frequency (TF)
Let’s first understand Term Frequent (TF). It is a measure of how frequently a term, t, appears in a document, d:<br>

Term Frequency (tf) formula = (n is the number of times the term “t” appears in the document “d”)/(total no of terms in teh document)

In [234]:
#Term Frequency
def termfreq(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance/N

### Inverse Document Frequency (IDF)
IDF is a measure of how important a term is. We need the IDF value because computing just the TF alone is not sufficient to understand the importance of words:<br>
IDF(‘word’) =  log(number of documents/number of documents containing the word ‘word’)

In [235]:
#Inverse Document Frequency
 
def inverse_doc_freq(word):
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(len(sentences)/word_occurance)

Vectorize the words

In [236]:
def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))
    for word in sentence:
        tf = termfreq(sentence,word)
        idf = inverse_doc_freq(word)
        value = tf*idf
        tf_idf_vec[index_dict[word]] = value 
    return tf_idf_vec

In [237]:
#TF-IDF Encoded text corpus
vectors = []
for sent in sentences:
    vec = tf_idf(sent)
    vectors.append(vec)
print(vectors)

[array([0.00367888, 0.01580995, 0.01580995, ..., 0.        , 0.        ,
       0.        ]), array([0.00689261, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0.00722478, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0.02855509, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0.01110476, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0.00778775, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0.01934377, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0.01199314, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0.00263007, 0.        , 0.01130273, ..., 0.        , 0.        ,
       0.        ]), array([0.,