In [1]:
# Important Library

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import contractions
from unidecode import unidecode
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from autocorrect import Speller

In [2]:
# Getting Data

data = pd.read_csv("Train.csv")
data.head(2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [3]:
data['text'][0]

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.'

In [4]:
# Preprocessing of the data

def remove_spaces(data):
    clean_text = data.replace('\\n',' ').replace('\t',' ').replace('\\',' ')
    return clean_text

def expand_text(data):
    expanded_text = contractions.fix(data)
    return expanded_text
    
def handling_accented(data):
    fixed_text = unidecode(data)
    return fixed_text

stopword_list = stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.remove('nor')

def clean_data(data):
    tokens = word_tokenize(data)
    clean_text = [word.lower() for word in tokens if (word not in punctuation) and (word.lower() not in stopword_list) and (len(word)>2) and (word.isalpha())]
    return clean_text

def autocorrection(data):
    spell = Speller(lang='en')
    corrected_text= spell(data)
    return corrected_text

def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    final_data=[]
    for word in data:
        lemmatized_word = lemmatizer.lemmatize(word)
        final_data.append(lemmatized_word)
    return ' '.join(final_data)    

In [5]:
x_train,x_test,y_train,y_test = train_test_split(data.text,data.label,test_size=0.25,random_state=42)

In [6]:
clean_text_train = x_train.apply(remove_spaces)
clean_text_test = x_test.apply(remove_spaces)

clean_text_train = clean_text_train.apply(expand_text)
clean_text_text = clean_text_test.apply(expand_text)

clean_text_train = clean_text_train.apply(handling_accented)
clean_text_test = clean_text_test.apply(handling_accented)

clean_text_train = clean_text_train.apply(clean_data)
clean_text_test = clean_text_test.apply(clean_data)

clean_text_train = clean_text_train.apply(lemmatization)
clean_text_test = clean_text_test.apply(lemmatization)

In [7]:
clean_text_train

26898    fifth grade language art teacher read book stu...
27635    low budget brit pop melodrama focus girl want ...
3036     well watched movie little year ago pulled dust...
5604     would almost give however confusing part well ...
36111    full length feature film world bridge found fi...
                               ...                        
6265     movie one worst movie ever seen life waste tim...
11284    movie inspiring anyone tough jam whether finan...
38158    east side story documentary musical comedy sta...
860      one boot one point doctor assistant refers wor...
15795    movie horrible lighting terrible camera moveme...
Name: text, Length: 30000, dtype: object

In [8]:
# EDA
# ngrams

from nltk.util import ngrams
def splitting_dataframe(data):
    tokens = data.split()
    return tokens
data = clean_text_test.apply(splitting_dataframe)

def ngram_list(data,ngram_range):
    ngram = ngrams(data,ngram_range) 
    ngram_list1 = []
    for ngram1 in ngram:
        ngram_list1.append(' '.join(ngram1))
    return ngram_list1    

unigrams = data.apply(lambda x : ngram_list(x,1))

In [9]:
unigrams

32823    [central, theme, movie, seems, confusion, rela...
16298    [excellent, example, cowboy, noir, called, une...
28505    [ending, made, heart, jump, throat, proceeded,...
6689     [chosen, one, appreciate, quality, story, char...
26893    [really, funny, film, especially, second, thir...
                               ...                        
29415    [film, came, gift, offering, blue, unlike, rev...
11359    [first, started, watching, movie, looking, kin...
575      [big, mark, music, neil, young, glowing, prais...
17398    [watching, lady, ermine, wondering, betty, gra...
4189     [crappy, miserably, acted, movie, based, subli...
Name: text, Length: 10000, dtype: object

In [10]:
Bigrams = data.apply(lambda x : ngram_list(x,2))
Bigrams

32823    [central theme, theme movie, movie seems, seem...
16298    [excellent example, example cowboy, cowboy noi...
28505    [ending made, made heart, heart jump, jump thr...
6689     [chosen one, one appreciate, appreciate qualit...
26893    [really funny, funny film, film especially, es...
                               ...                        
29415    [film came, came gift, gift offering, offering...
11359    [first started, started watching, watching mov...
575      [big mark, mark music, music neil, neil young,...
17398    [watching lady, lady ermine, ermine wondering,...
4189     [crappy miserably, miserably acted, acted movi...
Name: text, Length: 10000, dtype: object

In [11]:
Trigrams = data.apply(lambda x : ngram_list(x,3))
Trigrams

32823    [central theme movie, theme movie seems, movie...
16298    [excellent example cowboy, example cowboy noir...
28505    [ending made heart, made heart jump, heart jum...
6689     [chosen one appreciate, one appreciate quality...
26893    [really funny film, funny film especially, fil...
                               ...                        
29415    [film came gift, came gift offering, gift offe...
11359    [first started watching, started watching movi...
575      [big mark music, mark music neil, music neil y...
17398    [watching lady ermine, lady ermine wondering, ...
4189     [crappy miserably acted, miserably acted movie...
Name: text, Length: 10000, dtype: object

In [16]:
# Feature Engineering
# Count Vectorizer

count = CountVectorizer(max_df=0.95,max_features=1000)
count_val_train = count.fit_transform(clean_text_train)
count_val_test = count.transform(clean_text_test)

In [17]:
pd.DataFrame(count_val_train.A,columns=count.get_feature_names())

Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
29998,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [71]:
# Model Building By using Navie Bayes Classifier


count_mnb = MultinomialNB()
count_mnb.fit(count_val_train.A,y_train)
predict_count = count_mnb.predict(count_val_test.A)
accuracy_count = accuracy_score(y_test,predict_count)*100
accuracy_count

83.11

In [19]:
tfidf = TfidfVectorizer(max_df=0.95,max_features=1000)
tfidf_train = tfidf.fit_transform(clean_text_train)
tfidf_test = tfidf.transform(clean_text_test)

In [20]:
pd.DataFrame(tfidf_train.A,columns= tfidf.get_feature_names())

Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.182468,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.055835,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.137926,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.321531,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.000000,0.000000,0.0,0.135608,0.0,0.088057,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.074143,0.0,0.0,0.0,0.0,0.0,0.0
29998,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.050570,0.0,0.0,...,0.07508,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
tfidf_mnb = MultinomialNB()
tfidf_mnb.fit(tfidf_train.A,y_train)
predict_tfidf = tfidf_mnb.predict(tfidf_test.A)
accuracy_tfidf = accuracy_score(y_test,predict_tfidf)*100
accuracy_tfidf

83.74000000000001