In [None]:
# Task1: Import spacy

In [None]:
import spacy
# Load spaCy's small English pipeline by package name. The result is bound to
# `m`, which none of the cells shown in this notebook reference again.
m = spacy.load('en_core_web_sm')

In [None]:
# Logistic regression - the classification model
# Count & TF-IDF vectorization - turning text into numeric features

In [None]:
# Task2: Download dataset

In [None]:
import pandas as pd
# Read the review dataset from a CSV in the current working directory.
# Later cells rely on its `review` (text) and `sentiment` (label) columns.
df = pd.read_csv('IMDB Dataset.csv')

In [None]:
# After loading, let's inspect the data.
df.head(10)  # first 10 reviews with their sentiment labels

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
# The preview table truncates every review, so display one complete review
# (the row labelled 10) to see what the raw text really looks like.
df.loc[10, 'review']

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"'

In [None]:
# Stemming reduces inflected words to a common stem, which shrinks the vocabulary

In [None]:
#Task 3: Stemming - (to remove suffix)

In [None]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()  # instantiate the stemmer once so it can be reused for every token

In [None]:
def stemmer_tokenize(text):
  """Split `text` on whitespace and return the Porter stem of each piece.

  Punctuation is not separated from words; tokens are exactly what
  `str.split()` produces. Uses the module-level `porter` stemmer.
  """
  stems = []
  for token in text.split():
    stems.append(porter.stem(token))
  return stems

In [None]:
# Quick sanity check of the tokenizer. Stems are not always dictionary words:
# here 'thus' comes back as 'thu'.
stemmer_tokenize('coders like coding and thus they code ')

['coder', 'like', 'code', 'and', 'thu', 'they', 'code']

In [None]:
#remove stopwords

In [None]:
import nltk

# word_tokenize (used in the stop-word example below) needs the Punkt
# tokenizer models. They were never downloaded in this notebook, so on a fresh
# kernel the example fails with LookupError. Recent NLTK releases look for
# 'punkt_tab' while older ones look for 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Stop-word lists for nltk.corpus.stopwords. Kept as the last expression so the
# cell still displays this call's return value.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# An example of removing stop words

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = """This is a sample sentence,
				showing off the stop words filtration."""

# A set makes each membership test below a constant-time lookup.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

# Variant 1 - case-insensitive: lower-case each token before the lookup, so
# capitalised stop words are removed as well.
# (Previously this result was stored in `filtered_sentence` and immediately
# overwritten, so it was never visible.)
filtered_case_insensitive = [w for w in word_tokens if w.lower() not in stop_words]

# Variant 2 - case-sensitive: compare tokens as they are, so a capitalised
# stop word such as 'This' is kept.
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(word_tokens)
print(filtered_sentence)
print(filtered_case_insensitive)


['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [None]:
# Task 4: TF-IDF Vectorizer
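# Raw term frequencies alone are not very informative: words that appear in almost every document carry little signal.
# tf-idf(t, d) = tf(t, d) * idf(t)
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t (scikit-learn's smoothed form, smooth_idf=True)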

In [None]:
# CountVectorizer yields raw integer term counts (binary only with binary=True); TF-IDF re-weights those counts into continuous values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that delegates tokenisation to stemmer_tokenize.
tfidf = TfidfVectorizer(
    tokenizer=stemmer_tokenize,  # custom tokenizer defined earlier in the notebook
    lowercase=False,             # leave the original casing untouched
    strip_accents=None,          # no accent normalisation
    use_idf=True,                # weight term frequencies by inverse document frequency
    smooth_idf=True,             # add-one smoothing of document frequencies
    norm='l2',                   # scale every document vector to unit Euclidean length
)

# Feature matrix for every review and the matching sentiment labels.
# NOTE(review): the vectorizer is fitted on the whole corpus here; if X is
# split into train/test afterwards, the IDF weights have already seen the
# test rows.
X = tfidf.fit_transform(df['review'])
Y = df['sentiment'].values



In [None]:
# Task 6: Document Classification Using Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw reviews rather than the pre-computed matrix X. Fitting TF-IDF
# on the full corpus before splitting lets test documents shape the vocabulary
# and IDF weights, which leaks test-set information into the features.
# The same random_state and test_size are kept, so the row assignment matches
# a split of X. The full-corpus X from the earlier cell is deliberately not
# used from here on.
reviews_train, reviews_test, Y_train, Y_test = train_test_split(
    df.review, Y, random_state=1, test_size=0.5)  # 50% of the data held out for testing

# Learn vocabulary and IDF weights from the training half only, then apply
# that fitted transformation unchanged to the held-out half.
X_train = tfidf.fit_transform(reviews_train)
X_test = tfidf.transform(reviews_test)



In [None]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with built-in cross-validation.
clf = LogisticRegressionCV(cv = 5,               # number of cross-validation folds
                     scoring = 'accuracy',
                     random_state = 0,
                     n_jobs = 1,
                     verbose = 2,
                     max_iter = 300).fit(X_train,Y_train)

# Persist the fitted model with pickle. The context manager guarantees the
# file is flushed and closed even if pickle.dump raises.
with open('saved_model.sav','wb') as saved_model:
    pickle.dump(clf,saved_model)
                  

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.6min finished


In [None]:
# Task8: Model evaluation

In [None]:
filename = 'saved_model.sav'

# Load the pickled classifier. The context manager closes the file handle,
# which the previous inline open() call never did.
# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open(filename,'rb') as model_file:
    saved_clf = pickle.load(model_file)

# Mean accuracy of the reloaded model on the held-out test data.
saved_clf.score(X_test,Y_test)

0.89