# Applying ML to sentiment analysis

## Preparing the IMDB Movie reviews dataset for sentiment analysis

https://ai.stanford.edu/~amaas/data/sentiment/

In [10]:
import numpy as np 
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False)
df = pd.read_csv('movie_data.csv')
df

Unnamed: 0,review,sentiment
0,This film is an attempt to present Jared Diamo...,0
1,Whereas the movie was beautifully shot and rea...,0
2,"The comparison is perhaps unfair, but inevitab...",0
3,It's hard and I didn't expect it... But it's r...,0
4,"Elvira Mistress of the Dark is just that, a ca...",1
...,...,...
49995,I own Ralph Bakshis forgotten masterpiece Fire...,1
49996,Jason Bourne sits in a dusty room in with bloo...,1
49997,"Two college buddies - one an uptight nerd, the...",1
49998,My school's drama club will be putting this sh...,1


## Bag of words model

In [15]:
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [16]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


## Assessing word relevancy via term frequency-inverse document  frequency

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                          norm='l2',
                          smooth_idf=True)

np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


## Cleaning text data

In [20]:
df.loc[10, 'review']

"Saw it at the Philadelphia Gay and Lesbian Film Fest.<br /><br />What can I say? Against my better judgment, I liked it, but it seemed to me that that acting was a little...weak (mostly I noticed this from the family of the teen boy). I mean, the script wasn't stellar to begin with, but the actors didn't make me believe the relationships.<br /><br />The plot is also predictable.<br /><br />Nonethelss, I liked it. The characters are likable, and the plot is not challenging or upsetting. It's sweet, the characters care about each other, and I don't count it as fifty minutes ill-spent. <br /><br />But I don't recommend it."

In [22]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) +' '.join(emoticons).replace('-', ''))
    return text

In [23]:
preprocessor(df.loc[10, 'review'])

'saw it at the philadelphia gay and lesbian film fest what can i say against my better judgment i liked it but it seemed to me that that acting was a little weak mostly i noticed this from the family of the teen boy i mean the script wasn t stellar to begin with but the actors didn t make me believe the relationships the plot is also predictable nonethelss i liked it the characters are likable and the plot is not challenging or upsetting it s sweet the characters care about each other and i don t count it as fifty minutes ill spent but i don t recommend it '

In [24]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [25]:
df['review'] = df['review'].apply(preprocessor)

## Processing documents into tokens

In [26]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [28]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [30]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

## Training a logistic regression model for document classification

In [31]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
small_param_grid = [
    {
         'vect__ngram_range': [(1, 1)],
         'vect__stop_words': [None],
         'vect__tokenizer': [tokenizer, tokenizer_porter],
         'clf__penalty': ['l2'],
         'clf__C': [1.0, 10.0]
     },
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf':[False],
        'vect__norm':[None],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    },
 ]
lr_tfidf = Pipeline([
                        ('vect', tfidf),
                        ('clf', LogisticRegression(solver='liblinear'))
                    ])
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid,
                           scoring='accuracy', cv=5,
                           verbose=2, n_jobs=5)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


10 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aksha\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aksha\anaconda3\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Aksha\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 653, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\

0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,"[{'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'], 'vect__ngram_range': [(1, ...)], 'vect__stop_words': [None], ...}, {'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'], 'vect__ngram_range': [(1, ...)], 'vect__norm': [None], ...}]"
,scoring,'accuracy'
,n_jobs,5
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function tok...002B4BC693560>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100
