### Text classification 

** Here we have movie review dataset with two classes(neg and pos)** 

In [2]:
import numpy as np
import re
import pickle
#import nltk
#from nltk.corpus import stopwords
from sklearn.datasets import load_files
import pandas as pd
#load_files:Load text files with categories as subfolder names.

### Load datasets

In [3]:
train = load_files('dataset/train',encoding='utf-8')

In [4]:
len(train.data)

1000

In [5]:
type(train.data)
#train.data[0]

list

In [6]:
train.target_names

['neg', 'pos']

In [7]:
np.bincount(train.target)

array([500, 500], dtype=int64)

In [8]:
X_train = train.data
y_train = train.target

In [9]:
len(X_train)

1000

In [10]:
test = load_files('dataset/test/',encoding='utf-8')

In [11]:
X_test = test.data
y_test = test.target

In [12]:
len(X_test)

200

### clean data

In [13]:
train.data[2]

'I almost called HBO and demanded my money back for the month just because they\'ve been airing this movie. I can just see the movie execs sitting around going, "Okay, we need to come up with something that\'s just like Home Alone, only we\'ll add a bunch of cash for the kid, hire cut-rate actors, and oh yeah, we\'ll make it a lot less funny!"<br /><br />Okay, maybe not the last part, but that\'s basically what you\'ve got here. Not even worth seeing if someone else rents it. And as a movie for kids? Forget it. I wouldn\'t let my kids see this, not necessarily because of bad-taste jokes, but because I wouldn\'t want them to say, "What were you thinking showing us that lame piece of garbage, Dad?!?!"'

In [14]:
def clean(x):
    #x=re.sub(r'\W',' ',x)
    #x = re.sub(r'[^a-zA-Z]',' ',x)
    
    #to remove html tags
    x = re.sub(r'<.*>', '', x)#? for capturing opening tag of html document

    
    #to remove everything except alpha
    x = re.sub(r'[^a-zA-Z]',' ',x)
    
      
    x = re.sub(r'\s+',' ',x)          #remove extra space's
    return x.lower()
    
#\W:matches any non-alphanumeric character; 
#this is equivalent to the set [^a-zA-Z0-9_].

In [15]:
clean('I hello  ..!  a 123#hi john <html> </>')

'i hello a hi john '

In [16]:
df = pd.DataFrame(X_train,columns=['review'])
df['target'] = y_train
df.head()

Unnamed: 0,review,target
0,I am a huge John Denver fan. I have a large co...,1
1,I just read the plot summary and it is the wor...,1
2,I almost called HBO and demanded my money back...,0
3,"Like his earlier film, ""In a Glass Cage"", Agus...",1
4,There are few films that leave me with the fee...,1


In [17]:
df['review']=df.review.apply(clean)

In [18]:
df.head()

Unnamed: 0,review,target
0,i am a huge john denver fan i have a large col...,1
1,i just read the plot summary and it is the wor...,1
2,i almost called hbo and demanded my money back...,0
3,like his earlier film in a glass cage agust vi...,1
4,there are few films that leave me with the fee...,1


### Convert text into numeric

In [19]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [20]:
cv = TfidfVectorizer(min_df=10,max_df=.6,
                     stop_words='english')

#min_df=10:exclude any word that comes in 10 or less than 10 documents
#max_df=.6:excude any word that comes more than 60% of the documents,

In [21]:
X_new = cv.fit_transform(df.review.values).toarray()

In [22]:
#cv.get_feature_names()

In [23]:
X_new.shape

(1000, 1040)

In [24]:
#cv.get_feature_names()

In [25]:
### use tfidf vectorizor

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [27]:
nb = MultinomialNB()
nb.fit(X_new,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
log = LogisticRegression()
log.fit(X_new,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
log.score(X_new,y_train)

0.94

In [30]:
nb.score(X_new,y_train)

0.916

In [31]:
tree = DecisionTreeClassifier()
tree.fit(X_new,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [32]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_new,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
tree.score(X_new,y_train)

0.999

In [34]:
rf.score(X_new,y_train)

0.999

In [35]:
sv = SVC(kernel='linear',C=.1)
sv.fit(X_new,y_train)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [36]:
sv.score(X_new,y_train)

0.856

### prediction on testing data

In [37]:
df_test = pd.DataFrame(X_test,columns=['review'])
df_test['target'] = y_test

In [38]:
df_test.head()

Unnamed: 0,review,target
0,"Formulaic slasher film, only this one stars th...",0
1,"Yes, I am a romantic of sorts who likes musica...",1
2,I went to an advance screening of this movie t...,1
3,Four things intrigued me as to this film - fir...,0
4,Our family (and the entire sold out sneak prev...,1


In [39]:
df_test['review'] = df_test.review.apply(clean)

In [40]:
X_test_new = cv.transform(df_test.review.values).toarray()

In [41]:
X_test_new.shape

(200, 1040)

In [42]:
log.score(X_test_new,y_test)

0.815

In [43]:
nb.score(X_test_new,y_test)

0.8

In [44]:
tree.score(X_test_new,y_test)

0.675

In [45]:
rf.score(X_test_new,y_test)

0.775

In [46]:
sv.score(X_test_new,y_test)

0.77

### find best value of C parameter in Logistic Regression

In [47]:
for i in [.001,.01,.1,10,100]:
    log1 = LogisticRegression(C=i)
    print('when C:',i)
    print('train:',log1.fit(X_new,y_train).score(X_new,y_train))
    print('test:',log1.score(X_test_new,y_test))
    print()

when C: 0.001
train: 0.875
test: 0.795

when C: 0.01
train: 0.876
test: 0.8

when C: 0.1
train: 0.885
test: 0.795

when C: 10
train: 0.993
test: 0.79

when C: 100
train: 0.999
test: 0.795





In [48]:
log2 = LogisticRegression(C=.1)
print(log2.fit(X_new,y_train).score(X_new,y_train))
print(log2.score(X_test_new,y_test))

0.885
0.795




In [49]:
test=["I do not like this movie","I would not recommend this movie",
     "I hate this movie","I love this movie"]

In [50]:
f=[]
for i in test:
    s=clean(i)
    f.append(s)

In [51]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [52]:
t=cv.transform(f).toarray()
t

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:

t.shape

(4, 1040)

In [54]:
log2.predict(t)

array([0, 0, 0, 1])

In [55]:
nb.predict(t)

array([0, 1, 1, 1])

In [56]:
tree.predict(t)

array([1, 0, 0, 1])

In [57]:
rf.predict(t)

array([1, 0, 0, 0])

In [58]:
sv.predict(t)

array([0, 0, 0, 1])

### Lets work on 50000 samples

In [62]:
df = pd.read_csv('movie_reviews.csv')

In [63]:
df.head()

Unnamed: 0,review,sentiment
0,This is one of those unfortunate films that su...,1
1,Okay maybe it was because I happen to be in Ya...,1
2,"Although I love this movie, I can barely watch...",1
3,"A man arrives in a strange, beautiful, sterile...",1
4,I'm sitting around going through movie listing...,1


In [64]:
df.shape

(50000, 2)

In [65]:
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [66]:
df['review'] = df.review.apply(clean)

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
x_train,x_test,y_train,y_test = train_test_split(df.review.values,
                                                 df.sentiment.values,
                                                test_size=10000,
                                                random_state=10)

In [69]:
x_train.shape

(40000,)

In [70]:
np.bincount(y_test)

array([4962, 5038], dtype=int64)

In [71]:
cv1 = TfidfVectorizer(stop_words='english')


In [72]:
#x_new=cv1.fit_transform(x_train).toarray()
#x_test_new = cv1.transform(x_test).toarray()

x_new=cv1.fit_transform(x_train)
x_test_new = cv1.transform(x_test)

In [73]:
x_new[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [74]:
x_test_new.shape

(10000, 72293)

In [75]:
log3 = LogisticRegression()
log3.fit(x_new,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [76]:
log3.score(x_new,y_train)

0.911475

In [77]:
log3.score(x_test_new,y_test)

0.8596

In [78]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [79]:
t=cv1.transform(f)

In [80]:
t.shape

(4, 72293)

In [81]:
log3.predict(t)

array([0, 1, 0, 1], dtype=int64)

In [82]:
nb = MultinomialNB()
nb.fit(x_new,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [83]:
nb.predict(t)

array([0, 1, 0, 1], dtype=int64)

### save the model

In [84]:
with open('review_model.pkl','wb') as f1:
    pickle.dump(log3,f1)

In [85]:
### save the vectorizer
with open('cv1.pkl','wb') as f1:
    pickle.dump(cv1,f1)

In [86]:
### load model

In [87]:
with open('review_model.pkl','rb') as f1:
    clf=pickle.load(f1)

In [88]:
clf.predict(t)

array([0, 1, 0, 1], dtype=int64)