## importing the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters
  from .murmurhash import murmurhash3_32
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast im

## data preprocessing

In [2]:
df = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
df.dropna(inplace=True)
df["Postive rated"]=np.where(df['sentiment']>0,1,0)
#df.loc[df["Postive rated"] == 1,"Postive rated"]='good feedback'
#df.loc[df["Postive rated"] == 0,"Postive rated"]='bad feedback'
X_train, X_test, y_train, y_test = train_test_split(df['review'],df['Postive rated'],random_state=0)
df.head(10)

Unnamed: 0,id,sentiment,review,Postive rated
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",1
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",1
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",0
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",0
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",1
5,"""8196_8""",1,"""I dont know why people think this is such a b...",1
6,"""7166_2""",0,"""This movie could have been very good, but com...",0
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm...",0
8,"""319_1""",0,"""A friend of mine bought this film for £1, and...",0
9,"""8713_10""",1,"""<br /><br />This movie is full of references....",1


## converting text into number's using CountVectorizer

In [16]:
vect=CountVectorizer(min_df=5,ngram_range=(1,2)).fit(X_train)
X_train_vetorised=vect.transform(X_train)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## model fitting

In [4]:
model=LogisticRegression()

model.fit(X_train_vetorised,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## model prediction

In [15]:
predictions=model.predict(vect.transform(X_test))

## Accuracy of the model

In [6]:
print ("AUC:",roc_auc_score(y_test,predictions))

('AUC:', 0.8982674883317247)


## coefficient or weight 

In [9]:
feature_name=np.array(vect.get_feature_names())

sort_coeff=model.coef_[0].argsort()

print ("small coeff : {}",format(feature_name[sort_coeff[:10]]))

print ("large coeff : {}",format(feature_name[sort_coeff[:-12:-1]]))

('small coeff : {}', "[u'worst' u'awful' u'waste' u'boring' u'disappointment' u'poor'\n u'disappointing' u'worse' u'poorly' u'the worst']")
('large coeff : {}', "[u'excellent' u'perfect' u'wonderful' u'enjoyable' u'surprisingly'\n u'amazing' u'superb' u'believable' u'today' u'great' u'enjoyed']")


## user manual testing

In [8]:
testing=raw_input("Enter the sentence for testing: ")
y_hat = (model.predict(vect.transform([testing])))
if y_hat == [1]:
    print "Feedback Positive: GOOD MOVIE 😍😍😍😍 👏👌 "
else:
    print "Feedback Negative: BAD MOVIE 😟😏😠😤 👎👎👎"

Enter the sentence for testing: Making a spoof needs lot of courage ,confidence and creativity . Fantastic Initiative by Director CS Amuthan & Team 
Feedback Positive: GOOD MOVIE 😍😍😍😍 👏👌 
