# Movie_Review_Classifier_Using_NLP

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

### Reading the data using Pandas

In [2]:
df = pd.read_csv('C:/Users/Abhinav/Desktop/NLP_dataset/moviereviews2.tsv',sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [3]:
len(df)

6000

In [4]:
df.dropna(inplace=True)

In [5]:
len(df)

5980

In [6]:
df.label.value_counts()

neg    2990
pos    2990
Name: label, dtype: int64

In [7]:
df.isnull().sum()

label     0
review    0
dtype: int64

### If any null values or only whitespace present in the data then run this code to remove all null or whitespace review from the database

In [8]:
blank = []
for i,l,rv in df.itertuples():
    if rv.isspace():
        blank.append(i)
blank

[]

In [9]:
df.drop(blank,inplace=True)

In [10]:
len(df)

5980

### Seperating the data into X in Y values

In [11]:
x = df.review
y = df.label

### Split the Data for training and Testing

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=29)

In [13]:
x_train.shape

(4186,)

In [14]:
x_test.shape

(1794,)

### Now Create the classifier

In [15]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

### Training the Classifier

In [16]:
text_clf.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [17]:
y_pred = text_clf.predict(x_test)
y_pred

array(['neg', 'pos', 'pos', ..., 'neg', 'pos', 'neg'], dtype=object)

### Confusion Matrix

In [18]:
pd.DataFrame(confusion_matrix(y_test,y_pred),index=['Pos','Neg'],columns=['Pos','Neg'])

Unnamed: 0,Pos,Neg
Pos,811,56
Neg,47,880


### Classificatition Report

In [19]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         neg       0.95      0.94      0.94       867
         pos       0.94      0.95      0.94       927

    accuracy                           0.94      1794
   macro avg       0.94      0.94      0.94      1794
weighted avg       0.94      0.94      0.94      1794



### Accuracy of the Model

In [20]:
100 * accuracy_score(y_test,y_pred)

94.25863991081383