## Text Classification
### Building a model that will classify movie reviews as positive or negative.

#### Performing imports and loading a dataset into a pandas DataFrame

In [6]:
import numpy as np
import pandas as pd

# Load the tab-separated movie reviews file into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="TextFiles/moviereviews2.tsv",
    sep="\t",
)

#### Checking for missing values

In [8]:
# Show the number of missing values per column *before* removing anything.
# (A bare expression in the middle of a cell is evaluated and thrown away,
# so the count has to be printed explicitly to appear in the output.)
print(df.isnull().sum())

# Remove rows that have no review text. The frame is modified in place
# because the following cells keep working on the same `df` object.
df.dropna(subset=['review'], inplace=True)
df

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...
...,...,...
5995,pos,"Of the three remakes of this plot, I like them..."
5996,neg,Poor Whoopi Goldberg. Imagine her at a friend'...
5997,neg,"Honestly before I watched this movie, I had he..."
5998,pos,This movie is essentially shot on a hand held ...


#### Checking for whitespace strings

In [9]:
# Collect the index labels of reviews that are strings made up solely of
# whitespace (or empty). Non-string entries such as NaN are skipped here.
# Iterating over the 'review' column directly avoids depending on how many
# columns the frame has, and the label appended is the row's own index.
to_drop = [
    index
    for index, review in df['review'].items()
    if isinstance(review, str) and not review.strip()
]

# Remove the blank reviews from the frame in place.
df.drop(to_drop, inplace=True)

#### Removing NaN values

In [10]:
# Discard, in place, every row that still contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

#### Taking a quick look at the `label` column

In [11]:
# Display the label column of the cleaned frame.
df.loc[:, 'label']

0       pos
1       pos
2       pos
3       neg
4       pos
       ... 
5995    pos
5996    neg
5997    neg
5998    pos
5999    pos
Name: label, Length: 5980, dtype: object

#### Splitting the data into train & test sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split the same from run to run. Reviews are the inputs, labels the targets.
x_train, x_test, y_train, y_test = train_test_split(
    df['review'],
    df['label'],
    test_size=0.33,
    random_state=42,
)

#### Building a pipeline to vectorize the data, then training and fitting a model

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer


# Two-step pipeline: TF-IDF vectorization of the raw review text followed by
# a linear support vector classifier, both with their default settings.
pipeline = Pipeline(
    steps=[
        ('tftdf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Fit the vectorizer and the classifier on the training split. Leaving the
# call as the last expression displays the fitted pipeline as the cell output.
pipeline.fit(x_train, y_train)


Pipeline(memory=None,
         steps=[('tftdf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

#### Running predictions and analyzing the results

In [14]:
# Predict a label for every review in the held-out test split.
predictions = pipeline.predict(X=x_test)

In [15]:
from sklearn.metrics import confusion_matrix, classification_report
# Print the confusion matrix followed by the per-class precision/recall/F1
# report, each on its own line(s), comparing predictions to the true labels.
print(
    confusion_matrix(y_true=y_test, y_pred=predictions),
    classification_report(y_true=y_test, y_pred=predictions),
    sep="\n",
)

[[900  91]
 [ 63 920]]
              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [16]:
from sklearn.metrics import accuracy_score
# Print the fraction of test reviews whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.9219858156028369
