In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews2.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [2]:
df.isnull().sum()

label      0
review    20
dtype: int64

In [3]:
#Check for blanks
blanks=[]

for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
            

In [4]:
len(blanks)

0

In [5]:
#remove null values
df=df.dropna()

In [6]:
df['label'].value_counts()

pos    2990
neg    2990
Name: label, dtype: int64

In [7]:
#Text cleaning
import string
import re

In [22]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [23]:
stop_words=set(stopwords.words('english'))
stop_words.discard('not')


In [27]:
wordnet=WordNetLemmatizer()

In [96]:
#This function converts to lower-case, removes square bracket, removes numbers and punctuation
def text_clean(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('<[^>]+>','',text)   #remove html tags like <br>
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    #text=text.split()
    #text = [wordnet.lemmatize(word) for word in text if not word in stop_words]
    #text = ' '.join(text)
    return text
    

In [97]:
df['cleaned_review']=pd.DataFrame(df['review'].apply(lambda x:text_clean(x)))

In [87]:
df.iloc[1]['review']

"A warm, touching movie that has a fantasy-like quality.<br /><br />Ellen Burstyn is, as always, superb.<br /><br />Samantha Mathis has given many great performances, but there is just something about this one will haunt your memory.<br /><br />Most of all, you've got to see this amazing 5-yr. old, Jodelle Ferland. I was so captivated by her presence, I had to buy the movie so I could watch her again and again. She is a miracle of God's creation.<br /><br />Judging by the high IMDB rating, I'm not the only one who was mesmerized by this young actress."

In [98]:
df.iloc[1]['cleaned_review']

'a warm touching movie that has a fantasylike qualityellen burstyn is as always superbsamantha mathis has given many great performances but there is just something about this one will haunt your memorymost of all youve got to see this amazing  old jodelle ferland i was so captivated by her presence i had to buy the movie so i could watch her again and again she is a miracle of gods creationjudging by the high imdb rating im not the only one who was mesmerized by this young actress'

In [99]:
df.head()

Unnamed: 0,label,review,cleaned_review
0,pos,I loved this movie and will watch it again. Or...,i loved this movie and will watch it again ori...
1,pos,"A warm, touching movie that has a fantasy-like...",a warm touching movie that has a fantasylike q...
2,pos,I was not expecting the powerful filmmaking ex...,i was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha...",this socalled documentary tries to tell that u...
4,pos,This show has been my escape from reality for ...,this show has been my escape from reality for ...


In [100]:
from sklearn.model_selection import train_test_split

X = df['cleaned_review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [101]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [102]:
text_clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

In [103]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [104]:
predictions = text_clf.predict(X_test)

In [105]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,predictions))

[[900  91]
 [ 61 922]]


In [84]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.94      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [107]:
re.sub('\w*\d\w*','','Hellp how 34 y45')

'Hellp how  y'