In [24]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
import regex as re
import contractions
import string

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Load the raw reviews; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='movie.csv')
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [26]:
def lower_text(review):
    """Return a lowercase copy of ``review``.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        ``review`` with every cased character lowercased.
    """
    lowered = review.lower()
    return lowered

In [27]:
def initial_clean(review):
    """Strip markup and noisy punctuation from a single review string.

    Steps, in order:
      1. delete the characters ``( ) " , ; : *``;
      2. turn ``-`` and ``/`` into a space;
      3. replace the literal ``i.e.`` (case-sensitive) with a space;
      4. replace anything shaped like an HTML tag (``<...>``) with a space;
      5. replace runs of two or more dots with a space and collapse runs of
         ``?`` or ``!`` to a single character.

    Whitespace is not normalised here, so the result may contain runs of
    spaces.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The cleaned text.
    """
    # One pass for every single-character rule: characters in the third
    # argument are deleted, '-' and '/' are mapped to a space.
    # NOTE(review): the original chain also called replace('', ''), which
    # leaves a string unchanged; the character it was meant to strip was
    # probably lost -- confirm against the original notebook.
    char_rules = str.maketrans('-/', '  ', '()",;:*')
    new_review = review.translate(char_rules)
    new_review = new_review.replace('i.e.', ' ')
    # '/' has already become a space, so '<br />' reaches this point as
    # '<br  >' and is still matched by the tag pattern.
    new_review = re.sub(r'<[^>]*>', ' ', new_review)
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling (non-raw '\.' is an invalid escape sequence).
    new_review = re.sub(r'\.{2,}', ' ', new_review)
    new_review = re.sub(r'\?{2,}', '?', new_review)
    new_review = re.sub(r'!{2,}', '!', new_review)
    return new_review

In [28]:
def fix_contractions(review):
    """Expand contractions in ``review`` one token at a time.

    The text is split into sentences with ``sent_tokenize``; each sentence
    is split on single spaces and every resulting token is passed through
    ``contractions.fix``. All tokens from all sentences are then joined
    back together with single spaces.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The space-joined, contraction-expanded tokens.
    """
    expanded_tokens = [
        contractions.fix(token)
        for sentence in sent_tokenize(review)
        for token in sentence.split(' ')
    ]
    return " ".join(expanded_tokens)

In [29]:
def final_clean(review):
    """Remove possessives, punctuation and numbers from a review string.

    Steps, in order:
      1. drop a possessive ``'s`` that ends a word (``john's`` -> ``john``);
      2. delete every character in ``string.punctuation``;
      3. delete digit runs followed by ``st``/``nd``/``rd``/``th``/``s``
         (``2nd``, ``1960s``);
      4. delete any remaining digit runs.

    Whitespace is not normalised here, so removed tokens leave their
    surrounding spaces behind.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The cleaned text.
    """
    # Anchor on a word boundary: a plain replace("'s", '') also swallows the
    # opening quote plus first letter of quoted words ("'scary'" -> "cary'").
    new_review = re.sub(r"'s\b", '', review)
    # Delete all ASCII punctuation in one pass.
    new_review = new_review.translate(str.maketrans('', '', string.punctuation))
    # Ordinals / decades must go before the bare-digit rule, otherwise their
    # suffix letters would be left behind ("2nd" -> "nd").
    new_review = re.sub(r'\b\d+(st|nd|rd|th|s)\b', '', new_review)
    new_review = re.sub(r'\d+', '', new_review)
    return new_review

In [30]:
def remove_extra_space(review):
    """Collapse every run of whitespace in ``review`` to a single space.

    Leading and trailing whitespace is removed. Tabs and newlines count as
    whitespace too, so a token that consists only of such characters cannot
    leave an empty entry (and therefore a double space) in the result.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The words of ``review`` separated by exactly one space.
    """
    # str.split() with no argument splits on any whitespace run and never
    # yields empty strings.
    return " ".join(review.split())

In [31]:
# English stopwords stored as dict keys so membership tests are hash lookups;
# the value 1 is a placeholder and is never read by remove_stopwords.
stopwords_dict = dict.fromkeys(stopwords.words('english'), 1)

def remove_stopwords(review):
    """Drop every token of ``review`` that is a key of ``stopwords_dict``.

    The text is split on single spaces and the surviving tokens are joined
    back with single spaces. Matching is exact and case-sensitive. Unlike a
    token-by-token ``+= word + " "`` build-up, the result carries no trailing
    space.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The remaining tokens joined by a single space; ``""`` when nothing
        survives.
    """
    kept_words = [word for word in review.split(' ') if word not in stopwords_dict]
    return " ".join(kept_words)

In [32]:
# Single shared lemmatizer instance reused for every review.
lemma = WordNetLemmatizer()

def lemmatize_review(review):
    """Lemmatize each space-separated token of ``review``.

    Every token produced by splitting on single spaces is passed to
    ``lemma.lemmatize`` with its default arguments; the results are joined
    with single spaces and surrounding whitespace is stripped.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The lemmatized tokens joined by a single space.
    """
    lemmatized_tokens = (lemma.lemmatize(token) for token in review.split(' '))
    return " ".join(lemmatized_tokens).strip()

In [33]:
# Run the cleaning stages over the review text in a fixed order; each stage
# receives the output of the previous one. The 'text' column is overwritten
# with the final result.
_cleaning_steps = (
    lower_text,
    initial_clean,
    fix_contractions,
    final_clean,
    remove_extra_space,
    remove_stopwords,
    lemmatize_review,
)

_cleaned_text = df['text']
for _step in _cleaning_steps:
    _cleaned_text = _cleaned_text.apply(_step)

df['text'] = _cleaned_text

In [34]:
# Preview the first five rows after the 'text' column has been cleaned.
df.head(n=5)

Unnamed: 0,text,label
0,grew b watching loving thunderbird mate school...,0
1,put movie dvd player sat coke chip expectation...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movie bore...,0
4,die hard dad army fan nothing ever change got ...,1


In [35]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='clean.csv', index=False)