In [19]:
# --- Imports: standard library, then third-party -------------------------
import itertools
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

# --- NLTK resources -------------------------------------------------------
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap.
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps nltk.word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
# Needed by stopwords.words('english'); without it a fresh environment raises
# LookupError the first time stop words are requested.
nltk.download('stopwords')

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /Users/ali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/ali/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ali/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/ali/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [2]:
# Porter stemmer instance.
stemmer = PorterStemmer()
# WordNet lemmatizer instance; stem_lem_words calls it.
lemmatizer = WordNetLemmatizer()

1. Read the data using pandas 
2. Keep text and sentiment and drop the other columns
3. Tokenize
4. Remove punctuation
5. **Remove numbers**
6. Remove words of 1 or 2 characters
7. Remove stop words
8. Convert to lower case
9. Use stemmer
10. Create Bag of Words
11. Compute TF
12. Put a threshold for TF and retain only those words above the threshold TF
13. Finally, you will get the Term Document Matrix. This is the set of independent variables, i.e. the X variables
14. Use the sentiment column as the label: y
15. X & y will be a supervised classification problem
16. Use any classification algorithm like random forest for classification.

## Read the data using pandas

In [4]:
# How many labelled reviews to draw for this experiment.
n_samples = 2_000

# Load the full table, then discard the rows marked as unsupervised.
data = pd.read_csv('data/imdb_master.csv')
data = data.loc[data['label'] != 'unsup']

# Reproducible random subset used by the rest of the notebook.
df = data.sample(n=n_samples, random_state=0)

## Keep text and sentiment and drop the other columns

In [5]:
# Keep only the review text and its label.
# errors='ignore' plus a name-based rename make this cell safe to re-run: the
# original drop raised KeyError on a second execution, and assigning
# df.columns positionally would mislabel columns if their order ever changed.
df = (
    df.drop(columns=['Unnamed: 0', 'file', 'type'], errors='ignore')
      .rename(columns={'label': 'sentiment'})
)
assert list(df.columns) == ['review', 'sentiment'], (
    f"unexpected columns after selection: {list(df.columns)}"
)

# Defensive filter for unlabelled rows; .copy() gives an independent frame so
# that adding columns later does not write into a slice of another frame.
df = df[df.sentiment != 'unsup'].copy()

# Class balance of the sample.
df.sentiment.value_counts()

neg    1015
pos     985
Name: sentiment, dtype: int64

## Tokenize

In [6]:
def tokenize_text(text):
    """Split ``text`` into word tokens with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list
        The tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(text)

## Remove punctuation

In [7]:
def normailze_text(text):
    """Lowercase ``text`` and strip everything except words of 3+ letters.

    Steps, in order:

    1. Lowercase.
    2. Squeeze every run of one repeated character down to at most two
       (``"soooo"`` -> ``"soo"``).
    3. Replace every character that is not an ASCII letter or digit with a
       space (this covers punctuation and newlines).
    4. Delete digits. They are removed, not replaced, so the letters on
       either side are joined (``"se7en"`` -> ``"seen"``).
    5. Blank out words of one or two characters.

    Words are only blanked, never dropped from the sequence, so the result
    can contain runs of consecutive spaces; downstream tokenization is
    expected to ignore them.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalized text.
    """
    text = text.lower()

    # Cap each run of identical characters at two.
    text = ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    # Punctuation, newlines and any non-ASCII character all become spaces.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    # After the pass above only letters, digits and spaces are left, so this
    # removes exactly the digits. No separate newline removal is needed.
    text = re.sub(r'[^A-Za-z\s]', r'', text)

    # Every word is now letters only, so a length check is enough to find the
    # one- and two-character words.
    return ' '.join(word if len(word) > 2 else '' for word in text.split(' '))

In [8]:
# Stop-word sets keyed by language. Filled on first use so the corpus is read
# once per language instead of once per document.
_STOPWORD_SETS = {}


def remove_stopwords(tokens, language='english'):
    """Return ``tokens`` without the NLTK stop words for ``language``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Matching is exact, so tokens should be lowercased
        beforehand.
    language : str, optional
        Name of the NLTK stop-word list to use. Defaults to ``'english'``.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    stop_words = _STOPWORD_SETS.get(language)
    if stop_words is None:
        # A frozenset gives constant-time membership tests, where the list
        # returned by stopwords.words() would be scanned for every token.
        stop_words = frozenset(stopwords.words(language))
        _STOPWORD_SETS[language] = stop_words

    return [word for word in tokens if word not in stop_words]

## Lemmatize (stemming is currently disabled)

In [9]:
def stem_lem_words(tokens):
    """Lemmatize every token as a verb with the shared WordNet lemmatizer.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list
        One lemma per input token, in the same order.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word, pos='v'))
    return lemmas

In [10]:
# Run each review through the preprocessing steps:
# normalize -> tokenize -> drop stop words -> lemmatize.
df['result'] = (
    df['review']
    .apply(normailze_text)
    .apply(tokenize_text)
    .apply(remove_stopwords)
    .apply(stem_lem_words)
)


In [11]:
# Show the frame, now including the `result` column of processed tokens.
df

Unnamed: 0,review,sentiment,result
11841,Al Pacino was once an actor capable of making ...,neg,"[pacino, actor, capable, make, role, work, wit..."
19602,"If you read the book by Carl Hiaasen, the movi...",pos,"[read, book, carl, hiaasen, movie, follow, pre..."
45519,This movie is sort of a Carrie meets Heavy Met...,pos,"[movie, sort, carrie, meet, heavy, metal, high..."
25747,This movie was like a bad indie with A-list ta...,neg,"[movie, like, bad, indie, list, talent, plot, ..."
42642,"In the '70s, Charlton Heston starred in sci-fi...",pos,"[charlton, heston, star, sci, flick, vary, qua..."
...,...,...,...
6087,Just had the misfortune to see this truly awfu...,neg,"[misfortune, see, truly, awful, film, think, s..."
27269,To confess having fantasies about Brad Pitt is...,neg,"[confess, fantasy, brad, pitt, pretty, tough, ..."
455,"Where do I begin, its one of the most frustrat...",neg,"[begin, one, frustrate, movies, see, make, lot..."
11984,"I like bad films, but this thing is a steaming...",neg,"[like, bad, film, thing, steam, heap, shaky, c..."


In [15]:
def listToString(s):
    """Join the strings in ``s`` into one string separated by single spaces.

    Parameters
    ----------
    s : iterable of str
        The pieces to join.

    Returns
    -------
    str
        The joined string; empty when ``s`` is empty.
    """
    separator = " "
    joined = separator.join(s)
    return joined

In [20]:
# Bag of words: re-join each token list into one string, then let
# CountVectorizer learn the vocabulary and count term occurrences.
vectorizer = CountVectorizer()
bowmatrix = vectorizer.fit_transform(
    df["result"].map(listToString)
)

In [21]:
# Term-document matrix as a dense DataFrame: one row per review, one column
# per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
new_df = pd.DataFrame(
    bowmatrix.toarray(),
    columns=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    ),
)

In [22]:
# Show the term counts: one row per review, one column per vocabulary term.
new_df

Unnamed: 0,aadha,aah,aaliyah,aames,aamir,aap,aapke,aardman,aargh,aaron,...,zonked,zoo,zoog,zoolander,zoom,zoot,zora,zucker,zuckerman,zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Fit a PCA that keeps all components on the term counts, then show the
# cumulative percentage of variance explained as components are added.
pca_6 = PCA().fit(new_df)
(pca_6.explained_variance_ratio_ * 100).cumsum()

array([  5.382434  ,   9.23344502,  11.04729668, ..., 100.        ,
       100.        , 100.        ])