### Tasks:
- pre-process features in different ways, as in the previous task:  
    - lower case, stemmer (porter, ours)
    - replace (NER)
    - compute TF-IDF / w2v (1, 2, .. n-grams)
    - filter most important terms/reduce vector size
- compare precision, recall accuracy of classifiers for vector sizes
- modify parameters of classifiers
- fold cross validation (using different training/testing) sets
- document and report in a markup cell

###### Project I have done in few lines:
1. In Preprocessing:
    - For stemmer I used both porter and ours. I found ours is good so using only it. 
    - To replace NER and do lemmatization I used spacy and NLTK. I found spacy is good.
    - I did either stemming or lemmatization separately, to find how this preprocessing effects vector size.
    - I computed TF-IDF using TfidfVectorizer.
    - I filterd most important words by removing stopwords,punctuations and using min_df parameter in TfidfVectorizer.
    - I reduced vector size using latent sematic analysis.
2. I compared precision,recall,accuracy,F1 score of both positive and negative vectors for all 4 classifiers
3. I modified parameters of classifiers and did hyperparameter optimization using RandomizedSearchCV and GridSearchCV to find the optimal set of hyperparameters for a given model.
4. I ran data split based on train_test_split multiple times to see the performance each time.  
5. I did Stratified K-Fold cross validation to have different training and test sets and evaluated performance of all 4 classifiers.

In [99]:
import os
import re
import string
import spacy
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import nltk
from nltk.tokenize import word_tokenize
from nltk import ne_chunk, pos_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tree import Tree


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB


from collections import Counter


In [100]:
#our stemmer 
def myStemmerDic(tok, pos, dic):
    stem = tok.lower()

    # Check in the exception dictionary first
    if stem in dic and pos in dic[stem]:
        stem = dic[stem][pos]
    
    elif(pos == "VBG"):                      # Verb forms
        stem = re.sub(r'ing$', '', stem)
    elif(pos == "VBD"):
        stem = re.sub(r'ed$', '', stem)
    elif(pos == "VBN"):
        stem = re.sub(r'ed$', '', stem)
    elif(pos == "VBZ"):
        stem = re.sub(r'(es|s)$', '', stem)
        
    elif(pos == "NNS"):                      # Noun forms
        stem = re.sub(r'(s|ies)$', '', stem)
        
    elif(pos == "JJR"):                      # Adjective forms
        stem = re.sub(r'(er)$', '', stem)               
        
    elif(pos == "JJS"):
        stem = re.sub(r'(est)$', '', stem)
        
    elif(pos == "RBR"):                      # Adverb forms
        stem = re.sub(r'(er)$', '', stem)               
        
    elif(pos == "RBS"):
        stem = re.sub(r'(est)$', '', stem)                
        
    elif(pos == "FW"):                       # Foreign words
        stem = re.sub(r'(o)$', '', stem)     
        
        
    return (tok, stem, pos)

# exception dictionary
dico = {
    "is": {"VBZ": "be"},
    "was": {"VBD": "be"},
    "were": {"VBD": "be"},
    "been": {"VBN": "be"},
    "'m": {"VBP": "am"},
    "'ve": {"VBP": "have"},
    "n't": {"RB": "not"},
    "met": {"VBD": "meet"},
    "said": {"VBD": "say"},
    "'ll": {"MD": "be"},        #modal
    "'s":{"POS": "be"},         #Possessive ending
    "thought": {"VBD": "think"},
    "has": {"VBZ": "have"},
    "him": {"PRP": "he"},
    "his": {"PRP$": "he"},
    "children": {"NNS": "child"},
    "mice": {"NNS": "mouse"},
    "goes" : {"VBD" : "go"},
    "died": {"VBD": "die"},
    "had": {"VBD": "have"},
    "feet": {"NNS": "foot"},
    "teeth": {"NNS": "tooth"},
    "geese": {"NNS": "goose"},
    "capturing": {"VBG": "capture"},
    "sacrificing": {"VBG": "sacrifice"},
    "believing": {"VBG": "believe"},
    "took": {"VBD": "take"},
    "saw": {"VBD": "see"},
    "gone": {"VBN": "go"},
    "ate": {"VBD": "eat"},
    "drank": {"VBD": "drink"},
    "loved": {"VBN": "love"},
    
    # Archaic forms and irregulars
    "thou": {"PRP": "you"},
    "art": {"VBP": "are"},
    "hath": {"VBZ": "have"},
    "doth": {"VBZ": "do"},
    "thy": {"PRP$": "your"},
    "thee": {"PRP": "you"},


    # Irregular plurals
    "brethren": {"NNS": "brother"},
    "oxen": {"NNS": "ox"},
    "indices": {"NNS": "index"},
    "appendices": {"NNS": "appendix"}
       
}


In [101]:
#stemming 
def stemming(sent):
    
    #stemming with our stemmer
    gstem = [myStemmerDic(tok, tag, dico) for tok, tag in nltk.pos_tag(sent)]
    stems_ours = [stem for tok, stem, tag in gstem]
    
    #stemming with porter stemmer 
    #porter   = PorterStemmer()
    #stems_porter = [porter.stem(w) for w in sent]
   
    #return stems_porter
    return stems_ours

In [102]:
#lemmatization using NLTK

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    


def lemmatize_nltk(words):
    lemmas =[]
    lemmatizer = WordNetLemmatizer()
    for word, tag in nltk.pos_tag(words):
        wntag = get_wordnet_pos(tag)
        if wntag is None:    # no supply of tag in case of None
            lemma = lemmatizer.lemmatize(word) 
        else:
            lemma = lemmatizer.lemmatize(word, pos=wntag) 
        lemmas.append(lemma)

    return lemmas

In [103]:
#NER using NLTK

def NER_NLTK(word):
    
    tokens = word_tokenize(word)
    tagged_tokens = pos_tag(tokens)
    chunked_ner = ne_chunk(tagged_tokens)            #apply ner chunking

    processed_tokens = []

    for chunk in chunked_ner:
        if isinstance(chunk, Tree):
            ner_label = chunk.label()  # Get the named entity label
            processed_tokens.extend([ner_label] * len(chunk))  # Extend the list with the label, repeated for each token in the chunk
        else:
            word = chunk[0]   # If it's not an NE, just append the word,Chunk format is (word, POS_tag)
            processed_tokens.append(word)
    return processed_tokens



In [104]:
#spacy for NER and lemmatization


def spacy_ner(words,flag=0):
    # Load the spaCy model
    nlp = spacy.load("en_core_web_sm")
    
    docs = ' '.join([items for items in words])

    # Process the text (join your tokens into a single string)
    doc = nlp(docs)

    processed_tokens = []
    if flag == 0:
        for token in doc:
            if token.ent_type_:
                processed_tokens.append(token.ent_type_)  #replace with NER 
            else:
                processed_tokens.append(token.lemma_)  # replace with lemma for non-entity tokens
    if flag == 1:
        for token in doc:
            if token.ent_type_:
                processed_tokens.append(token.ent_type_)  #replace with NER 
            else:
                processed_tokens.append(token.text)  # replace with text for non-entity tokens
    return processed_tokens


In [105]:
#filter most important terms

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    
    # Delete function words|Remove stop words
    filtered_tokens = [word for word in words if word not in stop_words]

    return filtered_tokens



def remove_punctuations(texts):
    
    text1 = ' '.join(texts)
    tokens = word_tokenize(text1)
    
    # Filter out tokens that are entirely punctuation
    cleaned_tokens = " ".join([token for token in tokens if token not in string.punctuation])
    
    #removing custom punctuations not removed previously
    cleaned_tokens = re.sub(("[-`]")," ",cleaned_tokens)
    
    token = word_tokenize(cleaned_tokens)
    
    #removing br 
    cleaned_texts = [text for text in token if text != "br"]

    return cleaned_texts

In [106]:
# resize vectors by using latent sematic analysis

def resizeVectors(M, size = 100):   
    svd = TruncatedSVD(n_components = size)
    return svd.fit_transform(M)

In [107]:
#preprocessing 

def preprocessSent(sent):
    
    #using spacy for NER and lemmatization
    data_lowercase = sent.lower()             #converting to lower case
    
    tokens = word_tokenize(data_lowercase)     
    
    filtered_data1 = remove_stopwords(tokens)     #removing stopwords
    
    ner = spacy_ner(filtered_data1,0)      #performing lemmatization and NER using spacy
    
    #ner = spacy_ner(filtered_data1,1)      #performing just NER using spacy

    #stemmed_data = stemming(ner)       #stemming 
    
    #filtered_data2 = remove_punctuations(stemmed_data)    #removing punctuations
    
    filtered_data2 = remove_punctuations(ner)    #removing punctuations
    
    processed_text = ' '.join(filtered_data2)
    
    return processed_text
    """ 
    #using NLTK for NER and lemmatization
    replace_NER = NER_NLTK(sent)       #performing NER using NLTK
    
    filtered_docs = ' '.join([items for items in replace_NER])
    
    #lemmatized_data = lemmatize_nltk(replace_NER)    #performing lemmatization using NLTK
    
    #filtered_docs = ' '.join([items for items in lemmatized_data])
    
    data_lowercase = filtered_docs.lower()
    
    tokens = word_tokenize(data_lowercase)
    
    filtered_data1 = remove_stopwords(tokens)     #removing stopwords
    
    stemmed_data = stemming(filtered_data1)      #stemming
    
    filtered_data2 = remove_punctuations(stemmed_data)     #removing punctuations
    
    #filtered_data2 = remove_punctuations(filtered_data1)     #removing punctuations
     
    processed_text = ' '.join(filtered_data2)
    
    return processed_text
    """

In [25]:
def ReadSourceTok(dic, n=100,  tag = False, verbose = 0) :
    D = {}
    i = 0

    # Read sorted file names
    for f in sorted(Path(dic).iterdir()):
        if(verbose == 1): print(f.resolve())
        if (i == n): break
        i += 1
        
        with f.open('r', encoding='utf-8') as fhin: data = fhin.read()
            
        # get the file basename as index for document
        b = os.path.basename(f).split(".")[0]
        
        # document is a string of tokens
        D[b] = preprocessSent(data)
    return D

# number of docs to read
nDocs = 100

#initialize 
D1 = N1 = P1 = {}

# Read positive documents 
P1 = ReadSourceTok("/data/critt/shared/resources/aclImdb/test/pos/", n=nDocs, tag=False)

# Read negative documents 
N1 = ReadSourceTok("/data/critt/shared/resources/aclImdb/test/neg/", n=nDocs, tag=False)

# join the Pos and the Neg corpus
D1 = {**P1,**N1}

print(f"#Pos:{len(P1)} #words in Docs:{len([w for d in P1.keys() for s in P1[d] for w in s])}")
print(f"#Neg:{len(N1)} #words in Docs:{len([w for d in N1.keys() for s in N1[d] for w in s])}")
print(f"#Sum:{len(D1)} #words in Docs:{len([w for d in D1.keys() for s in D1[d] for w in s])}")


# tfidf vector 1 - 3 grams
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2)  

# Learn vocabulary and idf from P1 and N1 documents
tfidf.fit(D1.values())     #produces length of vector

print(f"Length of Tfidf vectors: {len(tfidf.get_feature_names())}")

 
# Transform P1 documents to document-term matrix.
Pos1 = tfidf.transform(P1.values())

# Transform N1 documents to document-term matrix.
Neg1 = tfidf.transform(N1.values())

# Transform Pos1 and Neg1 documents to document-term matrix.
Ptr1 = Pos1.toarray()
Ntr1 = Neg1.toarray()

print(f"Docs vector after tfidf: #Pos:{Ptr1.shape} #Neg:{Ntr1.shape} ")

# Applying Latent semantic analysis
lsa_Pos = resizeVectors(Pos1,100)
lsa_Neg = resizeVectors(Neg1,100)

print(f"Docs vector after reducing size & tfidf: #Pos:{lsa_Pos.shape} #Neg:{lsa_Neg.shape} ")


#Pos:100 #words in Docs:91702
#Neg:100 #words in Docs:74529
#Sum:200 #words in Docs:166231
Length of Tfidf vectors: 3967
Docs vector after tfidf: #Pos:(100, 3967) #Neg:(100, 3967) 
Docs vector after reducing size & tfidf: #Pos:(100, 100) #Neg:(100, 100) 


### Observations

- Using TfidfVectorizer rather than CountVectorizer because TfidfVectorizer gives importance not only  to raw frequency of terms but also their relevance to the document.
- min_df=2 in TfidfVectorizer includes terms that appear in at least 2 documents thus considering most important terms.
- Used latent sematic analysis to reduce vector size because PCA doesnot work on sparse data and giving Valueerro
- Vectors after tfidf are #Pos:(100, 4135) #Neg:(100, 4135). When vector size reduced using latent sematic analysis number of vectors is #Pos:(100, 100) #Neg:(100, 100) 
- the output from fit_transform will be (n_samples, n_components), where n_samples is the number of samples (or documents) and n_components is the number of dimensions.
   - here, #Pos:(100, 100) #Neg:(100, 100) means 100 samples with 100 dimensions

###### For 100 documents,

- If preprocessing and vector size reduction is not done.
    - #Pos:100 #words in Docs:151077
    - #Neg:100 #words in Docs:125107
    - #Sum:200 #words in Docs:276184
    - Length of Tfidf vectors: 78851
    
    
- In preprocessing did spacy(NER,lemmatization),filter most important terms and reducing vector size
    - #Pos:100 #words in Docs:91702
    - #Neg:100 #words in Docs:74529
    - #Sum:200 #words in Docs:166231
    - Length of Tfidf vectors:3967    
    
    
- In preprocessing did spacy(NER),stemming(ours),filter most important terms and reducing vector size
    - #Pos:100 #words in Docs:91115
    - #Neg:100 #words in Docs:74090
    - #Sum:200 #words in Docs:165205
    - Length of Tfidf vectors:4136 
    

- In preprocessing did NLTK(NER,lemmatization),filter most important terms and reducing vector size
    - #Pos:100 #words in Docs:92840
    - #Neg:100 #words in Docs:75188
    - #Sum:200 #words in Docs:168028
    - Length of Tfidf vectors:3996
  
  
- In preprocessing did NLTK(NER),stemming(ours),filter most important terms and reducing vector size
    - #Pos:100 #words in Docs:92102
    - #Neg:100 #words in Docs:74515
    - #Sum:200 #words in Docs:166617
    - Length of Tfidf vectors:4107
   
   
 - Out of all (In preprocessing did spacy(NER,lemmatization),filter most important terms and reducing vector size) is giving small number of tfidf vectors 3967.
 - Lemmatization(either by NLTK-3996 or Spacy-3967) gave less number of tfidf vectors than stemming(eithrt by NLTK-4107 or Spacy-4136)

In [108]:
TrainVecPos = pd.DataFrame(lsa_Pos)
TrainVecPos["Label"] = 1
#TrainVecPos["Doc"] = [d for d in P1]

TrainVecNeg = pd.DataFrame(lsa_Neg)
TrainVecNeg["Label"] = 0
#TrainVecNeg["Doc"] = [d for d in N1]

# merge dataset
TrainVecSet3 = pd.concat([TrainVecPos, TrainVecNeg], axis=0)

# Y is label: {1,0}
Y = TrainVecSet3[['Label']]

# X is everything without label
X = TrainVecSet3.drop(['Label'], 1)


TrainVecSet3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Label
0,0.198724,-0.174723,-0.046560,-0.123015,-0.049170,0.118192,-0.034534,-0.032467,0.008728,0.078233,...,0.006468,-0.026142,-0.000777,-0.003682,0.003469,0.014366,-0.002095,0.003779,1.734723e-18,1
1,0.244216,-0.147893,-0.176419,0.065491,0.059865,-0.122240,0.242151,-0.057371,-0.111875,0.075092,...,0.037450,-0.012131,-0.031259,-0.016767,-0.011999,0.007542,-0.014171,-0.000992,5.854692e-17,1
2,0.206636,-0.085652,-0.130101,0.124288,0.032111,-0.125754,0.363288,0.023746,-0.110371,-0.017236,...,0.009186,0.029322,-0.213395,0.068549,0.000651,-0.005848,-0.003823,-0.013296,1.027824e-16,1
3,0.199278,-0.175578,-0.139687,-0.111703,-0.020737,0.031985,0.149586,-0.162077,-0.089015,0.025001,...,-0.012831,0.031061,0.010201,0.025158,-0.004957,-0.004915,0.004010,-0.024305,-2.679335e-17,1
4,0.267070,-0.143570,-0.109298,0.171315,0.074242,-0.089133,0.469048,-0.009555,-0.098383,0.012951,...,0.007843,0.053236,-0.152597,0.019948,-0.001764,-0.010894,0.006407,-0.016624,-1.184762e-16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.165220,0.025094,-0.168507,0.216939,-0.176450,-0.035032,-0.030426,-0.100123,0.259880,-0.231463,...,-0.030119,0.084340,-0.171010,0.097927,0.020588,-0.186042,0.162486,-0.196609,-4.230572e-03,0
96,0.212398,-0.093532,-0.188379,0.172079,-0.152760,0.012334,-0.107653,-0.113433,0.134218,-0.132996,...,0.013144,0.002013,-0.022474,-0.082154,0.047798,-0.080510,0.029634,0.006300,9.271991e-04,0
97,0.239396,-0.114906,0.043967,-0.169853,0.118497,0.000682,-0.061852,0.100628,0.358114,0.102758,...,0.032880,0.062694,0.001607,0.029284,0.007638,0.064021,0.008165,0.089763,-6.240473e-04,0
98,0.196572,-0.135702,-0.068218,-0.039095,-0.043260,0.058507,0.012039,-0.070316,-0.023330,-0.160980,...,0.035166,-0.026989,-0.035054,0.017161,0.010305,0.004800,0.055054,0.006673,-3.242634e-03,0


In [109]:
# extracting training and test set 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# extract train and test set first
trainX, testX, trainY, testY = train_test_split(X, Y, test_size = .25)

# Fit the scaler on the training data
scaler = StandardScaler()   #standardizes features by removing the mean and scaling to unit variance.
scaler.fit(trainX)         #computes the mean and standard deviation

# scale both the training and test data
# performing the standardization by centering and scaling based on the mean and standard deviation 
trainX_scaled = scaler.transform(trainX)
testX_scaled = scaler.transform(testX)

print(f"Train:{trainX_scaled.shape} Labels Train: {trainY.shape}\tTest:{testX_scaled.shape} Labels Test: {testY.shape} ")


Train:(150, 100) Labels Train: (150, 1)	Test:(50, 100) Labels Test: (50, 1) 


If scaling is done on the entire dataset before splitting, you are not preventing data leakage from the test set 
into the training process. This means that information from the test set can influence the scaling parameters
(mean and standard deviation)which is a mistake.The test set should be completely unseen data to simulate 
real-world performance accurately.Following is the wrong process:

scaler = StandardScaler()  
scaler.fit(X)
X_scale = scaler.transform(X)  
trainX, testX, trainY, testY = train_test_split(X_scale, Y, test_size = .25)

## Naive Bayesian Classifier

In [110]:

# Create a Gaussian Classifier
NBmodel = GaussianNB(var_smoothing=1e-9)

# Train the model using the training sets
NBmodel.fit(trainX_scaled, trainY["Label"])

#Predict Output
Y_Bayes = NBmodel.predict(testX_scaled)

target_names = ['Neg', 'Pos']
# Collect metrics
report_nb = classification_report(testY["Label"], Y_Bayes, target_names=target_names, output_dict=True)


## Random Forest

In [111]:

# Instantiate model with 200 decision trees
rf = RandomForestClassifier(
    n_estimators=200,   # Number of trees
    criterion='gini',   # Splitting criterion
    max_depth=20,     # Maximum depth of the tree
    max_features='sqrt', # Number of features to consider for the best split
    bootstrap = False   #consider whole database or not
)

# Train the model on training data
rf.fit(trainX_scaled, trainY["Label"]);

# Use the forest's predict method on the test data
Y_rf = rf.predict(testX_scaled)

target_names = ['Neg', 'Pos']
report_rf = classification_report(testY["Label"], Y_rf, target_names=target_names, output_dict=True)


## Support Vector Machine (SVM)

In [112]:

#Create a svm Classifier
clf = svm.SVC(         
    kernel='linear',      # Specifies the kernel type to be used
    gamma="scale"       # Kernel coefficient
  ) 

#Train the model using the training sets
clf.fit(trainX_scaled, trainY["Label"])

#Predict the response for test dataset
Y_clf = clf.predict(testX_scaled)

target_names = ['Neg', 'Pos']
report_clf = classification_report(testY["Label"], Y_clf, target_names=target_names, output_dict = True)


## MLP

In [113]:

# instatiate classifier
mlp = MLPClassifier(
    solver='adam',          # Solver for weight optimization
    activation='tanh',       # Activation function
    alpha=0.01,               # L2 penalty parameter
    hidden_layer_sizes=(50,),  # Size of the hidden layer
    random_state=1,
    max_iter=1000,
    learning_rate='adaptive',
    early_stopping=True
)
mlp.fit(trainX_scaled, trainY["Label"])

pred_Y = mlp.predict(testX_scaled)

target_names = ['Neg', 'Pos']
report_mlp = classification_report(testY["Label"], pred_Y, target_names=target_names,output_dict = True)


In [117]:
import pandas as pd
from IPython.display import display, Markdown

# Prepare data for DataFrame
data = {
    "Classifier": ["Naive Bayes", "Random Forest", "SVM", "MLP"],
    "Accuracy": [report_nb["accuracy"], report_rf["accuracy"], report_clf["accuracy"], report_mlp["accuracy"]],
    "Precision (Neg)": [report_nb["Neg"]["precision"], report_rf["Neg"]["precision"], report_clf["Neg"]["precision"], report_mlp["Neg"]["precision"]],
    "Recall (Neg)": [report_nb["Neg"]["recall"], report_rf["Neg"]["recall"], report_clf["Neg"]["recall"], report_mlp["Neg"]["recall"]],
    "F1-Score (Neg)": [report_nb["Neg"]["f1-score"], report_rf["Neg"]["f1-score"], report_clf["Neg"]["f1-score"], report_mlp["Neg"]["f1-score"]],
    "Precision (Pos)": [report_nb["Pos"]["precision"], report_rf["Pos"]["precision"], report_clf["Pos"]["precision"], report_mlp["Pos"]["precision"]],
    "Recall (Pos)": [report_nb["Pos"]["recall"], report_rf["Pos"]["recall"], report_clf["Pos"]["recall"], report_mlp["Pos"]["recall"]],
    "F1-Score (Pos)": [report_nb["Pos"]["f1-score"], report_rf["Pos"]["f1-score"], report_clf["Pos"]["f1-score"], report_mlp["Pos"]["f1-score"]]
}

# Create DataFrame
comparison_df = pd.DataFrame(data)

comparison_df


Unnamed: 0,Classifier,Accuracy,Precision (Neg),Recall (Neg),F1-Score (Neg),Precision (Pos),Recall (Pos),F1-Score (Pos)
0,Naive Bayes,0.96,0.923077,1.0,0.96,1.0,0.923077,0.96
1,Random Forest,0.86,0.814815,0.916667,0.862745,0.913043,0.807692,0.857143
2,SVM,0.36,0.3,0.25,0.272727,0.4,0.461538,0.428571
3,MLP,0.5,0.47619,0.416667,0.444444,0.517241,0.576923,0.545455


In [None]:
## results of train_test_split data ran multiple times

Classifier	Accuracy	Precision (Neg)	Recall (Neg)	F1-Score (Neg)	Precision (Pos)	Recall (Pos)	F1-Score (Pos)
Naive Bayes	0.98	0.956522	1	0.977778	1	0.964286	0.981818
Random Forest	0.8	0.7	0.954545	0.807692	0.95	0.678571	0.791667
SVM	0.28	0.25	0.318182	0.28	0.318182	0.25	0.28
MLP	0.46	0.368421	0.318182	0.341463	0.516129	0.571429	0.542373


Naive Bayes	0.98	0.964286	1	0.981818	1	0.956522	0.977778
Random Forest	0.94	1	0.888889	0.941176	0.884615	1	0.938776
SVM	0.22	0.269231	0.259259	0.264151	0.166667	0.173913	0.170213
MLP	0.54	0.583333	0.518519	0.54902	0.5	0.565217	0.530612


Naive Bayes	0.98	0.962963	1	0.981132	1	0.958333	0.978723
Random Forest	0.86	0.827586	0.923077	0.872727	0.904762	0.791667	0.844444
SVM	0.26	0.28	0.269231	0.27451	0.24	0.25	0.244898
MLP	0.56	0.625	0.384615	0.47619	0.529412	0.75	0.62069


Naive Bayes	0.86	0.774194	1	0.872727	1	0.730769	0.844444
Random Forest	0.84	0.75	1	0.857143	1	0.692308	0.818182
SVM	0.26	0.259259	0.291667	0.27451	0.26087	0.230769	0.244898
MLP	0.44	0.375	0.25	0.3	0.470588	0.615385	0.533333


Naive Bayes	1	1	1	1	1	1	1
Random Forest	0.9	0.866667	0.962963	0.912281	0.95	0.826087	0.883721
SVM	0.28	0.304348	0.259259	0.28	0.259259	0.304348	0.28
MLP	0.5	0.555556	0.37037	0.444444	0.46875	0.652174	0.545455



In [122]:
#k-fold cross validation (using different training/testing) sets

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.pipeline import Pipeline,make_pipeline


# classifiers defined
classifiers = {
    'Naive Bayes': GaussianNB(var_smoothing=1e-9),
    'Random Forest': RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=20, max_features='sqrt', bootstrap=False),
    'SVM' : svm.SVC(kernel = "linear"),
    'MLP' : MLPClassifier(solver='adam',activation='tanh', alpha=0.01, hidden_layer_sizes=(50,), random_state=1,max_iter=1000,learning_rate='adaptive',early_stopping=True)

}

# metrics wanted to collect
scoring_metrics = {
    'precision_neg': make_scorer(precision_score, pos_label=0, zero_division=0),
    'recall_neg': make_scorer(recall_score, pos_label=0, zero_division=0),
    'f1_neg': make_scorer(f1_score, pos_label=0, zero_division=0),
    'precision_pos': make_scorer(precision_score, pos_label=1, zero_division=0),
    'recall_pos': make_scorer(recall_score, pos_label=1, zero_division=0),
    'f1_pos': make_scorer(f1_score, pos_label=1, zero_division=0),
    'accuracy': 'accuracy'
}

n_splits=10
# Stratified K-Fold for maintaining label proportions
cv = StratifiedKFold(n_splits, shuffle=True, random_state=42)

# Results dictionary
results = {name: cross_validate(make_pipeline(StandardScaler(), clf), X, Y.values.ravel(), scoring=scoring_metrics, cv=cv, return_train_score=False) for name, clf in classifiers.items()}

# For each fold and each classifier, print out a formatted table of results
for fold_idx in range(n_splits):
    print(f"Results for fold {fold_idx + 1}")
    fold_results = []
    
    for clf_name, clf_scores in results.items():
        fold_result = {
            'Classifier': clf_name,
            'Accuracy': clf_scores['test_accuracy'][fold_idx],
            'Precision (Neg)': clf_scores['test_precision_neg'][fold_idx],
            'Recall (Neg)': clf_scores['test_recall_neg'][fold_idx],
            'F1-Score (Neg)': clf_scores['test_f1_neg'][fold_idx],
            'Precision (Pos)': clf_scores['test_precision_pos'][fold_idx],
            'Recall (Pos)': clf_scores['test_recall_pos'][fold_idx],
            'F1-Score (Pos)': clf_scores['test_f1_pos'][fold_idx]
        }
        fold_results.append(fold_result)
    
    # Convert the fold results to DataFrame
    fold_df = pd.DataFrame(fold_results)
    
    fold_markdown = fold_df.to_markdown(index=False)
    
    # Display the Markdown-formatted string 
    display(Markdown(fold_markdown))


Results for fold 1


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| Random Forest |       0.95 |          0.909091 |            1   |         0.952381 |          1        |            0.9 |         0.947368 |
| SVM           |       0.2  |          0.25     |            0.3 |         0.272727 |          0.125    |            0.1 |         0.111111 |
| MLP           |       0.65 |          0.714286 |            0.5 |         0.588235 |          0.615385 |            0.8 |         0.695652 |

Results for fold 2


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| Random Forest |       0.75 |          0.777778 |            0.7 |         0.736842 |          0.727273 |            0.8 |         0.761905 |
| SVM           |       0.15 |          0.181818 |            0.2 |         0.190476 |          0.111111 |            0.1 |         0.105263 |
| MLP           |       0.6  |          0.666667 |            0.4 |         0.5      |          0.571429 |            0.8 |         0.666667 |

Results for fold 3


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| Random Forest |       0.95 |          0.909091 |            1   |         0.952381 |          1        |            0.9 |         0.947368 |
| SVM           |       0.25 |          0.222222 |            0.2 |         0.210526 |          0.272727 |            0.3 |         0.285714 |
| MLP           |       0.65 |          0.666667 |            0.6 |         0.631579 |          0.636364 |            0.7 |         0.666667 |

Results for fold 4


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       0.9  |          0.833333 |            1   |         0.909091 |          1        |            0.8 |         0.888889 |
| Random Forest |       0.85 |          0.769231 |            1   |         0.869565 |          1        |            0.7 |         0.823529 |
| SVM           |       0.25 |          0.142857 |            0.1 |         0.117647 |          0.307692 |            0.4 |         0.347826 |
| MLP           |       0.25 |          0.222222 |            0.2 |         0.210526 |          0.272727 |            0.3 |         0.285714 |

Results for fold 5


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       0.95 |          0.909091 |            1   |         0.952381 |          1        |            0.9 |         0.947368 |
| Random Forest |       0.85 |          0.769231 |            1   |         0.869565 |          1        |            0.7 |         0.823529 |
| SVM           |       0.15 |          0.181818 |            0.2 |         0.190476 |          0.111111 |            0.1 |         0.105263 |
| MLP           |       0.4  |          0.375    |            0.3 |         0.333333 |          0.416667 |            0.5 |         0.454545 |

Results for fold 6


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |        1   |          1        |            1   |         1        |               1   |            1   |         1        |
| Random Forest |        0.9 |          0.833333 |            1   |         0.909091 |               1   |            0.8 |         0.888889 |
| SVM           |        0.1 |          0.1      |            0.1 |         0.1      |               0.1 |            0.1 |         0.1      |
| MLP           |        0.5 |          0.5      |            0.5 |         0.5      |               0.5 |            0.5 |         0.5      |

Results for fold 7


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       1    |          1        |            1   |         1        |             1     |            1   |         1        |
| Random Forest |       0.85 |          0.769231 |            1   |         0.869565 |             1     |            0.7 |         0.823529 |
| SVM           |       0.15 |          0.230769 |            0.3 |         0.26087  |             0     |            0   |         0        |
| MLP           |       0.6  |          0.583333 |            0.7 |         0.636364 |             0.625 |            0.5 |         0.555556 |

Results for fold 8


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       0.95 |          0.909091 |            1   |         0.952381 |          1        |            0.9 |         0.947368 |
| Random Forest |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| SVM           |       0.4  |          0.428571 |            0.6 |         0.5      |          0.333333 |            0.2 |         0.25     |
| MLP           |       0.45 |          0.444444 |            0.4 |         0.421053 |          0.454545 |            0.5 |         0.47619  |

Results for fold 9


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| Random Forest |       0.95 |          1        |            0.9 |         0.947368 |          0.909091 |            1   |         0.952381 |
| SVM           |       0.25 |          0.272727 |            0.3 |         0.285714 |          0.222222 |            0.2 |         0.210526 |
| MLP           |       0.5  |          0.5      |            0.4 |         0.444444 |          0.5      |            0.6 |         0.545455 |

Results for fold 10


| Classifier    |   Accuracy |   Precision (Neg) |   Recall (Neg) |   F1-Score (Neg) |   Precision (Pos) |   Recall (Pos) |   F1-Score (Pos) |
|:--------------|-----------:|------------------:|---------------:|-----------------:|------------------:|---------------:|-----------------:|
| Naive Bayes   |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| Random Forest |       1    |          1        |            1   |         1        |          1        |            1   |         1        |
| SVM           |       0.3  |          0.25     |            0.2 |         0.222222 |          0.333333 |            0.4 |         0.363636 |
| MLP           |       0.35 |          0.285714 |            0.2 |         0.235294 |          0.384615 |            0.5 |         0.434783 |

- Out of all classifiers SVM is not performing well
- some values are 0 for SVM because of the following error
- UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use zero_division parameter to control this behavior.
- It is because the denominator in their calculations (the number of positive predictions) is zero.
- The zero_division parameter dictates what value to return when there is a division by zero during scoring. If zero_division=1, it means that if there are no positive predictions and therefore no true positives or false positives, the precision score is defined as 1. If zero_division=0, then the score is defined as 0.

#### Hyperparameter optimization using RandomizedSearchCV or GridSearchCV is done to improve the performance of models.
- Both aim to find the optimal set of hyperparameters for a given mode.
- GridSearchCV performs an exhaustive search over a specified parameter grid.This method will systematically explore all possible combinations of the provided hyperparameters.
- RandomizedSearchCV samples a fixed number of parameter settings from specified distributions.This method does not try every possible combination, but instead selects them randomly

In [230]:
#hyperparameter optimization using RandomizedSearchCV for RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Define the parameter distribution
param_distr = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state' : [40,42,45],
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'bootstrap': [True, False]
}

# Create a base model
rf = RandomForestClassifier()

# Instantiate the random search model
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distr,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search to the data
random_search.fit(trainX_scaled, trainY.values.ravel())

# random_search.best_params_ contains the best parameters found
print("Best parameters found: ", random_search.best_params_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'random_state': 45, 'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}


In [106]:
#hyperparameter optimization using GridSearchCV for Support Vector Machine(SVM)

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    'gamma': ['scale', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly']
}

# Create a base model
sv = svm.SVC()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=sv, param_grid=param_grid, cv=5, n_jobs=4, verbose=2)

# Fit the grid search to the data
grid_search.fit(trainX_scaled, trainY.values.ravel())

# grid_search.best_params_ contains the best parameters found
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best parameters found:  {'C': 0.001, 'gamma': 'scale', 'kernel': 'rbf'}


In [245]:
#hyperparameter optimization using GridSearchCV for MLP

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'solver':['adam','lbfgs', 'sgd'],              # Solver for weight optimization
    'alpha':[0.0001,1e-5] ,            
    'random_state':[1,None] ,  
    'learning_rate':['constant', 'invscaling', 'adaptive'],
    'hidden_layer_sizes': [(50,), (100,), (200,)], # Size of the hidden layer
    'activation': ['tanh', 'relu'],   # Activation function
    'alpha': [0.0001, 0.001, 0.01]   # L2 penalty parameter
}

# Create a base model
mlp = MLPClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(trainX_scaled, trainY.values.ravel())

# grid_search.best_params_ contains the best parameters found
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best parameters found:  {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling', 'random_state': None, 'solver': 'sgd'}


## Understand how Preprocessing is happening based on 2 documents 

### Original text

['i', 'went', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'being', 'coaxed', 'to', 'by', 'a', 'few', 'friends', 'of', 'mine', '.', 'i', "'ll", 'admit', 'that', 'i', 'was', 'reluctant', 'to', 'see', 'it', 'because', 'from', 'what', 'i', 'knew', 'of', 'ashton', 'kutcher', 'he', 'was', 'only', 'able', 'to', 'do', 'comedy', '.', 'i', 'was', 'wrong', '.', 'kutcher', 'played', 'the', 'character', 'of', 'jake', 'fischer', 'very', 'well', ',', 'and', 'kevin', 'costner', 'played', 'ben', 'randall', 'with', 'such', 'professionalism', '.', 'the', 'sign', 'of', 'a', 'good', 'movie', 'is', 'that', 'it', 'can', 'toy', 'with', 'our', 'emotions', '.', 'this', 'one', 'did', 'exactly', 'that', '.', 'the', 'entire', 'theater', '(', 'which', 'was', 'sold', 'out', ')', 'was', 'overcome', 'by', 'laughter', 'during', 'the', 'first', 'half', 'of', 'the', 'movie', ',', 'and', 'were', 'moved', 'to', 'tears', 'during', 'the', 'second', 'half', '.', 'while', 'exiting', 'the', 'theater', 'i', 'not', 'only', 'saw', 'many', 'women', 'in', 'tears', ',', 'but', 'many', 'full', 'grown', 'men', 'as', 'well', ',', 'trying', 'desperately', 'not', 'to', 'let', 'anyone', 'see', 'them', 'crying', '.', 'this', 'movie', 'was', 'great', ',', 'and', 'i', 'suggest', 'that', 'you', 'go', 'see', 'it', 'before', 'you', 'judge', '.']

['actor', 'turned', 'director', 'bill', 'paxton', 'follows', 'up', 'his', 'promising', 'debut', ',', 'the', 'gothic-horror', '``', 'frailty', "''", ',', 'with', 'this', 'family', 'friendly', 'sports', 'drama', 'about', 'the', '1913', 'u.s.', 'open', 'where', 'a', 'young', 'american', 'caddy', 'rises', 'from', 'his', 'humble', 'background', 'to', 'play', 'against', 'his', 'bristish', 'idol', 'in', 'what', 'was', 'dubbed', 'as', '``', 'the', 'greatest', 'game', 'ever', 'played', '.', "''", 'i', "'m", 'no', 'fan', 'of', 'golf', ',', 'and', 'these', 'scrappy', 'underdog', 'sports', 'flicks', 'are', 'a', 'dime', 'a', 'dozen', '(', 'most', 'recently', 'done', 'to', 'grand', 'effect', 'with', '``', 'miracle', "''", 'and', '``', 'cinderella', 'man', "''", ')', ',', 'but', 'some', 'how', 'this', 'film', 'was', 'enthralling', 'all', 'the', 'same.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'the', 'film', 'starts', 'with', 'some', 'creative', 'opening', 'credits', '(', 'imagine', 'a', 'disneyfied', 'version', 'of', 'the', 'animated', 'opening', 'credits', 'of', 'hbo', "'s", '``', 'carnivale', "''", 'and', '``', 'rome', "''", ')', ',', 'but', 'lumbers', 'along', 'slowly', 'for', 'its', 'first', 'by-the-numbers', 'hour', '.', 'once', 'the', 'action', 'moves', 'to', 'the', 'u.s.', 'open', 'things', 'pick', 'up', 'very', 'well', '.', 'paxton', 'does', 'a', 'nice', 'job', 'and', 'shows', 'a', 'knack', 'for', 'effective', 'directorial', 'flourishes', '(', 'i', 'loved', 'the', 'rain-soaked', 'montage', 'of', 'the', 'action', 'on', 'day', 'two', 'of', 'the', 'open', ')', 'that', 'propel', 'the', 'plot', 'further', 'or', 'add', 'some', 'unexpected', 'psychological', 'depth', 'to', 'the', 'proceedings', '.', 'there', "'s", 'some', 'compelling', 'character', 'development', 'when', 'the', 'british', 'harry', 'vardon', 'is', 'haunted', 'by', 'images', 'of', 'the', 'aristocrats', 'in', 'black', 'suits', 'and', 'top', 'hats', 'who', 'destroyed', 'his', 'family', 'cottage', 'as', 'a', 'child', 'to', 'make', 'way', 'for', 'a', 'golf', 'course', '.', 'he', 'also', 'does', 'a', 'good', 'job', 'of', 'visually', 'depicting', 'what', 'goes', 'on', 'in', 'the', 'players', "'", 'heads', 'under', 'pressure', '.', 'golf', ',', 'a', 'painfully', 'boring', 'sport', ',', 'is', 'brought', 'vividly', 'alive', 'here', '.', 'credit', 'should', 'also', 'be', 'given', 'the', 'set', 'designers', 'and', 'costume', 'department', 'for', 'creating', 'an', 'engaging', 'period-piece', 'atmosphere', 'of', 'london', 'and', 'boston', 'at', 'the', 'beginning', 'of', 'the', 'twentieth', 'century.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'you', 'know', 'how', 'this', 'is', 'going', 'to', 'end', 'not', 'only', 'because', 'it', "'s", 'based', 'on', 'a', 'true', 'story', 'but', 'also', 'because', 'films', 'in', 'this', 'genre', 'follow', 'the', 'same', 'template', 'over', 'and', 'over', ',', 'but', 'paxton', 'puts', 'on', 'a', 'better', 'than', 'average', 'show', 'and', 'perhaps', 'indicates', 'more', 'talent', 'behind', 'the', 'camera', 'than', 'he', 'ever', 'had', 'in', 'front', 'of', 'it', '.', 'despite', 'the', 'formulaic', 'nature', ',', 'this', 'is', 'a', 'nice', 'and', 'easy', 'film', 'to', 'root', 'for', 'that', 'deserves', 'to', 'find', 'an', 'audience', '.']

#### After removing stop words

['went', 'saw', 'movie', 'last', 'night', 'coaxed', 'friends', 'mine', '.', "'ll", 'admit', 'reluctant', 'see', 'knew', 'ashton', 'kutcher', 'able', 'comedy', '.', 'wrong', '.', 'kutcher', 'played', 'character', 'jake', 'fischer', 'well', ',', 'kevin', 'costner', 'played', 'ben', 'randall', 'professionalism', '.', 'sign', 'good', 'movie', 'toy', 'emotions', '.', 'one', 'exactly', '.', 'entire', 'theater', '(', 'sold', ')', 'overcome', 'laughter', 'first', 'half', 'movie', ',', 'moved', 'tears', 'second', 'half', '.', 'exiting', 'theater', 'saw', 'many', 'women', 'tears', ',', 'many', 'full', 'grown', 'men', 'well', ',', 'trying', 'desperately', 'let', 'anyone', 'see', 'crying', '.', 'movie', 'great', ',', 'suggest', 'go', 'see', 'judge', '.']

['actor', 'turned', 'director', 'bill', 'paxton', 'follows', 'promising', 'debut', ',', 'gothic-horror', '``', 'frailty', "''", ',', 'family', 'friendly', 'sports', 'drama', '1913', 'u.s.', 'open', 'young', 'american', 'caddy', 'rises', 'humble', 'background', 'play', 'bristish', 'idol', 'dubbed', '``', 'greatest', 'game', 'ever', 'played', '.', "''", "'m", 'fan', 'golf', ',', 'scrappy', 'underdog', 'sports', 'flicks', 'dime', 'dozen', '(', 'recently', 'done', 'grand', 'effect', '``', 'miracle', "''", '``', 'cinderella', 'man', "''", ')', ',', 'film', 'enthralling', 'same.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'starts', 'creative', 'opening', 'credits', '(', 'imagine', 'disneyfied', 'version', 'animated', 'opening', 'credits', 'hbo', "'s", '``', 'carnivale', "''", '``', 'rome', "''", ')', ',', 'lumbers', 'along', 'slowly', 'first', 'by-the-numbers', 'hour', '.', 'action', 'moves', 'u.s.', 'open', 'things', 'pick', 'well', '.', 'paxton', 'nice', 'job', 'shows', 'knack', 'effective', 'directorial', 'flourishes', '(', 'loved', 'rain-soaked', 'montage', 'action', 'day', 'two', 'open', ')', 'propel', 'plot', 'add', 'unexpected', 'psychological', 'depth', 'proceedings', '.', "'s", 'compelling', 'character', 'development', 'british', 'harry', 'vardon', 'haunted', 'images', 'aristocrats', 'black', 'suits', 'top', 'hats', 'destroyed', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', '.', 'also', 'good', 'job', 'visually', 'depicting', 'goes', 'players', "'", 'heads', 'pressure', '.', 'golf', ',', 'painfully', 'boring', 'sport', ',', 'brought', 'vividly', 'alive', '.', 'credit', 'also', 'given', 'set', 'designers', 'costume', 'department', 'creating', 'engaging', 'period-piece', 'atmosphere', 'london', 'boston', 'beginning', 'twentieth', 'century.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'going', 'end', "'s", 'based', 'true', 'story', 'also', 'films', 'genre', 'follow', 'template', ',', 'paxton', 'puts', 'better', 'average', 'show', 'perhaps', 'indicates', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despite', 'formulaic', 'nature', ',', 'nice', 'easy', 'film', 'root', 'deserves', 'find', 'audience', '.']

### Observations:

- Data should be converted to lower case before removing stopwords
- for ex: letter i is a stopword but I is not

#### stopwords removed from all positive documents are:

['i', 'and', 'this', 'after', 'being', 'to', 'by', 'a', 'few', 'of', 'i', 'that', 'i', 'was', 'to', 'it', 'because', 'from', 'what', 'i', 'of', 'he', 'was', 'only', 'to', 'do', 'i', 'was', 'the', 'of', 'very', 'and', 'with', 'such', 'the', 'of', 'a', 'is', 'that', 'it', 'can', 'with', 'our', 'this', 'did', 'that', 'the', 'which', 'was', 'out', 'was', 'by', 'during', 'the', 'of', 'the', 'and', 'were', 'to', 'during', 'the', 'while', 'the', 'i', 'not', 'only', 'in', 'but', 'as', 'not', 'to', 'them', 'this', 'was', 'and', 'i', 'that', 'you', 'it', 'before', 'you']
['up', 'his', 'the', 'with', 'this', 'about', 'the', 'where', 'a', 'from', 'his', 'to', 'against', 'his', 'in', 'what', 'was', 'as', 'the', 'i', 'no', 'of', 'and', 'these', 'are', 'a', 'a', 'most', 'to', 'with', 'and', 'but', 'some', 'how', 'this', 'was', 'all', 'the', 'the', 'with', 'some', 'a', 'of', 'the', 'of', 'and', 'but', 'for', 'its', 'once', 'the', 'to', 'the', 'up', 'very', 'does', 'a', 'and', 'a', 'for', 'i', 'the', 'of', 'the', 'on', 'of', 'the', 'that', 'the', 'further', 'or', 'some', 'to', 'the', 'there', 'some', 'when', 'the', 'is', 'by', 'of', 'the', 'in', 'and', 'who', 'his', 'as', 'a', 'to', 'for', 'a', 'he', 'does', 'a', 'of', 'what', 'on', 'in', 'the', 'under', 'a', 'is', 'here', 'should', 'be', 'the', 'and', 'for', 'an', 'of', 'and', 'at', 'the', 'of', 'the', 'you', 'how', 'this', 'is', 'to', 'not', 'only', 'because', 'it', 'on', 'a', 'but', 'because', 'in', 'this', 'the', 'same', 'over', 'and', 'over', 'but', 'on', 'a', 'than', 'and', 'more', 'the', 'than', 'he', 'had', 'in', 'of', 'it', 'the', 'this', 'is', 'a', 'and', 'to', 'for', 'that', 'to', 'an']

### after performing lemmatization and NER using Spacy

['go', 'see', 'movie', 'TIME', 'TIME', 'coax', 'friend', '-PRON-', '.', 'will', 'admit', 'reluctant', 'see', 'know', 'ashton', 'kutcher', 'able', 'comedy', '.', 'wrong', '.', 'kutcher', 'play', 'character', 'jake', 'fischer', 'well', ',', 'kevin', 'costn', 'play', 'ben', 'randall', 'professionalism', '.', 'sign', 'good', 'movie', 'toy', 'emotion', '.', 'one', 'exactly', '.', 'entire', 'theater', '(', 'sell', ')', 'overcome', 'laughter', 'ORDINAL', 'CARDINAL', 'movie', ',', 'move', 'tear', 'ORDINAL', 'CARDINAL', '.', 'exit', 'theater', 'see', 'many', 'woman', 'tear', ',', 'many', 'full', 'grown', 'man', 'well', ',', 'try', 'desperately', 'let', 'anyone', 'see', 'cry', '.', 'movie', 'great', ',', 'suggest', 'go', 'see', 'judge', '.']

['actor', 'turn', 'director', 'bill', 'paxton', 'follow', 'promise', 'debut', ',', 'gothic', '-', 'horror', '`', '`', 'frailty', "''", ',', 'family', 'friendly', 'sport', 'drama', 'DATE', 'u.s', '.', 'open', 'young', 'NORP', 'caddy', 'rise', 'humble', 'background', 'play', 'bristish', 'idol', 'dub', '`', '`', 'great', 'game', 'ever', 'play', '.', "''", "'", 'm', 'fan', 'golf', ',', 'scrappy', 'underdog', 'sport', 'flick', 'dime', 'dozen', '(', 'recently', 'do', 'grand', 'effect', '`', '`', 'miracle', "''", '`', '`', 'cinderella', 'man', "''", ')', ',', 'film', 'enthral', 'same', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'start', 'creative', 'opening', 'credit', '(', 'imagine', 'disneyfie', 'version', 'animate', 'opening', 'credit', 'hbo', "'s", '`', '`', 'carnivale', "''", '`', '`', 'rome', "''", ')', ',', 'lumber', 'along', 'slowly', 'ORDINAL', 'by', '-', 'the', '-', 'number', 'hour', '.', 'action', 'move', 'u.s', '.', 'open', 'thing', 'pick', 'well', '.', 'paxton', 'nice', 'job', 'show', 'knack', 'effective', 'directorial', 'flourish', '(', 'love', 'rain', '-', 'soak', 'montage', 'action', 'day', 'CARDINAL', 'open', ')', 'propel', 'plot', 'add', 'unexpected', 'psychological', 'depth', 'proceeding', '.', "'s", 'compelling', 'character', 'development', 'british', 'harry', 'vardon', 'haunt', 'image', 'aristocrats', 'black', 'suit', 'top', 'hat', 'destroy', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', '.', 'also', 'good', 'job', 'visually', 'depict', 'go', 'player', "'", 'head', 'pressure', '.', 'golf', ',', 'painfully', 'bore', 'sport', ',', 'bring', 'vividly', 'alive', '.', 'credit', 'also', 'give', 'set', 'designer', 'costume', 'department', 'create', 'engage', 'period', '-', 'piece', 'atmosphere', 'london', 'boston', 'begin', 'DATE', 'DATE', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'go', 'end', "'s", 'base', 'true', 'story', 'also', 'film', 'genre', 'follow', 'template', ',', 'paxton', 'put', 'well', 'average', 'show', 'perhaps', 'indicate', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despite', 'formulaic', 'nature', ',', 'nice', 'easy', 'film', 'root', 'deserve', 'find', 'audience', '.']

#### performing stemming(ours-mystemDic) 

['go', 'see', 'movie', 'time', 'time', 'coax', 'friend', '-pron-', '.', 'will', 'admit', 'reluctant', 'see', 'know', 'ashton', 'kutcher', 'able', 'comedy', '.', 'wrong', '.', 'kutcher', 'play', 'character', 'jake', 'fischer', 'well', ',', 'kevin', 'costn', 'play', 'ben', 'randall', 'professionalism', '.', 'sign', 'good', 'movie', 'toy', 'emotion', '.', 'one', 'exactly', '.', 'entire', 'theater', '(', 'sell', ')', 'overcome', 'laught', 'ordinal', 'cardinal', 'movie', ',', 'move', 'tear', 'ordinal', 'cardinal', '.', 'exit', 'theater', 'see', 'many', 'woman', 'tear', ',', 'many', 'full', 'grown', 'man', 'well', ',', 'try', 'desperately', 'let', 'anyone', 'see', 'cry', '.', 'movie', 'great', ',', 'suggest', 'go', 'see', 'judge', '.']

['actor', 'turn', 'director', 'bill', 'paxton', 'follow', 'promise', 'debut', ',', 'gothic', '-', 'horror', '`', '`', 'frailty', "''", ',', 'family', 'friendly', 'sport', 'drama', 'date', 'u.s', '.', 'open', 'young', 'norp', 'caddy', 'rise', 'humble', 'background', 'play', 'bristish', 'idol', 'dub', '`', '`', 'great', 'game', 'ever', 'play', '.', "''", "'", 'm', 'fan', 'golf', ',', 'scrappy', 'underdog', 'sport', 'flick', 'dime', 'dozen', '(', 'recently', 'do', 'grand', 'effect', '`', '`', 'miracle', "''", '`', '`', 'cinderella', 'man', "''", ')', ',', 'film', 'enthral', 'same', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'start', 'creative', 'opening', 'credit', '(', 'imagine', 'disneyfie', 'version', 'animate', 'open', 'credit', 'hbo', 'be', '`', '`', 'carnivale', "''", '`', '`', 'rome', "''", ')', ',', 'lumber', 'along', 'slowly', 'ordinal', 'by', '-', 'the', '-', 'number', 'hour', '.', 'action', 'move', 'u.s', '.', 'open', 'thing', 'pick', 'well', '.', 'paxton', 'nice', 'job', 'show', 'knack', 'effective', 'directorial', 'flourish', '(', 'love', 'rain', '-', 'soak', 'montage', 'action', 'day', 'cardinal', 'open', ')', 'propel', 'plot', 'add', 'unexpected', 'psychological', 'depth', 'proceeding', '.', 'be', 'compelling', 'character', 'development', 'british', 'harry', 'vardon', 'haunt', 'image', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', '.', 'also', 'good', 'job', 'visually', 'depict', 'go', 'player', "'", 'head', 'pressure', '.', 'golf', ',', 'painfully', 'bore', 'sport', ',', 'br', 'vividly', 'alive', '.', 'credit', 'also', 'give', 'set', 'designer', 'costume', 'department', 'create', 'engage', 'period', '-', 'piece', 'atmosphere', 'london', 'boston', 'begin', 'date', 'date', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'go', 'end', 'be', 'base', 'true', 'story', 'also', 'film', 'genre', 'follow', 'template', ',', 'paxton', 'put', 'well', 'average', 'show', 'perhaps', 'indicate', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despite', 'formulaic', 'nature', ',', 'nice', 'easy', 'film', 'root', 'deserve', 'find', 'audience', '.']

#### performing stemming(porter stemmer) 

['go', 'see', 'movi', 'time', 'time', 'coax', 'friend', '-pron-', '.', 'will', 'admit', 'reluct', 'see', 'know', 'ashton', 'kutcher', 'abl', 'comedi', '.', 'wrong', '.', 'kutcher', 'play', 'charact', 'jake', 'fischer', 'well', ',', 'kevin', 'costn', 'play', 'ben', 'randal', 'profession', '.', 'sign', 'good', 'movi', 'toy', 'emot', '.', 'one', 'exactli', '.', 'entir', 'theater', '(', 'sell', ')', 'overcom', 'laughter', 'ordin', 'cardin', 'movi', ',', 'move', 'tear', 'ordin', 'cardin', '.', 'exit', 'theater', 'see', 'mani', 'woman', 'tear', ',', 'mani', 'full', 'grown', 'man', 'well', ',', 'tri', 'desper', 'let', 'anyon', 'see', 'cri', '.', 'movi', 'great', ',', 'suggest', 'go', 'see', 'judg', '.']

['actor', 'turn', 'director', 'bill', 'paxton', 'follow', 'promis', 'debut', ',', 'gothic', '-', 'horror', '`', '`', 'frailti', "''", ',', 'famili', 'friendli', 'sport', 'drama', 'date', 'u.', '.', 'open', 'young', 'norp', 'caddi', 'rise', 'humbl', 'background', 'play', 'bristish', 'idol', 'dub', '`', '`', 'great', 'game', 'ever', 'play', '.', "''", "'", 'm', 'fan', 'golf', ',', 'scrappi', 'underdog', 'sport', 'flick', 'dime', 'dozen', '(', 'recent', 'do', 'grand', 'effect', '`', '`', 'miracl', "''", '`', '`', 'cinderella', 'man', "''", ')', ',', 'film', 'enthral', 'same', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'start', 'creativ', 'open', 'credit', '(', 'imagin', 'disneyfi', 'version', 'anim', 'open', 'credit', 'hbo', "'s", '`', '`', 'carnival', "''", '`', '`', 'rome', "''", ')', ',', 'lumber', 'along', 'slowli', 'ordin', 'by', '-', 'the', '-', 'number', 'hour', '.', 'action', 'move', 'u.', '.', 'open', 'thing', 'pick', 'well', '.', 'paxton', 'nice', 'job', 'show', 'knack', 'effect', 'directori', 'flourish', '(', 'love', 'rain', '-', 'soak', 'montag', 'action', 'day', 'cardin', 'open', ')', 'propel', 'plot', 'add', 'unexpect', 'psycholog', 'depth', 'proceed', '.', "'s", 'compel', 'charact', 'develop', 'british', 'harri', 'vardon', 'haunt', 'imag', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'famili', 'cottag', 'child', 'make', 'way', 'golf', 'cours', '.', 'also', 'good', 'job', 'visual', 'depict', 'go', 'player', "'", 'head', 'pressur', '.', 'golf', ',', 'pain', 'bore', 'sport', ',', 'bring', 'vividli', 'aliv', '.', 'credit', 'also', 'give', 'set', 'design', 'costum', 'depart', 'creat', 'engag', 'period', '-', 'piec', 'atmospher', 'london', 'boston', 'begin', 'date', 'date', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'go', 'end', "'s", 'base', 'true', 'stori', 'also', 'film', 'genr', 'follow', 'templat', ',', 'paxton', 'put', 'well', 'averag', 'show', 'perhap', 'indic', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despit', 'formula', 'natur', ',', 'nice', 'easi', 'film', 'root', 'deserv', 'find', 'audienc', '.']


### obseravtions
1. we can observe that our stemmer is performing well than porter stemmer.
1. I have added Possessive ending 's, modal 'll etc.. in my dictionary which helped in good stemming
1. we can observe that many words like movi(e),reluct(ant),abl(e),comedi(y),charact(er),emot(ion),exactli(y),overcome(e),orsin(al),cardin(al),mani(y),tri(y) etc...
1. So I preferd using our stemmer in my project rather than porter stemmer


#### After removing punctuations and br(html)

['go', 'see', 'movie', 'time', 'time', 'coax', 'friend', 'pron', 'will', 'admit', 'reluctant', 'see', 'know', 'ashton', 'kutcher', 'able', 'comedy', 'wrong', 'kutcher', 'play', 'character', 'jake', 'fischer', 'well', 'kevin', 'costn', 'play', 'ben', 'randall', 'professionalism', 'sign', 'good', 'movie', 'toy', 'emotion', 'one', 'exactly', 'entire', 'theater', 'sell', 'overcome', 'laught', 'ordinal', 'cardinal', 'movie', 'move', 'tear', 'ordinal', 'cardinal', 'exit', 'theater', 'see', 'many', 'woman', 'tear', 'many', 'full', 'grown', 'man', 'well', 'try', 'desperately', 'let', 'anyone', 'see', 'cry', 'movie', 'great', 'suggest', 'go', 'see', 'judge']

['actor', 'turn', 'director', 'bill', 'paxton', 'follow', 'promise', 'debut', 'gothic', 'horror', 'frailty', 'family', 'friendly', 'sport', 'drama', 'date', 'u.s', 'open', 'young', 'norp', 'caddy', 'rise', 'humble', 'background', 'play', 'bristish', 'idol', 'dub', 'great', 'game', 'ever', 'play', 'm', 'fan', 'golf', 'scrappy', 'underdog', 'sport', 'flick', 'dime', 'dozen', 'recently', 'do', 'grand', 'effect', 'miracle', 'cinderella', 'man', 'film', 'enthral', 'same', 'film', 'start', 'creative', 'opening', 'credit', 'imagine', 'disneyfie', 'version', 'animate', 'open', 'credit', 'hbo', 'be', 'carnivale', 'rome', 'lumber', 'along', 'slowly', 'ordinal', 'by', 'the', 'number', 'hour', 'action', 'move', 'u.s', 'open', 'thing', 'pick', 'well', 'paxton', 'nice', 'job', 'show', 'knack', 'effective', 'directorial', 'flourish', 'love', 'rain', 'soak', 'montage', 'action', 'day', 'cardinal', 'open', 'propel', 'plot', 'add', 'unexpected', 'psychological', 'depth', 'proceeding', 'be', 'compelling', 'character', 'development', 'british', 'harry', 'vardon', 'haunt', 'image', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', 'also', 'good', 'job', 'visually', 'depict', 'go', 'player', 'head', 'pressure', 'golf', 'painfully', 'bore', 'sport', 'vividly', 'alive', 'credit', 'also', 'give', 'set', 'designer', 'costume', 'department', 'create', 'engage', 'period', 'piece', 'atmosphere', 'london', 'boston', 'begin', 'date', 'date', 'know', 'go', 'end', 'be', 'base', 'true', 'story', 'also', 'film', 'genre', 'follow', 'template', 'paxton', 'put', 'well', 'average', 'show', 'perhaps', 'indicate', 'talent', 'behind', 'camera', 'ever', 'front', 'despite', 'formulaic', 'nature', 'nice', 'easy', 'film', 'root', 'deserve', 'find', 'audience']

## Using NLTK
#### performing lemmatization and NER using NLTK (done before stopwords and infrequent words removal)

['I', 'go', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'be', 'coax', 'to', 'by', 'a', 'few', 'friend', 'of', 'mine', '.', 'I', "'ll", 'admit', 'that', 'I', 'be', 'reluctant', 'to', 'see', 'it', 'because', 'from', 'what', 'I', 'know', 'of', 'PERSON', 'Kutcher', 'he', 'be', 'only', 'able', 'to', 'do', 'comedy', '.', 'I', 'be', 'wrong', '.', 'PERSON', 'play', 'the', 'character', 'of', 'PERSON', 'PERSON', 'very', 'well', ',', 'and', 'PERSON', 'PERSON', 'play', 'PERSON', 'PERSON', 'with', 'such', 'professionalism', '.', 'The', 'sign', 'of', 'a', 'good', 'movie', 'be', 'that', 'it', 'can', 'toy', 'with', 'our', 'emotion', '.', 'This', 'one', 'do', 'exactly', 'that', '.', 'The', 'entire', 'theater', '(', 'which', 'be', 'sell', 'out', ')', 'be', 'overcome', 'by', 'laughter', 'during', 'the', 'first', 'half', 'of', 'the', 'movie', ',', 'and', 'be', 'move', 'to', 'tear', 'during', 'the', 'second', 'half', '.', 'While', 'exit', 'the', 'theater', 'I', 'not', 'only', 'saw', 'many', 'woman', 'in', 'tear', ',', 'but', 'many', 'full', 'grow', 'men', 'a', 'well', ',', 'try', 'desperately', 'not', 'to', 'let', 'anyone', 'see', 'them', 'cry', '.', 'This', 'movie', 'be', 'great', ',', 'and', 'I', 'suggest', 'that', 'you', 'go', 'see', 'it', 'before', 'you', 'judge', '.']

['PERSON', 'turn', 'director', 'PERSON', 'PERSON', 'follow', 'up', 'his', 'promising', 'debut', ',', 'the', 'Gothic-horror', '``', 'Frailty', "''", ',', 'with', 'this', 'family', 'friendly', 'sport', 'drama', 'about', 'the', '1913', 'GPE', 'Open', 'where', 'a', 'young', 'GPE', 'caddy', 'rise', 'from', 'his', 'humble', 'background', 'to', 'play', 'against', 'his', 'GPE', 'idol', 'in', 'what', 'be', 'dub', 'a', '``', 'The', 'GPE', 'Game', 'Ever', 'Played', '.', "''", 'I', "'m", 'no', 'fan', 'of', 'golf', ',', 'and', 'these', 'scrappy', 'underdog', 'sport', 'flick', 'be', 'a', 'dime', 'a', 'dozen', '(', 'most', 'recently', 'do', 'to', 'grand', 'effect', 'with', '``', 'Miracle', "''", 'and', '``', 'PERSON', 'PERSON', "''", ')', ',', 'but', 'some', 'how', 'this', 'film', 'be', 'enthral', 'all', 'the', 'same.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'The', 'film', 'start', 'with', 'some', 'creative', 'opening', 'credit', '(', 'imagine', 'a', 'Disneyfied', 'version', 'of', 'the', 'animated', 'opening', 'credit', 'of', 'ORGANIZATION', "'s", '``', 'Carnivale', "''", 'and', '``', 'Rome', "''", ')', ',', 'but', 'lumber', 'along', 'slowly', 'for', 'it', 'first', 'by-the-numbers', 'hour', '.', 'Once', 'the', 'action', 'move', 'to', 'the', 'GPE', 'Open', 'thing', 'pick', 'up', 'very', 'well', '.', 'PERSON', 'do', 'a', 'nice', 'job', 'and', 'show', 'a', 'knack', 'for', 'effective', 'directorial', 'flourish', '(', 'I', 'love', 'the', 'rain-soaked', 'montage', 'of', 'the', 'action', 'on', 'day', 'two', 'of', 'the', 'open', ')', 'that', 'propel', 'the', 'plot', 'far', 'or', 'add', 'some', 'unexpected', 'psychological', 'depth', 'to', 'the', 'proceeding', '.', 'There', "'s", 'some', 'compelling', 'character', 'development', 'when', 'the', 'GPE', 'PERSON', 'PERSON', 'be', 'haunt', 'by', 'image', 'of', 'the', 'aristocrat', 'in', 'black', 'suit', 'and', 'top', 'hat', 'who', 'destroy', 'his', 'family', 'cottage', 'a', 'a', 'child', 'to', 'make', 'way', 'for', 'a', 'golf', 'course', '.', 'He', 'also', 'do', 'a', 'good', 'job', 'of', 'visually', 'depict', 'what', 'go', 'on', 'in', 'the', 'player', "'", 'head', 'under', 'pressure', '.', 'PERSON', ',', 'a', 'painfully', 'boring', 'sport', ',', 'be', 'bring', 'vividly', 'alive', 'here', '.', 'PERSON', 'should', 'also', 'be', 'give', 'the', 'set', 'designer', 'and', 'costume', 'department', 'for', 'create', 'an', 'engage', 'period-piece', 'atmosphere', 'of', 'GPE', 'and', 'GPE', 'at', 'the', 'beginning', 'of', 'the', 'twentieth', 'century.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'You', 'know', 'how', 'this', 'be', 'go', 'to', 'end', 'not', 'only', 'because', 'it', "'s", 'base', 'on', 'a', 'true', 'story', 'but', 'also', 'because', 'film', 'in', 'this', 'genre', 'follow', 'the', 'same', 'template', 'over', 'and', 'over', ',', 'but', 'PERSON', 'put', 'on', 'a', 'good', 'than', 'average', 'show', 'and', 'perhaps', 'indicate', 'more', 'talent', 'behind', 'the', 'camera', 'than', 'he', 'ever', 'have', 'in', 'front', 'of', 'it', '.', 'Despite', 'the', 'formulaic', 'nature', ',', 'this', 'be', 'a', 'nice', 'and', 'easy', 'film', 'to', 'root', 'for', 'that', 'deserve', 'to', 'find', 'an', 'audience', '.']

### Obseravtions:

1. NLTK has only few NER's like GeoPoliticalEntities(GPE),PERSON,ORGANIZATION
2. Space has many NER's like PERSON,NORP,FAC,ORG,GPE,LOC,PRODUCT,EVENT,WORK_OF_ART,LAW,LANGUAGE,DATE,TIME,PERCENT,
MONEY,QUALTY,ORDINAL,CARDINAL.
1. NLTK does NER(Named Entity Recognition) when first letter capitalized. For example:
 - Microsoft is considered as ORGANIZATION but microsoft is not
 - Devi is considered as PERSON but devi is not
1. So I have to run NLTK NER before converting to lower case
1. Lemmatization by NLTK doesnot perform well like Spacy. some words are not lemmatized properly.For ex:
 - saw not converted to see,men not converted to man etc..

#### removed stopwords from lemmatized and ner done by NTLK

['go', 'saw', 'movie', 'last', 'night', 'coax', 'friend', 'mine', '.', "'ll", 'admit', 'reluctant', 'see', 'know', 'person', 'kutcher', 'able', 'comedy', '.', 'wrong', '.', 'person', 'play', 'character', 'person', 'person', 'well', ',', 'person', 'person', 'play', 'person', 'person', 'professionalism', '.', 'sign', 'good', 'movie', 'toy', 'emotion', '.', 'one', 'exactly', '.', 'entire', 'theater', '(', 'sell', ')', 'overcome', 'laughter', 'first', 'half', 'movie', ',', 'move', 'tear', 'second', 'half', '.', 'exit', 'theater', 'saw', 'many', 'woman', 'tear', ',', 'many', 'full', 'grow', 'men', 'well', ',', 'try', 'desperately', 'let', 'anyone', 'see', 'cry', '.', 'movie', 'great', ',', 'suggest', 'go', 'see', 'judge', '.']

['person', 'turn', 'director', 'person', 'person', 'follow', 'promising', 'debut', ',', 'gothic-horror', '``', 'frailty', '``', ',', 'family', 'friendly', 'sport', 'drama', '1913', 'gpe', 'open', 'young', 'gpe', 'caddy', 'rise', 'humble', 'background', 'play', 'gpe', 'idol', 'dub', '``', 'gpe', 'game', 'ever', 'played', '.', '``', "'m", 'fan', 'golf', ',', 'scrappy', 'underdog', 'sport', 'flick', 'dime', 'dozen', '(', 'recently', 'grand', 'effect', '``', 'miracle', '``', '``', 'person', 'person', '``', ')', ',', 'film', 'enthral', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'start', 'creative', 'opening', 'credit', '(', 'imagine', 'disneyfied', 'version', 'animated', 'opening', 'credit', 'organization', "'s", '``', 'carnivale', '``', '``', 'rome', '``', ')', ',', 'lumber', 'along', 'slowly', 'first', 'by-the-numbers', 'hour', '.', 'action', 'move', 'gpe', 'open', 'thing', 'pick', 'well', '.', 'person', 'nice', 'job', 'show', 'knack', 'effective', 'directorial', 'flourish', '(', 'love', 'rain-soaked', 'montage', 'action', 'day', 'two', 'open', ')', 'propel', 'plot', 'far', 'add', 'unexpected', 'psychological', 'depth', 'proceeding', '.', "'s", 'compelling', 'character', 'development', 'gpe', 'person', 'person', 'haunt', 'image', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', '.', 'also', 'good', 'job', 'visually', 'depict', 'go', 'player', "'", 'head', 'pressure', '.', 'person', ',', 'painfully', 'boring', 'sport', ',', 'bring', 'vividly', 'alive', '.', 'person', 'also', 'give', 'set', 'designer', 'costume', 'department', 'create', 'engage', 'period-piece', 'atmosphere', 'gpe', 'gpe', 'beginning', 'twentieth', 'century', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'go', 'end', "'s", 'base', 'true', 'story', 'also', 'film', 'genre', 'follow', 'template', ',', 'person', 'put', 'good', 'average', 'show', 'perhaps', 'indicate', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despite', 'formulaic', 'nature', ',', 'nice', 'easy', 'film', 'root', 'deserve', 'find', 'audience', '.']

#### stemming using our stemmer

['go', 'saw', 'movie', 'last', 'night', 'coax', 'friend', 'mine', '.', 'be', 'admit', 'reluctant', 'see', 'know', 'person', 'kutch', 'able', 'comedy', '.', 'wrong', '.', 'person', 'play', 'charact', 'person', 'person', 'well', ',', 'person', 'person', 'play', 'person', 'person', 'professionalism', '.', 'sign', 'good', 'movie', 'toy', 'emotion', '.', 'one', 'exactly', '.', 'entire', 'theater', '(', 'sell', ')', 'overcome', 'laughter', 'first', 'half', 'movie', ',', 'move', 'tear', 'second', 'half', '.', 'exit', 'theater', 'see', 'many', 'woman', 'tear', ',', 'many', 'full', 'grow', 'men', 'well', ',', 'try', 'desperately', 'let', 'anyone', 'see', 'cry', '.', 'movie', 'great', ',', 'suggest', 'go', 'see', 'judge', '.']

['person', 'turn', 'director', 'person', 'person', 'follow', 'promising', 'debut', ',', 'gothic-horror', '``', 'frailty', '``', ',', 'family', 'friendly', 'sport', 'drama', '1913', 'gpe', 'open', 'young', 'gpe', 'caddy', 'rise', 'humble', 'background', 'play', 'gpe', 'idol', 'dub', '``', 'gpe', 'game', 'ever', 'play', '.', '``', 'am', 'fan', 'golf', ',', 'scrappy', 'underdog', 'sport', 'flick', 'dime', 'dozen', '(', 'recently', 'grand', 'effect', '``', 'miracle', '``', '``', 'person', 'person', '``', ')', ',', 'film', 'enthral', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'start', 'creative', 'opening', 'credit', '(', 'imagine', 'disneyfi', 'version', 'animat', 'open', 'credit', 'organization', 'be', '``', 'carnivale', '``', '``', 'rome', '``', ')', ',', 'lumber', 'along', 'slowly', 'first', 'by-the-number', 'hour', '.', 'action', 'move', 'gpe', 'open', 'thing', 'pick', 'well', '.', 'person', 'nice', 'job', 'show', 'knack', 'effective', 'directorial', 'flourish', '(', 'love', 'rain-soaked', 'montage', 'action', 'day', 'two', 'open', ')', 'propel', 'plot', 'far', 'add', 'unexpected', 'psychological', 'depth', 'proceeding', '.', 'be', 'compelling', 'character', 'development', 'gpe', 'person', 'person', 'haunt', 'image', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', '.', 'also', 'good', 'job', 'visually', 'depict', 'go', 'player', "'", 'head', 'pressure', '.', 'person', ',', 'painfully', 'bor', 'sport', ',', 'br', 'vividly', 'alive', '.', 'person', 'also', 'give', 'set', 'designer', 'costume', 'department', 'create', 'engage', 'period-piece', 'atmosphere', 'gpe', 'gpe', 'beginn', 'twentieth', 'century', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'go', 'end', 'be', 'base', 'true', 'story', 'also', 'film', 'genre', 'follow', 'template', ',', 'person', 'put', 'good', 'average', 'show', 'perhaps', 'indicate', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despite', 'formulaic', 'nature', ',', 'nice', 'easy', 'film', 'root', 'deserve', 'find', 'audience', '.']

#### stemming using porterstemmer 

['go', 'saw', 'movi', 'last', 'night', 'coax', 'friend', 'mine', '.', "'ll", 'admit', 'reluct', 'see', 'know', 'person', 'kutcher', 'abl', 'comedi', '.', 'wrong', '.', 'person', 'play', 'charact', 'person', 'person', 'well', ',', 'person', 'person', 'play', 'person', 'person', 'profession', '.', 'sign', 'good', 'movi', 'toy', 'emot', '.', 'one', 'exactli', '.', 'entir', 'theater', '(', 'sell', ')', 'overcom', 'laughter', 'first', 'half', 'movi', ',', 'move', 'tear', 'second', 'half', '.', 'exit', 'theater', 'saw', 'mani', 'woman', 'tear', ',', 'mani', 'full', 'grow', 'men', 'well', ',', 'tri', 'desper', 'let', 'anyon', 'see', 'cri', '.', 'movi', 'great', ',', 'suggest', 'go', 'see', 'judg', '.']

['person', 'turn', 'director', 'person', 'person', 'follow', 'promis', 'debut', ',', 'gothic-horror', '``', 'frailti', '``', ',', 'famili', 'friendli', 'sport', 'drama', '1913', 'gpe', 'open', 'young', 'gpe', 'caddi', 'rise', 'humbl', 'background', 'play', 'gpe', 'idol', 'dub', '``', 'gpe', 'game', 'ever', 'play', '.', '``', "'m", 'fan', 'golf', ',', 'scrappi', 'underdog', 'sport', 'flick', 'dime', 'dozen', '(', 'recent', 'grand', 'effect', '``', 'miracl', '``', '``', 'person', 'person', '``', ')', ',', 'film', 'enthral', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'film', 'start', 'creativ', 'open', 'credit', '(', 'imagin', 'disneyfi', 'version', 'anim', 'open', 'credit', 'organ', "'s", '``', 'carnival', '``', '``', 'rome', '``', ')', ',', 'lumber', 'along', 'slowli', 'first', 'by-the-numb', 'hour', '.', 'action', 'move', 'gpe', 'open', 'thing', 'pick', 'well', '.', 'person', 'nice', 'job', 'show', 'knack', 'effect', 'directori', 'flourish', '(', 'love', 'rain-soak', 'montag', 'action', 'day', 'two', 'open', ')', 'propel', 'plot', 'far', 'add', 'unexpect', 'psycholog', 'depth', 'proceed', '.', "'s", 'compel', 'charact', 'develop', 'gpe', 'person', 'person', 'haunt', 'imag', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'famili', 'cottag', 'child', 'make', 'way', 'golf', 'cours', '.', 'also', 'good', 'job', 'visual', 'depict', 'go', 'player', "'", 'head', 'pressur', '.', 'person', ',', 'pain', 'bore', 'sport', ',', 'bring', 'vividli', 'aliv', '.', 'person', 'also', 'give', 'set', 'design', 'costum', 'depart', 'creat', 'engag', 'period-piec', 'atmospher', 'gpe', 'gpe', 'begin', 'twentieth', 'centuri', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'know', 'go', 'end', "'s", 'base', 'true', 'stori', 'also', 'film', 'genr', 'follow', 'templat', ',', 'person', 'put', 'good', 'averag', 'show', 'perhap', 'indic', 'talent', 'behind', 'camera', 'ever', 'front', '.', 'despit', 'formula', 'natur', ',', 'nice', 'easi', 'film', 'root', 'deserv', 'find', 'audienc', '.']

#### After removing punctuations (from lemmatized,NER by NLTK)
['go', 'saw', 'movie', 'last', 'night', 'coax', 'friend', 'mine', 'be', 'admit', 'reluctant', 'see', 'know', 'person', 'kutch', 'able', 'comedy', 'wrong', 'person', 'play', 'charact', 'person', 'person', 'well', 'person', 'person', 'play', 'person', 'person', 'professionalism', 'sign', 'good', 'movie', 'toy', 'emotion', 'one', 'exactly', 'entire', 'theater', 'sell', 'overcome', 'laughter', 'first', 'half', 'movie', 'move', 'tear', 'second', 'half', 'exit', 'theater', 'see', 'many', 'woman', 'tear', 'many', 'full', 'grow', 'men', 'well', 'try', 'desperately', 'let', 'anyone', 'see', 'cry', 'movie', 'great', 'suggest', 'go', 'see', 'judge']

['person', 'turn', 'director', 'person', 'person', 'follow', 'promising', 'debut', 'gothic', 'horror', 'frailty', 'family', 'friendly', 'sport', 'drama', '1913', 'gpe', 'open', 'young', 'gpe', 'caddy', 'rise', 'humble', 'background', 'play', 'gpe', 'idol', 'dub', 'gpe', 'game', 'ever', 'play', 'am', 'fan', 'golf', 'scrappy', 'underdog', 'sport', 'flick', 'dime', 'dozen', 'recently', 'grand', 'effect', 'miracle', 'person', 'person', 'film', 'enthral', 'film', 'start', 'creative', 'opening', 'credit', 'imagine', 'disneyfi', 'version', 'animat', 'open', 'credit', 'organization', 'be', 'carnivale', 'rome', 'lumber', 'along', 'slowly', 'first', 'by', 'the', 'number', 'hour', 'action', 'move', 'gpe', 'open', 'thing', 'pick', 'well', 'person', 'nice', 'job', 'show', 'knack', 'effective', 'directorial', 'flourish', 'love', 'rain', 'soaked', 'montage', 'action', 'day', 'two', 'open', 'propel', 'plot', 'far', 'add', 'unexpected', 'psychological', 'depth', 'proceeding', 'be', 'compelling', 'character', 'development', 'gpe', 'person', 'person', 'haunt', 'image', 'aristocrat', 'black', 'suit', 'top', 'hat', 'destroy', 'family', 'cottage', 'child', 'make', 'way', 'golf', 'course', 'also', 'good', 'job', 'visually', 'depict', 'go', 'player', 'head', 'pressure', 'person', 'painfully', 'bor', 'sport', 'vividly', 'alive', 'person', 'also', 'give', 'set', 'designer', 'costume', 'department', 'create', 'engage', 'period', 'piece', 'atmosphere', 'gpe', 'gpe', 'beginn', 'twentieth', 'century', 'know', 'go', 'end', 'be', 'base', 'true', 'story', 'also', 'film', 'genre', 'follow', 'template', 'person', 'put', 'good', 'average', 'show', 'perhaps', 'indicate', 'talent', 'behind', 'camera', 'ever', 'front', 'despite', 'formulaic', 'nature', 'nice', 'easy', 'film', 'root', 'deserve', 'find', 'audience']