# Natural Language Processing

In [60]:
import pandas as pd
import numpy as np
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [61]:
# conda install -c anaconda nltk


In [62]:
kittycat = 'We are all agreeing with the cats on this one, and she is too!'

### Basic Cleanup

In [63]:
import re
def clean_up(text):
    """Normalize raw text before tokenization.

    Deletes every character that is not an ASCII letter, a digit or a space,
    then lower-cases the result and strips leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lower-cased text.

    Notes
    -----
    Unwanted characters are deleted rather than replaced by a space, so words
    separated only by punctuation are fused together
    (``"movie??really"`` becomes ``"moviereally"``).
    """
    # Uses the `re` module imported at the top of this cell; no per-call import needed.
    text = re.sub('[^A-Za-z0-9 ]', '', text)
    return text.lower().strip()
kittycat_clean = clean_up(kittycat)
print(kittycat_clean)

we are all agreeing with the cats on this one and she is too


### Tokenization

In [64]:
from nltk.tokenize import word_tokenize
kittycat_tokenize = word_tokenize(kittycat_clean)
print(kittycat_tokenize)

['we', 'are', 'all', 'agreeing', 'with', 'the', 'cats', 'on', 'this', 'one', 'and', 'she', 'is', 'too']


### Lemmatization

In [65]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [66]:
# Run every token through the lemmatizer, keeping the original token order.
kittycat_lemmatize = list(map(lemmatizer.lemmatize, kittycat_tokenize))
print(kittycat_lemmatize)

['we', 'are', 'all', 'agreeing', 'with', 'the', 'cat', 'on', 'this', 'one', 'and', 'she', 'is', 'too']


### Stemming

In [67]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

In [68]:
# Reduce each lemmatized token to its stem, keeping the original token order.
kittycat_stem = list(map(stemmer.stem, kittycat_lemmatize))
print(kittycat_stem)

['we', 'are', 'all', 'agre', 'with', 'the', 'cat', 'on', 'this', 'one', 'and', 'she', 'is', 'too']


### Removing Stopwords

In [69]:
from nltk.corpus import stopwords

stopwords_list = stopwords.words("english")
# A set gives constant-time membership tests; the list is kept for display.
stopwords_set = set(stopwords_list)

# Keep only the tokens that are not English stop words.
kittycat_nostopwords = [item for item in kittycat_stem if item not in stopwords_set]
print(kittycat_nostopwords)

['agre', 'cat', 'one']


In [70]:
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Vectorizing Text

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ------

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [73]:
vectorizer.fit_transform(kittycat_nostopwords).toarray()

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [110]:
# NOTE(review): each of these five one-word strings is treated as its own document,
# so the result can have at most five columns and a single 1 per row. The recorded
# output below (10 columns, counts of 2) does not match this input and looks stale
# - TODO re-run this cell to refresh it.
vectorizer.fit_transform(['we', 'are', 'all', 'agreeing', 'with']).toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [1, 2, 2, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 1, 1]])

## Applying to real data: IMDB movie reviews

Get the data from here: http://ai.stanford.edu/~amaas/data/sentiment/

An example walkthrough: https://dropsofai.com/sentiment-analysis-with-python-bag-of-words/

In [75]:
from pathlib import Path

# reading positive reviews
txt_folder = Path('aclImdb/train/pos').rglob('*.txt')
files = list(txt_folder)
content = []
for name in files:
    # `with` closes the file even if reading raises; the explicit encoding keeps
    # the result independent of the platform's default locale.
    with open(name, 'r', encoding='utf-8') as f:
        # Only the first line of each file is kept; an empty file yields ''.
        content.append(f.readline())
pos = pd.DataFrame(content)
pos.head()

Unnamed: 0,0
0,For a movie that gets no respect there sure ar...
1,Bizarre horror movie filled with famous faces ...
2,"A solid, if unremarkable film. Matthau, as Ein..."
3,It's a strange feeling to sit alone in a theat...
4,"You probably all already know this by now, but..."


In [76]:
# reading negative reviews
txt_folder = Path('aclImdb/train/neg').rglob('*.txt')
files = list(txt_folder)
content = []
for name in files:
    # `with` closes the file even if reading raises; the explicit encoding keeps
    # the result independent of the platform's default locale.
    with open(name, 'r', encoding='utf-8') as f:
        # Only the first line of each file is kept; an empty file yields ''.
        content.append(f.readline())
neg = pd.DataFrame(content)
neg.head()

Unnamed: 0,0
0,Working with one of the best Shakespeare sourc...
1,"Well...tremors I, the original started off in ..."
2,Ouch! This one was a bit painful to sit throug...
3,"I've seen some crappy movies in my life, but t..."
4,"""Carriers"" follows the exploits of two guys an..."


In [77]:
# we will try to predict whether a review is positive or negative
pos['target'] = 1
neg['target'] = 0

In [78]:
# stack the positive and negative reviews into one frame and
# rename the text column (labelled 0 by the DataFrame constructor) to 'review'
df = pd.concat([pos, neg], axis=0).rename(columns={0: 'review'})

In [79]:
df.head()

Unnamed: 0,review,target
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1


In [89]:
# the dataset is very large, so we are only taking a subset for analysis
# random_state pins the draw so the subset, and every number derived from it,
# is the same on each fresh run.
# NOTE: this rebinds `df` to a sample of itself, so running the cell again
# shrinks the data a second time; re-run the concat cell first to start over
# from the full frame.
df = df.sample(frac=0.25, random_state=42)

## Preparing the data

In [81]:
df['target'].value_counts()

0    3204
1    3046
Name: target, dtype: int64

In [82]:
df['review_clean'] = df['review'].apply(clean_up) # clean_up is already defined near the top of the notebook
df.head()

Unnamed: 0,review,target,review_clean
5834,Horror movie??really???? i cant believe how ba...,0,horror moviereally i cant believe how bad this...
6812,I can say without a shadow of a doubt that Goi...,0,i can say without a shadow of a doubt that goi...
3639,I rented this film out having heard of the fus...,0,i rented this film out having heard of the fus...
12472,"Beautiful and touching movie. Rich colors, gre...",1,beautiful and touching movie rich colors great...
3849,A fondly-remembered melodrama  thanks chiefly...,1,a fondlyremembered melodrama thanks chiefly t...


## Tokenization

In [83]:
from nltk.tokenize import word_tokenize
df['review_tokenize'] = df['review_clean'].apply(word_tokenize)
df.head()

Unnamed: 0,review,target,review_clean,review_tokenize
5834,Horror movie??really???? i cant believe how ba...,0,horror moviereally i cant believe how bad this...,"[horror, moviereally, i, cant, believe, how, b..."
6812,I can say without a shadow of a doubt that Goi...,0,i can say without a shadow of a doubt that goi...,"[i, can, say, without, a, shadow, of, a, doubt..."
3639,I rented this film out having heard of the fus...,0,i rented this film out having heard of the fus...,"[i, rented, this, film, out, having, heard, of..."
12472,"Beautiful and touching movie. Rich colors, gre...",1,beautiful and touching movie rich colors great...,"[beautiful, and, touching, movie, rich, colors..."
3849,A fondly-remembered melodrama  thanks chiefly...,1,a fondlyremembered melodrama thanks chiefly t...,"[a, fondlyremembered, melodrama, thanks, chief..."


## Lemmatization

In [84]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def _lemmatize_tokens(tokens):
    """Return the lemma of every token in one review, in order."""
    return [lemmatizer.lemmatize(token) for token in tokens]

df['review_lemmatize'] = df['review_tokenize'].apply(_lemmatize_tokens)
df.head()

Unnamed: 0,review,target,review_clean,review_tokenize,review_lemmatize
5834,Horror movie??really???? i cant believe how ba...,0,horror moviereally i cant believe how bad this...,"[horror, moviereally, i, cant, believe, how, b...","[horror, moviereally, i, cant, believe, how, b..."
6812,I can say without a shadow of a doubt that Goi...,0,i can say without a shadow of a doubt that goi...,"[i, can, say, without, a, shadow, of, a, doubt...","[i, can, say, without, a, shadow, of, a, doubt..."
3639,I rented this film out having heard of the fus...,0,i rented this film out having heard of the fus...,"[i, rented, this, film, out, having, heard, of...","[i, rented, this, film, out, having, heard, of..."
12472,"Beautiful and touching movie. Rich colors, gre...",1,beautiful and touching movie rich colors great...,"[beautiful, and, touching, movie, rich, colors...","[beautiful, and, touching, movie, rich, color,..."
3849,A fondly-remembered melodrama  thanks chiefly...,1,a fondlyremembered melodrama thanks chiefly t...,"[a, fondlyremembered, melodrama, thanks, chief...","[a, fondlyremembered, melodrama, thanks, chief..."


## Stemming

In [85]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

def _stem_tokens(tokens):
    """Return the stem of every token in one review, in order."""
    return [stemmer.stem(token) for token in tokens]

df['review_stem'] = df['review_lemmatize'].apply(_stem_tokens)
df.head()

Unnamed: 0,review,target,review_clean,review_tokenize,review_lemmatize,review_stem
5834,Horror movie??really???? i cant believe how ba...,0,horror moviereally i cant believe how bad this...,"[horror, moviereally, i, cant, believe, how, b...","[horror, moviereally, i, cant, believe, how, b...","[horror, movier, i, cant, believ, how, bad, th..."
6812,I can say without a shadow of a doubt that Goi...,0,i can say without a shadow of a doubt that goi...,"[i, can, say, without, a, shadow, of, a, doubt...","[i, can, say, without, a, shadow, of, a, doubt...","[i, can, say, without, a, shadow, of, a, doubt..."
3639,I rented this film out having heard of the fus...,0,i rented this film out having heard of the fus...,"[i, rented, this, film, out, having, heard, of...","[i, rented, this, film, out, having, heard, of...","[i, rent, this, film, out, have, heard, of, th..."
12472,"Beautiful and touching movie. Rich colors, gre...",1,beautiful and touching movie rich colors great...,"[beautiful, and, touching, movie, rich, colors...","[beautiful, and, touching, movie, rich, color,...","[beauti, and, touch, movi, rich, color, great,..."
3849,A fondly-remembered melodrama  thanks chiefly...,1,a fondlyremembered melodrama thanks chiefly t...,"[a, fondlyremembered, melodrama, thanks, chief...","[a, fondlyremembered, melodrama, thanks, chief...","[a, fondlyrememb, melodrama, thank, chiefli, t..."


## Removing Stopwords

In [86]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words("english")
# Membership is tested once per token of every review; a set makes each test
# constant-time instead of a scan over the whole list.
stopwords_set = set(stopwords_list)

# Drop stop words from each review's stemmed tokens.
# NOTE(review): the tokens are already stemmed at this point, so a stop word whose
# stem differs from its listed form (presumably e.g. 'because' -> 'becaus') would
# not be removed - TODO confirm and consider filtering before stemming.
df['review_nostopwords'] = df['review_stem'].apply(
    lambda row: [item for item in row if item not in stopwords_set]
)
df.head()

Unnamed: 0,review,target,review_clean,review_tokenize,review_lemmatize,review_stem,review_nostopwords
5834,Horror movie??really???? i cant believe how ba...,0,horror moviereally i cant believe how bad this...,"[horror, moviereally, i, cant, believe, how, b...","[horror, moviereally, i, cant, believe, how, b...","[horror, movier, i, cant, believ, how, bad, th...","[horror, movier, cant, believ, bad, movi, wasw..."
6812,I can say without a shadow of a doubt that Goi...,0,i can say without a shadow of a doubt that goi...,"[i, can, say, without, a, shadow, of, a, doubt...","[i, can, say, without, a, shadow, of, a, doubt...","[i, can, say, without, a, shadow, of, a, doubt...","[say, without, shadow, doubt, go, overboard, s..."
3639,I rented this film out having heard of the fus...,0,i rented this film out having heard of the fus...,"[i, rented, this, film, out, having, heard, of...","[i, rented, this, film, out, having, heard, of...","[i, rent, this, film, out, have, heard, of, th...","[rent, film, heard, fuss, put, academi, award,..."
12472,"Beautiful and touching movie. Rich colors, gre...",1,beautiful and touching movie rich colors great...,"[beautiful, and, touching, movie, rich, colors...","[beautiful, and, touching, movie, rich, color,...","[beauti, and, touch, movi, rich, color, great,...","[beauti, touch, movi, rich, color, great, set,..."
3849,A fondly-remembered melodrama  thanks chiefly...,1,a fondlyremembered melodrama thanks chiefly t...,"[a, fondlyremembered, melodrama, thanks, chief...","[a, fondlyremembered, melodrama, thanks, chief...","[a, fondlyrememb, melodrama, thank, chiefli, t...","[fondlyrememb, melodrama, thank, chiefli, rona..."


## Vectorizing Text

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
# The reviews are already lists of tokens, so the analyzer is the identity
# function: each list is passed through unchanged instead of being re-tokenized.
vectorizer = CountVectorizer(analyzer=lambda x: x)

In [91]:
# fit_transform builds the vocabulary and returns a sparse count matrix;
# .toarray() expands it into a dense NumPy array (one row per review, one column
# per vocabulary entry), which can take a lot of memory on a large corpus.
X = vectorizer.fit_transform(df['review_nostopwords']).toarray()

In [92]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Splitting into train and test set

In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.33, random_state=42)

In [98]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression()

In [99]:
pred = clf.predict(X_test)

In [109]:
# pred

In [101]:
clf.score(X_test, y_test)

0.8313953488372093

In [102]:
from sklearn.metrics import precision_score, recall_score, f1_score

# report precision, recall and F1 for the test predictions, one value per line
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, pred))

0.8076923076923077
0.8502024291497976
0.8284023668639053


In [103]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, pred)

0.832164411600921