## Introduction to Natural Language Processing using Python

In [190]:
# Natural Language processing is the process of using Machine Learning models to train on text data.

In [127]:
# Example: Automatically categorizing movie reviews as good/bad.
# chatbots

In [128]:
# NLP mostly uses classification type machine learning agorithms like logistic regression and Naive Bayes

In [191]:
import numpy as np
import pandas as pd

In [131]:
# Importing the dataset. \t is tab delimiter, quoting = 3 will ignore double quotes so that it wont cause any problem
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [192]:
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [133]:
#Cleaning the text data:

In [1]:
import re
import nltk

In [135]:
dataset['Review'][0]

'Wow... Loved this place.'

In [136]:
#replace everything except a to z and A to Z with a space. (to remove numbers and any other characters )
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])
review

'Wow    Loved this place '

In [137]:
review=review.lower()
review

'wow    loved this place '

In [2]:
#next we will remove stop words. stop words are generally used words like 'a' 'an' 'the' 'them' etc. which is not so useful for 
#machine learning algorithms.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Vinay Kumar
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords

In [4]:
print(set(stopwords.words('english')))

{'we', 'her', 'had', 'these', 'there', 'weren', 'ain', 'm', 'ma', "should've", 'those', 'more', 'yourselves', 'in', 'its', 'now', 'theirs', 'he', 'before', 'aren', 's', 'his', 't', "shan't", 'over', 'all', "she's", 'once', 'when', 'himself', 'have', 'during', 'below', 'under', 'any', 'hasn', "doesn't", 'ourselves', 'who', 'and', "it's", 'it', "haven't", 'where', 'some', 'should', 'been', 'no', 'herself', 'wouldn', 'itself', 'they', 'needn', 'their', 'me', 'don', "couldn't", "that'll", 'while', 'won', 'hers', "wouldn't", 'here', 'too', 'not', "you'd", 'again', 'hadn', "mustn't", 'against', 'ours', 'but', 'y', 'him', 'whom', 'i', 'just', 've', 'she', 'how', 'were', 'such', "don't", "hadn't", 'the', 'if', "wasn't", 'so', 'further', "mightn't", 'that', 'am', 'on', 'does', 'for', "weren't", 'couldn', 'above', 'myself', 'most', 'own', "needn't", 'can', 're', 'd', "aren't", 'my', 'didn', "hasn't", 'out', "shouldn't", 'do', 'has', 'other', 'yours', 'down', 'into', "you're", 'which', "isn't", '

In [141]:
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [142]:
review = [word for word in review if not word in set(stopwords.words('english'))]
review

['wow', 'loved', 'place']

In [143]:
#Next we will look how to do stemming. 
#Stemming is the process of converting the given word to its root word.
#example: love is the root word of loved, loving etc.,
#This is done so as to remove duplicate words and reduce the number of columns in sparse matrix.

In [144]:
from nltk.stem import PorterStemmer

In [145]:
ps = PorterStemmer()

In [146]:
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review

['wow', 'love', 'place']

In [147]:
#joining all the words to form the string again:
review = ' '.join(review)
review

'wow love place'

In [148]:
dataset.shape

(1000, 2)

In [149]:
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [150]:
dataset.Review.count()

1000

In [151]:
#Creating bag of words model.

In [152]:
#machine learning models only works with numerical data. Thus we need to covert our text data to numerical data.
#This is called as Bag of Words. The converted data is also called as 'Sparse Matrix'

In [153]:
from sklearn.feature_extraction.text import CountVectorizer

In [154]:
#Example to understand fit method:
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
vect = CountVectorizer()
#fit method to learn vocabulary from the given data.
vect.fit(simple_train)
vect.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [155]:
#Once fit, we can use transform method to convert given data to sparse matrix
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [156]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [157]:
simple_test = ["please don't call me"]
simple_test_dtm = vect.transform(simple_test)
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [158]:
#Lets do fit and transform  on our actual data

In [159]:
cv = CountVectorizer()

In [160]:
#fit method to learn vocabulary from the given data.
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [161]:
len(cv.get_feature_names())

1565

In [162]:
#first 10 features
cv.get_feature_names()[0:10]

['absolut',
 'absolutley',
 'accid',
 'accommod',
 'accomod',
 'accordingli',
 'account',
 'ach',
 'acknowledg',
 'across']

In [163]:
X = cv.transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [164]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [165]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [166]:
# train the model using X_train_dtm
%time logreg.fit(X_train, y_train)

Wall time: 17 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [167]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test)

In [168]:
from sklearn import metrics

In [169]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.70999999999999996

In [170]:
#testing with our own review:

In [194]:
reviews = ['the taste was very good','it was very bad', 'nice, will come again']

In [200]:
X = cv.transform(reviews).toarray()

In [201]:
pred = logreg.predict(X)

In [197]:
pred[0:3]

array([1, 0, 1], dtype=int64)