Skip to content

Commit 8ff7d0f

Browse files
Deepak HonakeriDeepak Honakeri
Deepak Honakeri
authored and
Deepak Honakeri
committed
issue #57 : Added NLP algorithm in Python and R
1 parent d34a487 commit 8ff7d0f

File tree

3 files changed

+1122
-0
lines changed

3 files changed

+1122
-0
lines changed

Machine_Learning/src/NLP/NLP.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""
2+
Created on Fri Apr 7 12:43:06 2017
3+
4+
@author: Robert
5+
"""
6+
7+
import numpy as np
8+
import pandas as pd
9+
import matplotlib.pyplot as plt
10+
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later dropped the old aliases, so prefer the new name when it is installed
# and fall back to the historical one on older releases.
_style = ('seaborn-v0_8-deep'
          if 'seaborn-v0_8-deep' in plt.style.available
          else 'seaborn-deep')
plt.style.use(_style)
11+
12+
# Reading in data
13+
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters are read as ordinary text instead of field delimiters.
ds = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
14+
15+
# Cleaning the text
16+
import re
17+
from nltk.stem.porter import PorterStemmer
18+
from nltk.corpus import stopwords
19+
20+
corpus = []

# Build the cleaned corpus, one string per review:
#   1. replace every non-alphabetic character with a space,
#   2. lower-case the text,
#   3. drop English stopwords,
#   4. stem the remaining words and join them back with single spaces.
#
# The stemmer and the stopword set do not change between reviews, so they are
# created once here instead of once per row / once per word.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

for text in ds['Review']:
    # Lower-case before the stopword test: the stopword list is lowercase, so
    # without this step capitalised stopwords ("The", "This") would be kept.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    review = ' '.join(ps.stem(w) for w in words if w not in english_stopwords)
    corpus.append(review)
32+
33+
# Bag of words
34+
from sklearn.feature_extraction.text import CountVectorizer
35+
36+
# Bag-of-words features: count tokens in the cleaned corpus, capped at 1500
# vocabulary entries, then convert the result to a dense array.
cv = CountVectorizer(max_features=1500, lowercase=True)
X = cv.fit_transform(corpus)
X = X.toarray()

# Target vector taken from the 'Liked' column.
y = ds['Liked'].values
41+
42+
# Classifying
43+
from sklearn.model_selection import train_test_split
44+
from sklearn.metrics import confusion_matrix, accuracy_score
45+
from sklearn.ensemble import RandomForestClassifier
46+
47+
# Hold out 20% of the samples for evaluation (random split, no seed is set).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Baseline model: a 100-tree random forest using the entropy criterion.
clf = RandomForestClassifier(criterion="entropy", n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the baseline on the held-out split.
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
54+
55+
56+
# Hadelin's challenge
57+
58+
from sklearn.naive_bayes import GaussianNB
59+
from sklearn.neighbors import KNeighborsClassifier
60+
from sklearn.svm import SVC
61+
from sklearn.linear_model import LogisticRegression
62+
63+
# (display name, unfitted estimator) pairs compared in the loop below.
models = [
    ('Logistic regression', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('Random forest', RandomForestClassifier(n_estimators=100,
                                             criterion='entropy')),
    ('NB', GaussianNB()),
    ('KernelSVM', SVC(kernel='rbf')),
]
70+
71+
# Fit every candidate on the training split and report accuracy, precision,
# recall and F1 on the held-out split.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # confusion_matrix puts true labels on rows and predictions on columns,
    # so cm[0][1] counts false positives and cm[1][0] counts false negatives.
    cm = confusion_matrix(y_test, y_pred)
    TP, TN, FP, FN = cm[1][1], cm[0][0], cm[0][1], cm[1][0]

    # A model that predicts no positives (or a split with no positives) makes
    # a denominator zero; report 0.0 instead of dividing by zero.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0

    print("---------- model: " + name + '-------------------')
    print("Accuracy: " + str(accuracy_score(y_test, y_pred)))
    print("Precision: " + str(round(precision, 2)))
    print("Recall: " + str(round(recall, 2)))
    print("F1: " + str(round(F1, 2)))

0 commit comments

Comments
 (0)