"""
Created on Fri Apr 7 12:43:06 2017

@author: Robert
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')

# Reading in the data; quoting = 3 (csv.QUOTE_NONE) treats double quotes inside reviews as plain text
ds = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)
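
# Quick look at the data (optional sanity check): the file is expected to hold
# a 'Review' text column and a binary 'Liked' label column.
print(ds.shape)
print(ds.head())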

# Cleaning the text
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# If the stopword list is not installed yet, run once:
#   import nltk; nltk.download('stopwords')

corpus = []

# Build the stemmer and the stopword set once, outside the loop.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Remove non-alphabetic characters, make everything lower case,
# remove stopwords, stem, and append to corpus.
for i in range(len(ds)):

    review = re.sub('[^a-zA-Z]', ' ', ds['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(w) for w in review if w not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

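# Optional sanity check: compare one raw review with its cleaned version.
# Index 0 is just an arbitrary example row.
print(ds['Review'][0])
print(corpus[0])
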
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the 1500 most frequent terms to limit the size of the feature matrix.
cv = CountVectorizer(lowercase=True,
                     max_features=1500)

X = cv.fit_transform(corpus).toarray()
y = ds['Liked'].values

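# Optional: inspect what the bag-of-words step produced. X should have shape
# (n_reviews, 1500) and the columns correspond to the kept vocabulary terms.
# Note: get_feature_names_out() exists in scikit-learn >= 1.0; older versions
# use cv.get_feature_names() instead.
print(X.shape)
print(cv.get_feature_names_out()[:20])
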
# Classifying
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_estimators=100, criterion='entropy')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
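
# Print the baseline random forest results so this step produces output too.
print("Random forest baseline confusion matrix:")
print(cm)
print("Accuracy: " + str(round(acc, 2)))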


# Hadelin's challenge: compare several classifiers on the same train/test split.

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

models = []
models.append(('Logistic regression', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Random forest', RandomForestClassifier(n_estimators=100,
                                                        criterion='entropy')))
models.append(('NB', GaussianNB()))
models.append(('KernelSVM', SVC(kernel='rbf')))

# Fit each model and report accuracy, precision, recall and F1 on the test set.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    print("---------- model: " + name + " -------------------")
    print("Accuracy: " + str(accuracy_score(y_test, y_pred)))
    # sklearn's confusion matrix has true labels on rows and predictions on
    # columns, so cm[0][0] = TN, cm[0][1] = FP, cm[1][0] = FN, cm[1][1] = TP.
    TP, TN, FP, FN = cm[1][1], cm[0][0], cm[0][1], cm[1][0]
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2 * precision * recall / (precision + recall)
    print("Precision: " + str(round(precision, 2)))
    print("Recall: " + str(round(recall, 2)))
    print("F1: " + str(round(F1, 2)))
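

# Optional visual comparison, using the matplotlib import from the top of the
# script. The models above are already fitted, so their test accuracy can be
# recomputed here and shown as a simple bar chart.
names = [name for name, model in models]
scores = [accuracy_score(y_test, model.predict(X_test)) for name, model in models]

plt.figure(figsize=(8, 4))
plt.bar(range(len(names)), scores)
plt.xticks(range(len(names)), names, rotation=20)
plt.ylabel('Test accuracy')
plt.title('Classifier comparison on the review test set')
plt.show()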