In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, roc_curve, auc, precision_recall_curve



import os
# Collect the full path of every file under the Kaggle input directory,
# then print them one per line so the dataset locations are visible.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)




In [None]:
# Load the coffee-maker review dataset and preview the first rows.
DATA_PATH = '/kaggle/input/-coffeemakerclassification/coffee_maker.csv'
all_data = pd.read_csv(DATA_PATH)
all_data.head()

In [None]:
# Create a binary label column from the product ratings.
# Ratings 1-3 -> negative sentiment (label 0). Ratings 4-5 -> positive sentiment (label 1).
# NOTE(review): negative sentiment is stated as the outcome of interest, yet it is
# encoded as 0. Metrics that default to the positive class (label 1) therefore
# describe positive reviews -- confirm this is intended.

# Vectorised comparison instead of a row-wise apply: one pass over the column,
# and a missing rating still compares False and is labelled 0, exactly as before.
all_data['label'] = (all_data['rating'] > 3.0).astype(int)
all_data.head()

In [None]:
# Check how the two label classes are distributed.

fig, ax = plt.subplots()
ax.hist(all_data['label'])
ax.set(title='Frequency Distribution of label', ylabel='Frequency', xlabel='label')
plt.show()

In [None]:
# Normalise the review text: cast every entry to str, then lower-case it
# through the pandas string accessor.
review_text = all_data['review'].astype(str)
all_data['review'] = review_text.str.lower()
all_data.head()

**Train/Test Split**

In [None]:
# Features are the review texts; the target is the binary label column.
X, y = all_data['review'], all_data['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=14,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

**Logistic Regression Model**

In [None]:
# Logistic regression on TF-IDF features of the review text.
lr = LogisticRegression()

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('Logistic Regression', lr),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
pred_prob = pipe.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
# The PR curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point and make the resulting area meaningless.
precision, recall, thresholds_pr = precision_recall_curve(y_test, pred_prob)

print("_________Logistic Regression Model_________")
print(classification_report(y_test, pred))
print('ROC AUC: ', auc(fpr, tpr))
# auc(x, y) integrates y over x and requires monotonic x, so recall is the
# x-axis and precision the y-axis.
print('Precision/Recall AUC: ', auc(recall, precision))

**SVM**

In [None]:
# Support vector classifier on TF-IDF features of the review text.
# probability=True enables predict_proba; the calibration behind it shuffles
# the data internally, so a fixed seed keeps the scores repeatable.
model = svm.SVC(probability=True, random_state=14)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('SVM', model),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
pred_prob = pipe.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
# The PR curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point and make the resulting area meaningless.
precision, recall, thresholds_pr = precision_recall_curve(y_test, pred_prob)

print("_________SVM_________")
print(classification_report(y_test, pred))
print('ROC AUC: ', auc(fpr, tpr))
# auc(x, y) integrates y over x and requires monotonic x, so recall is the
# x-axis and precision the y-axis.
print('Precision/Recall AUC: ', auc(recall, precision))

**Decision Tree Classifier**

In [None]:
# Shallow decision tree on TF-IDF features of the review text.
# A fixed seed makes tie-breaking between equally good splits repeatable.
dt = DecisionTreeClassifier(max_depth=5, random_state=14)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('Decision Tree Classifier', dt),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
pred_prob = pipe.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
# The PR curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point and make the resulting area meaningless.
precision, recall, thresholds_pr = precision_recall_curve(y_test, pred_prob)

print("_________Decision Tree Classifier_________")
print(classification_report(y_test, pred))
print('ROC AUC: ', auc(fpr, tpr))
# auc(x, y) integrates y over x and requires monotonic x, so recall is the
# x-axis and precision the y-axis.
print('Precision/Recall AUC: ', auc(recall, precision))

**Random Forest Classifier**

In [None]:
# Small random forest (10 trees, depth 5) on TF-IDF features of the review text.
# Bootstrapping and feature sampling are random, so the seed is fixed to make
# the reported metrics repeatable across runs.
rf = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=14)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('Random Forest Classifier', rf),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
pred_prob = pipe.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
# The PR curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point and make the resulting area meaningless.
precision, recall, thresholds_pr = precision_recall_curve(y_test, pred_prob)

print("_________Random Forest Classifier_________")
print(classification_report(y_test, pred))
print('ROC AUC: ', auc(fpr, tpr))
# auc(x, y) integrates y over x and requires monotonic x, so recall is the
# x-axis and precision the y-axis.
print('Precision/Recall AUC: ', auc(recall, precision))

**ANN**

In [None]:
# Multi-layer perceptron on TF-IDF features of the review text.
# Weight initialisation and batch shuffling are random, so the seed is fixed
# to make the reported metrics repeatable across runs.
mlp = MLPClassifier(alpha=1, max_iter=1000, random_state=14)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ANN', mlp),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
pred_prob = pipe.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
# The PR curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point and make the resulting area meaningless.
precision, recall, thresholds_pr = precision_recall_curve(y_test, pred_prob)

print("_________ ANN _________")
print(classification_report(y_test, pred))
print('ROC AUC: ', auc(fpr, tpr))
# auc(x, y) integrates y over x and requires monotonic x, so recall is the
# x-axis and precision the y-axis.
print('Precision/Recall AUC: ', auc(recall, precision))

**Naive Bayes**

In [None]:
# Multinomial naive Bayes on TF-IDF features of the review text.
nb = MultinomialNB()
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('Multinomial Naive Bayes', nb),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
pred_prob = pipe.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
# The PR curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point and make the resulting area meaningless.
precision, recall, thresholds_pr = precision_recall_curve(y_test, pred_prob)

print("_________Multinomial Naive Bayes_________")
print(classification_report(y_test, pred))
print('ROC AUC: ', auc(fpr, tpr))
# auc(x, y) integrates y over x and requires monotonic x, so recall is the
# x-axis and precision the y-axis.
print('Precision/Recall AUC: ', auc(recall, precision))

# Choice of Model 

Out of all the models, the Logistic Regression model performs the best. Its precision, recall, F1-score, accuracy, and ROC AUC are all good.
In case of Naive Bayes, the recall is low.
In the case of the ANN, none of the metrics are low, but since Logistic Regression gives similar scores, it is better to choose the Logistic Regression model: it takes less time to train and is easier to interpret and implement.
The precision and recall are very low in the case of the Random Forest Classifier, so it is not a good choice.
In the case of the Decision Tree Classifier, the ROC AUC, precision, and recall are low, so this model should not be used.

In [None]:
               ,
               