In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the modelling dataset; the path is relative to the current working directory.
# NOTE(review): the saved output of this cell shows a FileNotFoundError for
# 'Hotel_Review\\model_building_dataset1.csv' — confirm where the file actually lives.
dataset_path = 'model_building_dataset1.csv'
data = pd.read_csv(dataset_path)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Hotel_Review\\model_building_dataset1.csv'

# Feature Engineering

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Chi-squared score of every feature against the target.
# Column 0 is used as the target and all remaining columns as features.
# NOTE(review): the selector is fitted on the whole dataset, before the train/test
# split further down, so held-out rows influence the scores — consider fitting on
# the training split only.
features, target = data.iloc[:, 1:], data.iloc[:, 0]
model = SelectKBest(score_func=chi2, k='all')
fit = model.fit(features, target)
scores = fit.scores_.round(3)
scores

In [None]:
# Column positions (in the full frame) of features scoring above 0.5.
# scores[i] belongs to column i + 1 because column 0 is the target.
idx_cols = [pos + 1 for pos in np.where(scores > 0.5)[0]]
idx_cols[:5]

In [None]:
# (rows, columns) of the frame before the low-scoring feature columns are dropped.
data.shape

In [None]:
# Keep the target (column 0) followed by the selected feature columns.
# NOTE(review): `data` is overwritten here, so re-running this cell applies
# `idx_cols` to the already-reduced frame; rerun from the load cell instead.
data = data.iloc[:, [0, *idx_cols]]
data.head()

In [None]:
# (rows, columns) of the frame after feature selection.
data.shape

# Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [None]:
# Column 0 is the target; every remaining column is a feature.
X, y = data.iloc[:, 1:], data.iloc[:, 0]

In [None]:
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

## 1).Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

+ Since we are going to use the One-vs-Rest (OvR) strategy, set **multi_class='ovr'**.
+ Note: the **'liblinear' solver** handles multiclass problems only through One-vs-Rest, so it is the solver paired with this setting here.

In [None]:
# One-vs-rest logistic regression with class weights balanced by class frequency.
logreg_params = {
    'class_weight': 'balanced',
    'multi_class': 'ovr',
    'solver': 'liblinear',
}
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [None]:
# Training-split scores; F1, precision and recall are weighted averages over the classes.
y_train_pred = model.predict(X_train)
print('Accuracy Score: ', round(accuracy_score(y_train, y_train_pred), 3))
for caption, scorer in (
    ('F1 Score: ', f1_score),
    ('Precision Score: ', precision_score),
    ('Recall Score: ', recall_score),
):
    print(caption, round(scorer(y_train, y_train_pred, average='weighted'), 3))

In [None]:
# Confusion matrix for the training split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
class_names = ['Negative', 'Neutral', 'Positive']
cm = confusion_matrix(y_train, y_train_pred)
# Each row of `cm` becomes one DataFrame column, so the frame is laid out as
# predictions (rows) x actuals (columns), matching the axis labels below.
dt = {name: list(cm[pos]) for pos, name in enumerate(class_names)}
cm_df = pd.DataFrame(dt, index=class_names)

label_font = {'fontsize': 18}
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f', linewidths=1)
plt.xlabel('Actuals', **label_font)
plt.ylabel('Predictions', **label_font)
plt.title('Confusion Matrix', **label_font)
plt.show()

In [None]:
# Held-out test scores; F1, precision and recall are weighted averages over the classes.
y_test_pred = model.predict(X_test)
weighted = {'average': 'weighted'}
test_scores = [
    ('Accuracy Score: ', accuracy_score(y_test, y_test_pred)),
    ('F1 Score: ', f1_score(y_test, y_test_pred, **weighted)),
    ('Precision Score: ', precision_score(y_test, y_test_pred, **weighted)),
    ('Recall Score: ', recall_score(y_test, y_test_pred, **weighted)),
]
for caption, value in test_scores:
    print(caption, round(value, 3))

In [None]:
# Per-class precision / recall / F1 on the test split, shown to three decimals.
report = classification_report(y_test, y_test_pred, digits=3)
print('Classification Report: \n', report)

In [None]:
# Confusion matrix for the held-out test split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
cm = confusion_matrix(y_test, y_test_pred)
class_names = ['Negative', 'Neutral', 'Positive']
# Rows of `cm` are turned into columns, giving predictions (rows) x actuals (columns).
dt = dict(zip(class_names, (list(cm[0]), list(cm[1]), list(cm[2]))))
cm_df = pd.DataFrame(dt, index=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f')
for set_text, text in (
    (plt.ylabel, 'Predictions'),
    (plt.xlabel, 'Actuals'),
    (plt.title, 'Confusion Matrix'),
):
    set_text(text, fontsize=18)
plt.show()

## 2).K-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline K-NN with default hyperparameters, scored by 5-fold cross-validation
# on the training split; the cell displays the mean fold score.
kfold = KFold(n_splits=5)
model = KNeighborsClassifier()
result = cross_val_score(estimator=model, X=X_train, y=y_train, cv=kfold)
result.mean()

### Hyperparameter Tuning

In [None]:
#Elbow Plot
acc = []
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train , y_train)
    y_predict = knn.predict(X_train)
    acc.append(accuracy_score(y_train , y_predict))
    
plt.plot(range(1, 11), acc)
plt.title('Elbow Method')
plt.xlabel('Number of Neighbour')
plt.ylabel('ACCURACY')
plt.show()

In [None]:
# Search space: neighbour counts 1 through 9.
n_neighbors = np.arange(1, 10)
param_grid = dict(n_neighbors=n_neighbors)
kfold = KFold()  # default fold settings

# Exhaustive grid search over the candidates, cross-validated on the training split.
model = KNeighborsClassifier()
grid = GridSearchCV(model, param_grid, cv=kfold, n_jobs=2)
grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameter setting that achieved it.
print(grid.best_score_, grid.best_params_, sep='\n')

In [None]:
# Keep the winning K-NN hyperparameters from the grid search for the final model.
para = grid.best_params_

### K-NN Model Building

In [None]:
# Refit K-NN on the whole training split with the tuned neighbour count.
best_k = para['n_neighbors']
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_train, y_train)

In [None]:
# Score the tuned K-NN model on both splits; F1, precision and recall are
# weighted averages over the classes.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

for heading, y_true, y_hat in (
    ('Training Scores:-', y_train, y_train_pred),
    ('\nTesting Scores:-', y_test, y_test_pred),
):
    print(heading)
    print('Accuracy Score: ', round(accuracy_score(y_true, y_hat), 3))
    print('F1 Score: ', round(f1_score(y_true, y_hat, average='weighted'), 3))
    print('Precision Score: ', round(precision_score(y_true, y_hat, average='weighted'), 3))
    print('Recall Score: ', round(recall_score(y_true, y_hat, average='weighted'), 3))

In [None]:
# K-NN confusion matrix on the training split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
class_names = ['Negative', 'Neutral', 'Positive']
cm = confusion_matrix(y_train, y_train_pred)
# Rows of `cm` become DataFrame columns: predictions (rows) x actuals (columns).
dt = {name: list(cm[pos]) for pos, name in enumerate(class_names)}
cm_df = pd.DataFrame(dt, index=class_names)

label_font = {'fontsize': 18}
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f', linewidths=1)
plt.xlabel('Actuals', **label_font)
plt.ylabel('Predictions', **label_font)
plt.title('Training Confusion Matrix', **label_font)
plt.show()

In [None]:
# K-NN confusion matrix on the held-out test split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
cm = confusion_matrix(y_test, y_test_pred)
class_names = ['Negative', 'Neutral', 'Positive']
# Rows of `cm` become DataFrame columns: predictions (rows) x actuals (columns).
dt = dict(zip(class_names, (list(cm[0]), list(cm[1]), list(cm[2]))))
cm_df = pd.DataFrame(dt, index=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f')
for set_text, text in (
    (plt.ylabel, 'Predictions'),
    (plt.xlabel, 'Actuals'),
    (plt.title, 'Testing Confusion Matrix'),
):
    set_text(text, fontsize=18)
plt.show()

## 3).Naive Bayes classifier 

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Baseline Multinomial Naive Bayes with default hyperparameters, scored by
# 5-fold cross-validation.
# Only the training split is used so that the held-out test rows stay unseen,
# matching how the K-NN and Decision Tree baselines are evaluated.
model = MultinomialNB()

kfold = KFold(n_splits=5)
result = cross_val_score(model, X_train, y_train, cv=kfold)
result.mean()

### Hyperparameter Tuning

In [None]:
# Search space: smoothing parameter alpha from 0.1 up to about 1.0 in steps of 0.1.
alpha = np.arange(0.1, 1.1, 0.1)
param_grid = dict(alpha=alpha)
kfold = KFold()  # default fold settings

# Exhaustive grid search over alpha, cross-validated on the training split.
model = MultinomialNB()
grid = GridSearchCV(model, param_grid, cv=kfold, n_jobs=2)
grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score, then the alpha that achieved it.
for best in (grid.best_score_, grid.best_params_):
    print(best)

In [None]:
# Keep the best alpha found by the MultinomialNB grid search.
# `grid` is the search fitted in this section; `gcv` is only created later, in
# the Decision Tree section, so reading it here fails on a fresh top-to-bottom run.
para = grid.best_params_

### Naive Bayes Model Building

In [None]:
# Refit Multinomial Naive Bayes on the whole training split with the tuned alpha.
best_alpha = para['alpha']
model = MultinomialNB(alpha=best_alpha)
model.fit(X_train, y_train)

In [None]:
# Score the tuned Naive Bayes model on both splits; F1, precision and recall
# are weighted averages over the classes.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

weighted_scorers = (
    ('F1 Score: ', f1_score),
    ('Precision Score: ', precision_score),
    ('Recall Score: ', recall_score),
)
for heading, y_true, y_hat in (
    ('Training Scores:-', y_train, y_train_pred),
    ('\nTesting Scores:-', y_test, y_test_pred),
):
    print(heading)
    print('Accuracy Score: ', round(accuracy_score(y_true, y_hat), 3))
    for caption, scorer in weighted_scorers:
        print(caption, round(scorer(y_true, y_hat, average='weighted'), 3))

In [None]:
# Naive Bayes confusion matrix on the training split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
class_names = ['Negative', 'Neutral', 'Positive']
cm = confusion_matrix(y_train, y_train_pred)
# Rows of `cm` become DataFrame columns: predictions (rows) x actuals (columns).
dt = {name: list(cm[pos]) for pos, name in enumerate(class_names)}
cm_df = pd.DataFrame(dt, index=class_names)

label_font = {'fontsize': 18}
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f', linewidths=1)
plt.xlabel('Actuals', **label_font)
plt.ylabel('Predictions', **label_font)
plt.title('Training Confusion Matrix', **label_font)
plt.show()

In [None]:
# Naive Bayes confusion matrix on the held-out test split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
cm = confusion_matrix(y_test, y_test_pred)
class_names = ['Negative', 'Neutral', 'Positive']
# Rows of `cm` become DataFrame columns: predictions (rows) x actuals (columns).
dt = dict(zip(class_names, (list(cm[0]), list(cm[1]), list(cm[2]))))
cm_df = pd.DataFrame(dt, index=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f')
for set_text, text in (
    (plt.ylabel, 'Predictions'),
    (plt.xlabel, 'Actuals'),
    (plt.title, 'Testing Confusion Matrix'),
):
    set_text(text, fontsize=18)
plt.show()

## 4). Decision Tree

In [None]:
from sklearn.tree import  DecisionTreeClassifier

In [None]:
# Baseline decision tree with default hyperparameters, scored by 5-fold
# cross-validation on the training split.
# The seed is fixed (same value as the train/test split) so the score is
# repeatable across kernel restarts.
model = DecisionTreeClassifier(random_state=1)

kfold = KFold(n_splits=5)
result = cross_val_score(model, X_train, y_train, cv=kfold)
result.mean()

### Hyperparameter Tuning

In [None]:
# Search space: split criterion, maximum depth and minimum samples per split.
params = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 4, 6, 8, 10, 12],
    'min_samples_split': [2, 3, 4],
}

# Seed the estimator so the selected parameters are repeatable between runs.
model_test = DecisionTreeClassifier(random_state=1)
gcv = GridSearchCV(estimator=model_test, param_grid=params)
gcv.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the tree settings that achieved it.
print(gcv.best_score_, gcv.best_params_, sep='\n')

In [None]:
# Keep the winning decision-tree hyperparameters from the grid search for the final model.
para = gcv.best_params_

### Decision Tree Model Building

In [None]:
# Refit the decision tree on the whole training split with the tuned settings.
# The seed is fixed (same value as the train/test split) so the fitted tree,
# and therefore the scores reported after it, are repeatable.
model = DecisionTreeClassifier(
    criterion=para['criterion'],
    max_depth=para['max_depth'],
    min_samples_split=para['min_samples_split'],
    random_state=1,
)

model.fit(X_train, y_train)

In [None]:
# Score the tuned decision tree on both splits; F1, precision and recall are
# weighted averages over the classes.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

weighted = {'average': 'weighted'}
for heading, y_true, y_hat in (
    ('Training Scores:-', y_train, y_train_pred),
    ('\nTesting Scores:-', y_test, y_test_pred),
):
    print(heading)
    print('Accuracy Score: ', round(accuracy_score(y_true, y_hat), 3))
    print('F1 Score: ', round(f1_score(y_true, y_hat, **weighted), 3))
    print('Precision Score: ', round(precision_score(y_true, y_hat, **weighted), 3))
    print('Recall Score: ', round(recall_score(y_true, y_hat, **weighted), 3))

In [None]:
# Decision-tree confusion matrix on the training split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
class_names = ['Negative', 'Neutral', 'Positive']
cm = confusion_matrix(y_train, y_train_pred)
# Rows of `cm` become DataFrame columns: predictions (rows) x actuals (columns).
dt = {name: list(cm[pos]) for pos, name in enumerate(class_names)}
cm_df = pd.DataFrame(dt, index=class_names)

label_font = {'fontsize': 18}
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f', linewidths=1)
plt.xlabel('Actuals', **label_font)
plt.ylabel('Predictions', **label_font)
plt.title('Training Confusion Matrix', **label_font)
plt.show()

In [None]:
# Decision-tree confusion matrix on the held-out test split.
# NOTE(review): assumes the sorted class labels map to Negative, Neutral, Positive
# in that order — verify against the label encoding.
cm = confusion_matrix(y_test, y_test_pred)
class_names = ['Negative', 'Neutral', 'Positive']
# Rows of `cm` become DataFrame columns: predictions (rows) x actuals (columns).
dt = dict(zip(class_names, (list(cm[0]), list(cm[1]), list(cm[2]))))
cm_df = pd.DataFrame(dt, index=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='.0f')
for set_text, text in (
    (plt.ylabel, 'Predictions'),
    (plt.xlabel, 'Actuals'),
    (plt.title, 'Testing Confusion Matrix'),
):
    set_text(text, fontsize=18)
plt.show()