In [145]:
# Import the libraries
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

In [146]:
def setLabel(score: float) -> str:
    """Map a rating score onto its textual rating label.

    The score is matched against consecutive bands, each one closed at its
    upper bound:

        [0, 1] -> "Terrible"
        (1, 2] -> "Poor"
        (2, 3] -> "Average"
        (3, 4] -> "VeryGood"
        (4, 5] -> "Excellent"

    Arg:
        score(float): The rating score of one record.

    Return:
        str: The label of the band that contains ``score``, or
        "No rating scores" when the score lies outside [0, 5]. NaN also
        ends up there because every comparison against NaN is false.
    """
    # Reject anything outside the rating scale first (this also catches NaN).
    if not (0.0 <= score <= 5.0):
        return "No rating scores"

    # Upper bound of each band, in ascending order; the first bound that is
    # not exceeded decides the label.
    bands = (
        (1.0, "Terrible"),
        (2.0, "Poor"),
        (3.0, "Average"),
        (4.0, "VeryGood"),
        (5.0, "Excellent"),
    )
    for upper_bound, band_label in bands:
        if score <= upper_bound:
            return band_label

    # Not reachable for ordinary numbers; kept as a defensive fallback.
    return "No rating scores"


In [234]:
def readCSV(fileName: str):
    """Read the .csv file and derive a rating label for every record.

    The file is loaded into a dataframe. The feature set is that dataframe
    without the 'UserID' column. The label of a record is obtained by
    averaging its ten category ratings and passing the average to setLabel().

    Arg:
        fileName(str): The location of the .csv file.

    Return:
        X: A dataframe with every column of the file except 'UserID'.
        labels: A series with one rating label per record.

    Raises:
        OSError: The file does not exist or cannot be opened (this includes
            FileNotFoundError). "Open file error" is printed before the
            exception is re-raised.
        KeyError: 'UserID' or one of the category columns is missing.
    """
    rating_columns = [
        'Art_galleries',
        'Dance_clubs',
        'Juice_bars',
        'Restaurants',
        'Museums',
        'Resorts',
        'Picnic_spots',
        'Beaches',
        'Theaters',
        'Religious_institutions',
    ]

    # Only the read itself is guarded: a missing column or a parsing problem
    # must not be reported as a file error, and the caller must never receive
    # an implicit None that later fails while unpacking "X, labels".
    try:
        df = pd.read_csv(fileName)
    except OSError:
        print("Open file error")
        raise

    X = df.drop(columns=['UserID'])

    # Column-by-column sum in list order; a missing value in any category
    # propagates as NaN and is labelled "No rating scores" by setLabel().
    average_rating = sum(df[column] for column in rating_columns) / len(rating_columns)
    labels = average_rating.apply(setLabel)
    return X, labels

In [300]:
# Single train/test split: fit a decision tree and a Gaussian naive Bayes
# model, then report how confident the naive Bayes predictions are.
fileName = "tripadvisor_review.csv"
X, labels = readCSV(fileName)

# Encode the string rating labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Fixed seed so that this single-split experiment is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Decision tree ---
# The tree is seeded as well, so that re-running the cell rebuilds the same tree.
dt_clf = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=42)
dt_clf.fit(X_train, y_train)
# Keep the accuracy in a name instead of computing it and discarding it.
dt_accuracy = dt_clf.score(X_test, y_test)
clf_y_pred = dt_clf.predict(X_test)
# normalize=False gives the number (not the fraction) of correct predictions.
n = accuracy_score(y_test, clf_y_pred, normalize=False)

# --- Gaussian naive Bayes ---
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_y_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_y_pred)

# Highest class probability of every test record, i.e. how sure the model is
# about the class it predicts.
nb_probs = nb_model.predict_proba(X_test).max(axis=1)
low_confidence = nb_probs < 0.9

# NOTE: c / ic count confident / unconfident predictions (probability of at
# least 0.9 versus below 0.9), not correct / incorrect ones.
c = (~low_confidence).sum()
print(c / X_test.shape[0])
ic = low_confidence.sum()
print(ic / X_test.shape[0])

0.9387755102040817
0.061224489795918366


In [301]:
# Repeated hold-out evaluation: average the accuracy and the share of
# confident predictions of both models over several random splits.
fileName = "tripadvisor_review.csv"
# The string labels are used directly; scikit-learn classifiers accept them.
X, y = readCSV(fileName)

N_RUNS = 50                  # number of independent train/test splits
CONFIDENCE_THRESHOLD = 0.9   # a prediction below this probability is "low confidence"

# NOTE: the "*_prob_score_correct/incorrect_*" lists hold the share of
# confident / unconfident predictions, not of correct / incorrect ones.
clf_avg_score_result = []
clf_prob_score_correct_result = []
clf_prob_score_incorrect_result = []
nb_avg_score_result = []
nb_prob_score_correct_result = []
nb_prob_score_incorrect_result = []

for i in range(N_RUNS):
    # Seeding with the run index gives every run a different split while
    # keeping the whole experiment reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)

    # Apply DecisionTreeClassifier
    clf_model = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=i)
    clf_model.fit(X_train, y_train)
    clf_score = clf_model.score(X_test, y_test)
    # Recorded exactly once per run.
    clf_avg_score_result.append(clf_score)
    # NOTE(review): an unpruned tree usually ends in pure leaves, so its
    # highest class probability is typically 1.0 and this split says little
    # about the tree - confirm before interpreting the two numbers.
    clf_probs = clf_model.predict_proba(X_test).max(axis=1)
    low_confidence_clf = clf_probs < CONFIDENCE_THRESHOLD
    # The mean of a boolean mask is the fraction of True entries.
    clf_prob_score_correct_result.append((~low_confidence_clf).mean())
    clf_prob_score_incorrect_result.append(low_confidence_clf.mean())

    # Apply GaussianNB
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    nb_score = nb_model.score(X_test, y_test)
    nb_avg_score_result.append(nb_score)
    nb_probs = nb_model.predict_proba(X_test).max(axis=1)
    low_confidence_nb = nb_probs < CONFIDENCE_THRESHOLD
    nb_prob_score_correct_result.append((~low_confidence_nb).mean())
    nb_prob_score_incorrect_result.append(low_confidence_nb.mean())

# Percentages, in this order: tree accuracy, tree confident share, tree
# unconfident share, then the same three values for naive Bayes.
# Scaling before rounding avoids artefacts such as 7.910200000000001.
print(np.round(np.mean(clf_avg_score_result) * 100, 4))
print(np.round(np.mean(clf_prob_score_correct_result) * 100, 4))
print(np.round(np.mean(clf_prob_score_incorrect_result) * 100, 4))
print(np.round(np.mean(nb_avg_score_result) * 100, 4))
print(np.round(np.mean(nb_prob_score_correct_result) * 100, 4))
print(np.round(np.mean(nb_prob_score_incorrect_result) * 100, 4))


97.6653
100.0
0.0
95.4939
92.0898
7.910200000000001
