# Cindy Ho 127008544 User Feedback Prediction Algorithm

In [14]:
import csv #library for importing csv files
import os #needed for reading the credentials path from the environment

import firebase_admin #needed for connecting to database
from firebase_admin import credentials #needed for certifying password to database
from firebase_admin import firestore #needed for admin access to database
import pandas as pd #for pandas dataframe - structure of organizing data
from sklearn.model_selection import train_test_split #used to split dataset into training and testing dataset
import numpy as np 
from sklearn import neighbors #import library for KNN
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score
#imports tools needed to verify results of models
from skl2onnx import convert_sklearn


# Connection to Firebase Database

In [15]:
#the purpose of this is to gain access to the database
#path of the admin password key; set the FIREBASE_CREDENTIALS environment variable
#to use a key stored somewhere else, otherwise the original location is used
cred_path = os.environ.get(
    "FIREBASE_CREDENTIALS",
    r"C:\Users\cindy\OneDrive\Documents\403\perfectpourovercoffee-3c0ce-firebase-adminsdk-h2jgk-1590472422.json",
)

try:
    #reuses the default app when an earlier run of this cell already created it,
    #which prevents initializing twice with the same password key
    default_app = firebase_admin.get_app()
except ValueError:
    #get_app raises ValueError when no default app exists yet, so it is created here
    #saves to cred variable of admin password key
    cred = credentials.Certificate(cred_path)

    #calls upon firebase admin library and inserts password key to gain access to private database
    default_app = firebase_admin.initialize_app(cred)

db = firestore.client() #saves db as the database


# Collects user dataset from CSV file and writes to database

In [118]:
#read from my csv user feedback file and then write it to the database
#NOTE(review): every run of this cell uploads the whole file again as new documents - confirm before re-running

#document field names, listed in the same order as the csv columns
field_names = [
    'user_id', #first value: user id
    'cup_size', #second value: cup size
    'roast_type', #third value: roast type
    'bean_type', #fourth value: coffee bean type
    'strength', #fifth value: user preferred strength
    'rating', #sixth value: user rating
    'temperature', #seventh value: target temperature
    'grind_size', #eighth value: recommended grind size
    'target_saturation', #ninth value: target water volume
]

rows = [] #list of every csv row that was uploaded
#newline='' lets the csv module handle line endings inside quoted fields itself
with open(r'C:\Users\cindy\OneDrive\Documents\403\UserFeedbackS1.csv', 'r', newline='') as file: #opens csv input
    csv_reader = csv.reader(file, delimiter=',') #reads from csv file
    for line_number, row in enumerate(csv_reader, start=1): #iterates through csv
        if len(row) < len(field_names):
            #blank or truncated lines used to raise IndexError in the middle of the upload
            print(f'skipping line {line_number}: expected {len(field_names)} values, got {len(row)}')
            continue
        rows.append(row) #appends to list while it iterates through csv
        #pairs each field name with its column; zip stops after the ninth value
        doc_ref = dict(zip(field_names, row)) #generates brew data
        db.collection('trainingandvalidating4').add(doc_ref) #all these datapoints are saved as one collection / folder

# K-Nearest Neighbors Model Train & Validation

In [None]:
#NOTE(review): the upload cell writes to 'trainingandvalidating4' while this cell reads
#'trainingandvalidating' - confirm which collection is intended
docs = db.collection('trainingandvalidating').stream() #reads from all documents in a collection of the database

records = [] #one dictionary per document
for doc in docs: #iterates through the document stream
    record = doc.to_dict()
    print(f'{doc.id} => {record}') #shows programmer what is being saved
    records.append(record)

#this creates the pandas dataframe containing all of the user feedback from the database
#it is built once from the list because DataFrame.append copied the whole frame on
#every call and no longer exists in pandas 2.0
data = pd.DataFrame(records)

# Clean Data

In [124]:
#this replaces the string values of the categorical columns with float codes
category_codes = {
    #coffee bean type
    'bean_type': {'robusta': 1.0, 'arabica': 2.0, 'liberica': 3.0, 'excelsa': 4.0},
    #grind size
    'grind_size': {'medium': 0.0, 'larger': 1.0, 'smaller': 2.0},
    #roast type
    'roast_type': {'mild': 0.0, 'medium': 1.0, 'mediumdark': 2.0, 'dark': 3.0},
}
for column, codes in category_codes.items():
    #the whole column is reassigned instead of using data[column].loc[...] = value,
    #because that chained form may write to a temporary copy and leave data unchanged
    #values that are not in the mapping are kept as they are
    data[column] = data[column].map(lambda value, codes=codes: codes.get(value, value))

#converts all strings/objects into floats; values that are not numbers become NaN
data = data.apply(pd.to_numeric, errors='coerce')

#deletes every row that has a null response in any of these columns
required_columns = [
    'user_id',
    'cup_size',
    'roast_type',
    'bean_type',
    'strength',
    'rating',
    'temperature',
    'grind_size',
    'target_saturation',
]
data = data.dropna(subset=required_columns)

#checking that the dataframe has no null responses left in the required columns
data.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1895 entries, 1 to 2219
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   target_saturation  1895 non-null   float64
 1   rating             1895 non-null   float64
 2   cup_size           1895 non-null   float64
 3   grind_size         1895 non-null   float64
 4   strength           1895 non-null   float64
 5   bean_type          1895 non-null   float64
 6   temperature        1895 non-null   float64
 7   roast_type         1895 non-null   float64
 8   user_id            1895 non-null   float64
 9   water_volume       0 non-null      float64
dtypes: float64(10)
memory usage: 162.9 KB


# KNN Model for Strength

In [125]:
#first holding out 20% of the rows for testing, which leaves 80% for training & validation
#the target is taken as a 1-D array (data['strength']) because passing a column vector
#makes fit() emit a DataConversionWarning
x_train, x_test, y_train, y_test = train_test_split(data[['roast_type','bean_type','rating']].values, data['strength'].values, test_size=0.2, random_state=42) 

#then splitting the 80% into 60% training & 20% validation (0.25 of 80% is 20% of all rows)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.25, random_state=42) 

knn_strength = neighbors.KNeighborsClassifier(n_neighbors=9) #set K Nearest Neighbors model as variable 
knn_strength.fit(x_train, y_train) #fitting training features and training target
y_pred = knn_strength.predict(x_validate) #model predicts the target for the validation features

#outputs accuracy score of the true validation targets against the predicted targets
#accuracy score: fraction of validation rows whose predicted class equals the true class
print('accuracy =', accuracy_score(y_validate, y_pred))

#outputs confusion matrix of the true validation targets against the predicted targets
print('confusion matrix')
print(confusion_matrix(y_validate, y_pred))



accuracy = 0.6965699208443272
confusion matrix
[[ 58  38  17]
 [ 11 131   7]
 [ 13  29  75]]


  return self._fit(X, y)


# KNN Model for Temperature

In [126]:
#first holding out 20% of the rows for testing, which leaves 80% for training & validation
#the target is taken as a 1-D array (data['temperature']) because passing a column vector
#makes fit() emit a DataConversionWarning
x_train, x_test, y_train, y_test = train_test_split(data[['roast_type','bean_type','rating','strength']].values, data['temperature'].values, test_size=0.2, random_state=42) 

#then splitting the 80% into 60% training & 20% validation (0.25 of 80% is 20% of all rows)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.25, random_state=42) 

knn_temp = neighbors.KNeighborsClassifier(n_neighbors=9) #set K Nearest Neighbors model as variable 
knn_temp.fit(x_train, y_train) #fitting training features and training target
y_pred = knn_temp.predict(x_validate) #model predicts the target for the validation features

#outputs accuracy score of the true validation targets against the predicted targets
#accuracy score: fraction of validation rows whose predicted class equals the true class
print('accuracy =', accuracy_score(y_validate, y_pred))

#outputs confusion matrix of the true validation targets against the predicted targets
print('confusion matrix')
print(confusion_matrix(y_validate, y_pred))


accuracy = 0.7044854881266491
confusion matrix
[[11  0  0  0  0  0  0  0  1  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  5  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  0 13  0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  2  0  3 24  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  0  0  0  0 20  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  0  0  0  9  0 11  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0
   0  0  0  0  0]
 [ 0  1  0  0  0  0  4  3 24  0  5  0  0  0  0  5  0  0  0  0  0  0  0  0
   0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0 10  0  0  0  0  0  0  1  0  0  3  0  0  0  0
   0  0  0  0  0]
 [ 0  0  0  0  4  0  0  2  3  0 2

  return self._fit(X, y)


# KNN Model for Water Volume

In [127]:
#first holding out 20% of the rows for testing, which leaves 80% for training & validation
#the target is taken as a 1-D array (data['target_saturation']) because passing a column
#vector makes fit() emit a DataConversionWarning
x_train, x_test, y_train, y_test = train_test_split(data[['roast_type','bean_type','rating', 'strength']].values, data['target_saturation'].values, test_size=0.2, random_state=42) 

#then splitting the 80% into 60% training & 20% validation (0.25 of 80% is 20% of all rows)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.25, random_state=42) 

knn_sat = neighbors.KNeighborsClassifier(n_neighbors=9) #set K Nearest Neighbors model as variable 
knn_sat.fit(x_train, y_train) #fitting training features and training target
y_pred = knn_sat.predict(x_validate) #model predicts the target for the validation features

#outputs accuracy score of the true validation targets against the predicted targets
#accuracy score: fraction of validation rows whose predicted class equals the true class
print('accuracy =', accuracy_score(y_validate, y_pred))

#outputs confusion matrix of the true validation targets against the predicted targets
print('confusion matrix')
print(confusion_matrix(y_validate, y_pred))


accuracy = 0.7308707124010554
confusion matrix
[[ 2  0  0  1  0  0  0  0  4  0  0  0]
 [ 0 12  0  3  0  5  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  3  0 24  0  1  0  0  3  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0 60  4  0  5  0  0  0]
 [ 0  7  0  2  0  6 99  0  5  0  0  0]
 [ 0  0  0  0  0  1  1  0  0  0  0  0]
 [ 0  3  0  3  0  4  1  0 80  0  0  0]
 [ 0  0  0  0  0  7  1  0  1  0  0  0]
 [ 0  0  0  0  0  1 12  0  0  0  0  0]
 [ 0  0  0  1  0  0  1  0 14  0  0  0]]


  return self._fit(X, y)


# KNN Model for Grind Size

In [128]:
#first holding out 20% of the rows for testing, which leaves 80% for training & validation
#the target is taken as a 1-D array (data['grind_size']) because passing a column vector
#makes fit() emit a DataConversionWarning
x_train, x_test, y_train, y_test = train_test_split(data[['roast_type','bean_type','rating','strength']].values, data['grind_size'].values, test_size=0.2, random_state=42) 

#then splitting the 80% into 60% training & 20% validation (0.25 of 80% is 20% of all rows)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.25, random_state=42) 

knn_grind = neighbors.KNeighborsClassifier(n_neighbors=9) #set K Nearest Neighbors model as variable 
knn_grind.fit(x_train, y_train) #fitting training features and training target
y_pred = knn_grind.predict(x_validate) #model predicts the target for the validation features

#outputs accuracy score of the true validation targets against the predicted targets
#accuracy score: fraction of validation rows whose predicted class equals the true class
print('accuracy =', accuracy_score(y_validate, y_pred))

#outputs confusion matrix of the true validation targets against the predicted targets
print('confusion matrix')
print(confusion_matrix(y_validate, y_pred))


accuracy = 0.8073878627968337
confusion matrix
[[ 85   6  14]
 [ 16 110   4]
 [ 18  15 111]]


  return self._fit(X, y)


# KNN Model for Rating

In [129]:
#first holding out 20% of the rows for testing, which leaves 80% for training & validation
#the target is taken as a 1-D array (data['rating']) because passing a column vector
#makes fit() emit a DataConversionWarning
x_train, x_test, y_train, y_test = train_test_split(data[['roast_type','bean_type','temperature','strength']].values, data['rating'].values, test_size=0.2, random_state=42) 

#then splitting the 80% into 60% training & 20% validation (0.25 of 80% is 20% of all rows)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.25, random_state=42) 

knn_rating = neighbors.KNeighborsClassifier(n_neighbors=9) #set K Nearest Neighbors model as variable 
knn_rating.fit(x_train, y_train) #fitting training features and training target
y_pred = knn_rating.predict(x_validate) #model predicts the target for the validation features

#outputs accuracy score of the true validation targets against the predicted targets
#accuracy score: fraction of validation rows whose predicted class equals the true class
print('accuracy =', accuracy_score(y_validate, y_pred))

#outputs confusion matrix of the true validation targets against the predicted targets
print('confusion matrix')
print(confusion_matrix(y_validate, y_pred))


accuracy = 0.7598944591029023
confusion matrix
[[17  0  0  0  0  0  4  4  0  4]
 [ 0 21  4  0  0  0  0  0  0  5]
 [ 0  0 22  1  0  0  2  0  1  0]
 [ 0  1  0 36  0  0  0  0  1  0]
 [ 0  0  0  0 27  3  5  0  3  1]
 [ 0  0 13  0  0 27  4  0  1  0]
 [ 0  0  1  0  0  2 38  0  0  0]
 [ 0  1  0  3  1  0  1 45  6  0]
 [ 0  2  3  5  0  0  0  7 38  1]
 [ 0  0  0  0  0  0  0  1  0 17]]


  return self._fit(X, y)
