# EVWD Daily Water Consumption Data K-Nearest-Neighbors Model 

### Variables Being Tested:  
Consumption Bin Size: 40 

In [128]:
# Import libraries
import numpy as np      
import pandas as pd    
from IPython.display import display
from sklearn                        import metrics, svm
from sklearn.linear_model           import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
import pickle

In [129]:
# Read in the data 
filename = '../CleanedData/AllData10000Bins.csv'
all_data = pd.read_csv(filename)      
print(f"{filename} : file read into a pandas dataframe.")

../CleanedData/AllData10000Bins.csv : file read into a pandas dataframe.


In [130]:
# Look at the data 
display(all_data)
all_data.info()

Unnamed: 0,Holiday,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),DayOfWeek,ZoneNum,Consumption
0,0,0.19,0.00,470,13.2,93.5,60.6,76.0,76,21,43,52.0,3.1,75.2,73.2,0,0,0
1,0,0.20,0.00,478,12.8,92.5,59.2,75.8,75,21,42,51.1,3.3,79.4,73.2,1,0,0
2,0,0.19,0.00,463,12.7,92.2,61.0,76.1,67,22,41,50.9,3.0,73.1,73.0,2,0,0
3,0,0.20,0.00,465,11.4,94.4,61.2,76.7,61,16,36,47.9,3.1,73.4,73.0,3,0,0
4,1,0.19,0.00,479,11.9,90.7,58.3,74.1,65,20,41,49.0,3.0,73.1,72.1,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6283,0,0.06,0.07,265,6.1,50.6,34.9,42.9,90,37,65,31.8,3.8,92.0,52.7,2,11,2031
6284,0,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,3,11,999
6285,0,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,4,11,66
6286,0,0.14,0.00,438,3.5,68.2,34.4,51.6,54,13,27,18.5,3.2,76.5,51.6,5,11,1044


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6288 entries, 0 to 6287
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Holiday               6288 non-null   int64  
 1   ETo (in)              6288 non-null   float64
 2   Precip (in)           6288 non-null   float64
 3   Sol Rad (Ly/day)      6288 non-null   int64  
 4   Avg Vap Pres (mBars)  6288 non-null   float64
 5   Max Air Temp (F)      6288 non-null   float64
 6   Min Air Temp (F)      6288 non-null   float64
 7   Avg Air Temp (F)      6288 non-null   float64
 8   Max Rel Hum (%)       6288 non-null   int64  
 9   Min Rel Hum (%)       6288 non-null   int64  
 10  Avg Rel Hum (%)       6288 non-null   int64  
 11  Dew Point (F)         6288 non-null   float64
 12  Avg Wind Speed (mph)  6288 non-null   float64
 13  Wind Run (miles)      6288 non-null   float64
 14  Avg Soil Temp (F)     6288 non-null   float64
 15  DayOfWeek            

In [131]:
# Save the list of columns in a variable
COLUMNS = all_data.columns 

# Create a dictionary to look up any column index by name
COL_INDEX = {}
for i, name in enumerate(COLUMNS):
    COL_INDEX[name] = i  # using the name (as key), look up the value (i)
print(f"COL_INDEX is {COL_INDEX}\n\n")


COL_INDEX is {'Holiday': 0, 'ETo (in)': 1, 'Precip (in)': 2, 'Sol Rad (Ly/day)': 3, 'Avg Vap Pres (mBars)': 4, 'Max Air Temp (F)': 5, 'Min Air Temp (F)': 6, 'Avg Air Temp (F)': 7, 'Max Rel Hum (%)': 8, 'Min Rel Hum (%)': 9, 'Avg Rel Hum (%)': 10, 'Dew Point (F)': 11, 'Avg Wind Speed (mph)': 12, 'Wind Run (miles)': 13, 'Avg Soil Temp (F)': 14, 'DayOfWeek': 15, 'ZoneNum': 16, 'Consumption': 17}




In [132]:
# convert our dataframe to a numpy array, named A
A = all_data.to_numpy()   
print(A)

[[0.000e+00 1.900e-01 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 2.000e-01 0.000e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 1.900e-01 0.000e+00 ... 2.000e+00 0.000e+00 0.000e+00]
 ...
 [0.000e+00 1.300e-01 1.000e-02 ... 4.000e+00 1.100e+01 6.600e+01]
 [0.000e+00 1.400e-01 0.000e+00 ... 5.000e+00 1.100e+01 1.044e+03]
 [0.000e+00 1.500e-01 0.000e+00 ... 6.000e+00 1.100e+01 3.800e+01]]


In [133]:
# Convert all data to floats
A = A.astype('float64')  
print(A)

[[0.000e+00 1.900e-01 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 2.000e-01 0.000e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 1.900e-01 0.000e+00 ... 2.000e+00 0.000e+00 0.000e+00]
 ...
 [0.000e+00 1.300e-01 1.000e-02 ... 4.000e+00 1.100e+01 6.600e+01]
 [0.000e+00 1.400e-01 0.000e+00 ... 5.000e+00 1.100e+01 1.044e+03]
 [0.000e+00 1.500e-01 0.000e+00 ... 6.000e+00 1.100e+01 3.800e+01]]


In [134]:
# Get the NUM_ROWS and NUM_COLS 
NUM_ROWS, NUM_COLS = A.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")


The dataset has 6288 rows and 18 cols


## Staring the Data Set Up for the Model:

In [135]:
# Split the data into two
X_all = A[:,0:NUM_COLS-1]  # X (features) ... is all rows, columns 0-16
y_all = A[:,NUM_COLS-1]    # y (labels) ... is all rows, last column only

In [136]:
# Scramble the data to remove (potential) dependence on its ordering: 
indices = np.random.permutation(len(y_all))  # indices is a permutation-list

# Scramble both X and y with the same permutation
X_permed = X_all[indices]              
y_permed = y_all[indices]             
print(f"The scrambled labels/species are \n {y_permed}")
print(f"The corresponding data rows are \n {X_permed[0:5]}")

The scrambled labels/species are 
 [33.  7.  0. ...  1.  0.  0.]
The corresponding data rows are 
 [[0.000e+00 1.400e-01 0.000e+00 4.300e+02 4.000e+00 6.770e+01 4.220e+01
  5.580e+01 8.000e+01 1.600e+01 2.600e+01 2.190e+01 4.800e+00 1.144e+02
  5.290e+01 3.000e+00 1.100e+01]
 [0.000e+00 1.300e-01 0.000e+00 3.640e+02 9.600e+00 8.780e+01 5.290e+01
  6.860e+01 7.200e+01 1.700e+01 4.000e+01 4.350e+01 2.600e+00 6.170e+01
  6.330e+01 4.000e+00 6.000e+00]
 [0.000e+00 1.600e-01 0.000e+00 4.100e+02 5.200e+00 6.640e+01 4.060e+01
  5.350e+01 6.900e+01 1.500e+01 3.700e+01 2.820e+01 6.800e+00 1.640e+02
  5.250e+01 5.000e+00 4.000e+00]
 [0.000e+00 6.000e-02 0.000e+00 1.860e+02 1.030e+01 6.300e+01 5.340e+01
  5.690e+01 8.200e+01 4.800e+01 6.500e+01 4.540e+01 5.100e+00 1.222e+02
  6.350e+01 2.000e+00 5.000e+00]
 [0.000e+00 2.700e-01 0.000e+00 6.790e+02 1.470e+01 9.640e+01 6.100e+01
  7.730e+01 8.300e+01 1.700e+01 4.600e+01 5.480e+01 4.200e+00 1.002e+02
  7.530e+01 5.000e+00 1.000e+00]]


In [137]:
# Seperate the data into test data and training data 
#     Train the model with the training data (do not look at the testing data)
#     Then test with the testing data to see how accurate the model is 
# Use an 80% training, 20% testing data split 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows\n" )

print(f"Held-out data... (testing data: {len(y_test)})")
print(f"y_test: {y_test}\n")
print(f"X_test (few rows): {X_test[0:5,:]}")  # 5 rows
print()
print(f"Data used for modeling... (training data: {len(y_train)})")
print(f"y_train: {y_train}\n")
print(f"X_train (few rows): {X_train[0:5,:]}")  # 5 rows

training with 5030 rows;  testing with 1258 rows

Held-out data... (testing data: 1258)
y_test: [ 48. 105.  60. ...   0.   1.   6.]

X_test (few rows): [[0.000e+00 2.200e-01 0.000e+00 6.350e+02 1.380e+01 7.980e+01 5.530e+01
  6.500e+01 9.000e+01 3.900e+01 6.600e+01 5.320e+01 4.900e+00 1.166e+02
  6.880e+01 5.000e+00 8.000e+00]
 [0.000e+00 1.300e-01 2.000e-02 4.350e+02 4.100e+00 7.460e+01 3.840e+01
  5.610e+01 5.000e+01 1.300e+01 2.600e+01 2.210e+01 3.200e+00 7.760e+01
  5.260e+01 4.000e+00 6.000e+00]
 [0.000e+00 2.100e-01 0.000e+00 5.080e+02 1.790e+01 9.590e+01 6.750e+01
  7.950e+01 8.100e+01 2.700e+01 5.200e+01 6.040e+01 4.500e+00 1.085e+02
  7.650e+01 4.000e+00 1.100e+01]
 [0.000e+00 2.600e-01 0.000e+00 5.740e+02 1.690e+01 9.910e+01 6.650e+01
  8.310e+01 7.700e+01 2.600e+01 4.400e+01 5.880e+01 4.200e+00 9.990e+01
  7.510e+01 6.000e+00 9.000e+00]
 [0.000e+00 1.000e-01 2.000e-02 3.440e+02 3.600e+00 6.920e+01 3.760e+01
  5.260e+01 4.900e+01 1.200e+01 2.600e+01 1.910e+01 3.000e+00 7.260e

## Model Building Begins:

In [138]:
# Build first KNN Model
from sklearn.neighbors import KNeighborsClassifier

k = 84   # we don't know what k to use, so this first value is a guess!  
knn_model = KNeighborsClassifier(n_neighbors=k)       

# we train the model (it's one line!)
knn_model.fit(X_train, y_train)                              
print("Created and trained a knn classifier with k =", k)  

Created and trained a knn classifier with k = 84


### Test the first version of the model:

In [139]:
# Use the testing data to see how well our current model does
predicted_labels = knn_model.predict(X_test)   
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.")

Predicted labels: [0. 0. 0. ... 0. 0. 0.]
Actual  labels  : [ 48. 105.  60. ...   0.   1.   6.]

Results on test set:  613 correct out of 1258 total.


#### Only 45196 out of 76835 we classified correctly so we know that the k value we used is really bad (not optimal)

## Use Cross Validation to find the Best K Value:

In [140]:
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"

from sklearn.model_selection import cross_val_score

best_k = 84 
best_accuracy = 0.0 

# Note: only cross-validate using the TEST data
for k in range(1,40):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every possible k
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # cv=5 means 80/20
    average_cv_accuracy = cv_scores.mean() 
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")


print(f"best_k = {best_k}   yields the highest average cv accuracy of {best_accuracy}.")  # print the best one
  



k:  1  cv accuracy:  0.3942
k:  2  cv accuracy:  0.4555




k:  3  cv accuracy:  0.4616
k:  4  cv accuracy:  0.4630




k:  5  cv accuracy:  0.4670
k:  6  cv accuracy:  0.4698




k:  7  cv accuracy:  0.4722
k:  8  cv accuracy:  0.4753




k:  9  cv accuracy:  0.4757
k: 10  cv accuracy:  0.4751




k: 11  cv accuracy:  0.4751
k: 12  cv accuracy:  0.4751




k: 13  cv accuracy:  0.4755
k: 14  cv accuracy:  0.4757




k: 15  cv accuracy:  0.4761




k: 16  cv accuracy:  0.4751




k: 17  cv accuracy:  0.4751




k: 18  cv accuracy:  0.4761




k: 19  cv accuracy:  0.4761




k: 20  cv accuracy:  0.4761




k: 21  cv accuracy:  0.4761




k: 22  cv accuracy:  0.4763




k: 23  cv accuracy:  0.4763




k: 24  cv accuracy:  0.4761




k: 25  cv accuracy:  0.4761




k: 26  cv accuracy:  0.4761




k: 27  cv accuracy:  0.4761




k: 28  cv accuracy:  0.4761




k: 29  cv accuracy:  0.4761




k: 30  cv accuracy:  0.4761




k: 31  cv accuracy:  0.4761




k: 32  cv accuracy:  0.4761




k: 33  cv accuracy:  0.4761




k: 34  cv accuracy:  0.4761




k: 35  cv accuracy:  0.4761




k: 36  cv accuracy:  0.4761




k: 37  cv accuracy:  0.4761




k: 38  cv accuracy:  0.4761




k: 39  cv accuracy:  0.4761
best_k = 22   yields the highest average cv accuracy of 0.47634194831013915.


In [141]:
## Cross Validation Part 2

from sklearn.model_selection import cross_val_score

best_k = 1
best_accuracy = 0.0 

# Note: only cross-validate using the TEST data
for k in range(10,21):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every possible k
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # cv=5 means 80/20
    average_cv_accuracy = cv_scores.mean() 
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")


print(f"best_k = {best_k}   yields the highest average cv accuracy of {best_accuracy}.")  # print the best one
  



k: 10  cv accuracy:  0.4751
k: 11  cv accuracy:  0.4751




k: 12  cv accuracy:  0.4751




k: 13  cv accuracy:  0.4755




k: 14  cv accuracy:  0.4757




k: 15  cv accuracy:  0.4761




k: 16  cv accuracy:  0.4751




k: 17  cv accuracy:  0.4751




k: 18  cv accuracy:  0.4761




k: 19  cv accuracy:  0.4761




k: 20  cv accuracy:  0.4761
best_k = 15   yields the highest average cv accuracy of 0.4761431411530815.


## Use the best K value to build a Tuned Predictive Model 

In [142]:
# Use best k to build and train a new model:

best_k = 1
from sklearn.neighbors import KNeighborsClassifier
knn_model_tuned = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k!

# we train the model (one line!)
knn_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Created + trained a knn classifier, now tuned with a (best) k of {best_k}")  

Created + trained a knn classifier, now tuned with a (best) k of 1


In [143]:
# Test the tuned model on the test data 
predicted_labels = knn_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)

# And, the overall results
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.\n\n")

Predicted labels: [439.   0.   3. ...   0.  18.   0.]
Actual labels: [ 48. 105.  60. ...   0.   1.   6.]

Results on test set:  511 correct out of 1258 total.




#### 53152 out of 76835 are correct, this is better than before but still not great. Future work includes making changes to the data set and model to increase the model accuracy 

## Create the Final Model: 

In [144]:
# Use ALL available data to train the final predictive model:

knn_model_final = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k
knn_model_final.fit(X_all, y_all)                              # here we use ALL the data!
print(f"Created + trained a 'final' knn classifier, with a (best) k of {best_k}") 



Created + trained a 'final' knn classifier, with a (best) k of 1


In [145]:
# Create a pickle file of the model
pickle.dump(knn_model_final, open("model.pkl", "wb"))

In [146]:
# final predictive model (k-nearest-neighbor), with tuned k + ALL data incorporated

def predictive_model( Features, Model ):
    """ input: a list of four features 
        output: the predicted consumption bucket 
    """
    our_features = np.asarray([Features])                

    predicted_species = Model.predict(our_features)
    
    predicted_species = int(round(predicted_species[0]))  
    return predicted_species


In [147]:
# pass a list of features into the model and get the output

Features = [0, 0.19, 0.00, 470, 13.2, 93.5, 60.6, 76.0, 76, 21, 43, 52.0, 3.1, 75.2, 73.2, 0, 0]

prediction = predictive_model(Features, knn_model_final)
print(f" The model predicts {prediction} from the features {Features}")

 The model predicts 0 from the features [0, 0.19, 0.0, 470, 13.2, 93.5, 60.6, 76.0, 76, 21, 43, 52.0, 3.1, 75.2, 73.2, 0, 0]
