In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# This is for the plots not being cut off
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})



### Make a Prediction with Scikit-Learn
Steps:
1. Load data
1. Normalize
1. Train kNN classifier with training set
1. Test kNN classifier on test instance

In [2]:
def predict(df, predicting_attribute, test_data, n_neighbors=3):
    """Predict the class of one instance with a min-max-scaled kNN classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data: the feature columns plus the target column.
    predicting_attribute : str
        Name of the target column in ``df``.
    test_data : sequence or mapping
        Feature values of the instance to classify. It is wrapped in a
        ``pd.Series`` indexed by the feature columns of ``df``.
    n_neighbors : int, default 3
        Number of neighbours that vote on the prediction.

    Returns
    -------
    The result of ``KNeighborsClassifier.predict`` for the single test row.
    """
    X_train = df.drop(predicting_attribute, axis=1)
    y_train = df[predicting_attribute]
    feature_columns = X_train.columns

    # Fit the scaler on the training features only, then reuse it for the test row.
    scaler = MinMaxScaler()
    X_train_normalized = scaler.fit_transform(X_train)

    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    neigh.fit(X_train_normalized, y_train)

    # Build a one-row frame that carries the same column names the scaler was
    # fitted with, instead of handing it an anonymous list of values.
    test_series = pd.Series(test_data, index=feature_columns)
    X_test = pd.DataFrame([test_series.to_numpy()], columns=feature_columns)
    X_test_normalized = scaler.transform(X_test)

    return neigh.predict(X_test_normalized)

In [3]:
def get_weekday(date):
    """Return the day-of-week name (e.g. 'Monday') for a date given as 'YYYY-MM-DD'."""
    timestamp = pd.to_datetime(date)
    return timestamp.strftime('%A')

# Load the three cleaned datasets. index_col=0 turns the first column of each
# file into the row index; later cells expect that index to be named "Date".
audio_df = pd.read_csv("data/cleaned_puryear_audio.csv",index_col=0)
movement_df = pd.read_csv("data/cleaned_puryear_movement.csv",index_col=0)
sleep_df = pd.read_csv("data/cleaned_puryear_sleep.csv",index_col=0)

In [4]:
def return_dataframe_with_weekday(df):
    """Return a new frame equal to ``df`` plus a 'Weekday' column of day names.

    The index is reset so that 'Date' is available as a column (in this
    notebook 'Date' is the index name), each date is converted to its day
    name with ``get_weekday``, and 'Date' is restored as the index.
    """
    flat = df.reset_index()
    day_names = flat['Date'].apply(get_weekday).to_frame(name='Weekday')
    return flat.join(day_names).set_index("Date")

# Build a copy of each dataset with an added "Weekday" column holding the day name.
weekday_audio_df = return_dataframe_with_weekday(audio_df)
weekday_movement_df = return_dataframe_with_weekday(movement_df)
weekday_sleep_df = return_dataframe_with_weekday(sleep_df)


### Objective of kNN
* My goal for the main kNN function is to predict the day of the week when given all the data (simplified below to predicting whether a day is a weekday or a weekend)

In [5]:
# Combine the three datasets on "Date". pd.merge defaults to an inner join, so
# only dates present in all three frames are kept. Only the movement frame
# carries the "Weekday" column, so it appears exactly once in the result.
sleep_and_audio_data = pd.merge(sleep_df,audio_df,on="Date")
all_data = pd.merge(sleep_and_audio_data,weekday_movement_df, on="Date")

In [6]:
def get_weekend_yes_no_df(df):
    """Return a copy of ``df`` with day names collapsed to "Weekday"/"Weekend".

    Every cell equal to a day name is replaced, in whichever column it occurs:
    Monday through Friday become "Weekday", Saturday and Sunday become
    "Weekend". The input frame is not modified, so the caller keeps the
    original day names and the call can be repeated safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing day names (in this notebook, in its "Weekday" column).

    Returns
    -------
    pandas.DataFrame
        A new frame with the day names replaced.
    """
    day_to_category = {
        "Monday": "Weekday",
        "Tuesday": "Weekday",
        "Wednesday": "Weekday",
        "Thursday": "Weekday",
        "Friday": "Weekday",
        "Saturday": "Weekend",
        "Sunday": "Weekend",
    }
    # One mapping-based replace instead of seven in-place passes over the frame.
    return df.replace(day_to_category)

In [7]:
# Collapse the day names into "Weekday"/"Weekend" labels and show the resulting label column.
all_data = get_weekend_yes_no_df(all_data)
print(all_data["Weekday"])

Date
2020-11-22    Weekend
2020-11-23    Weekday
2020-11-24    Weekday
2020-11-25    Weekday
2020-11-26    Weekday
               ...   
2021-11-17    Weekday
2021-11-18    Weekday
2021-11-19    Weekday
2021-11-20    Weekend
2021-11-21    Weekend
Name: Weekday, Length: 338, dtype: object


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Features are every column except the label; the label is the "Weekday" column
# (holding "Weekday"/"Weekend" after the previous cell).
X = all_data.drop(columns="Weekday",axis=1)
y = all_data["Weekday"]

# Rescale each feature with min-max scaling so no single feature dominates the distance.
# NOTE(review): the scaler is fitted on all rows before the split, so the test
# rows' min/max influence the training features (data leakage) - consider
# fitting on X_train only and applying transform() to X_test.
scaler= MinMaxScaler()
X = scaler.fit_transform(X)

# random_state=0 makes the split reproducible; stratify=y keeps the
# Weekday/Weekend proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0,stratify=y)

In [9]:
# Fit a kNN classifier for every n_neighbors from 1 through 223 (the upper bound
# of range(1,224) is exclusive) and keep the value with the highest accuracy.
# The strict ">" comparison means that ties keep the smallest n_neighbors.
# NOTE(review): n_neighbors is chosen by accuracy on the test set itself, so the
# reported accuracy is an optimistic estimate - a validation split or
# cross-validation would give a fairer number.
best_n = 0
best_n_val = 0
for i in range(1,224):
    knn_clf = KNeighborsClassifier(n_neighbors=i, metric="euclidean")
    knn_clf.fit(X_train,y_train)
    y_predicted = knn_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_predicted)

    if accuracy > best_n_val:
        best_n = i
        best_n_val = accuracy

print("The best accuracy was", best_n_val,"with an n-val of",best_n)

The best accuracy was 0.7411764705882353 with an n-val of 27


### Let's make it a function

In [10]:
def get_kNN_prediction_accuracy(df, column, max_neighbors=223):
    """Find the n_neighbors value that gives the best kNN accuracy for ``column``.

    The features (every column except ``column``) are min-max scaled, the data
    is split into stratified train/test sets with ``random_state=0``, and a
    Euclidean kNN classifier is fitted for each candidate n_neighbors.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the label column.
    column : str
        Name of the label column to predict.
    max_neighbors : int, default 223
        Largest n_neighbors to try. It is capped at the number of training
        samples, because kNN cannot query more neighbours than it was fitted
        on; without the cap a small dataset raises a ValueError.

    Returns
    -------
    tuple
        ``(best_accuracy, best_n_neighbors)``. Ties keep the smallest
        n_neighbors; ``(0, 0)`` is returned if no candidate scored above 0.
    """
    X = df.drop(columns=column)
    y = df[column]

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling; kept as-is so results stay comparable
    # with the cells above. Fitting on the training rows only would avoid it.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

    # Try n_neighbors = 1 .. max_neighbors, but never more than the training set holds.
    largest_k = min(max_neighbors, len(X_train))

    best_n = 0
    best_n_val = 0
    for k in range(1, largest_k + 1):
        knn_clf = KNeighborsClassifier(n_neighbors=k, metric="euclidean")
        knn_clf.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, knn_clf.predict(X_test))

        # Strict ">" so that the smallest k wins when accuracies tie.
        if accuracy > best_n_val:
            best_n = k
            best_n_val = accuracy

    return best_n_val, best_n

In [11]:
# Repeat the search for each dataset on its own: collapse its day names to
# "Weekday"/"Weekend" labels, then report the best accuracy and the n_neighbors that achieved it.
audio_accuracy, audio_n = get_kNN_prediction_accuracy(get_weekend_yes_no_df(weekday_audio_df),"Weekday")
print("The accuracy of the kNN for audio was",audio_accuracy,"when the n_neighbors was",audio_n)

sleep_accuracy, sleep_n = get_kNN_prediction_accuracy(get_weekend_yes_no_df(weekday_sleep_df),"Weekday")
print("The accuracy of the kNN for sleep was",sleep_accuracy,"when the n_neighbors was",sleep_n)

movement_accuracy, movement_n = get_kNN_prediction_accuracy(get_weekend_yes_no_df(weekday_movement_df),"Weekday")
print("The accuracy of the kNN for movement was",movement_accuracy,"when the n_neighbors was",movement_n)

The accuracy of the kNN for audio was 0.7252747252747253 when the n_neighbors was 33
The accuracy of the kNN for sleep was 0.7176470588235294 when the n_neighbors was 7
The accuracy of the kNN for movement was 0.7391304347826086 when the n_neighbors was 9


## Results of kNN
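* Based on the results we can see that there is a slight correlation between the data and whether it is the weekend or not!
    * There is an accuracy of 0.74 on predicting if the data is on a weekday or weekend
    * For context, always guessing "Weekday" would already be right on roughly 5 days out of 7 (about 0.71), so the improvement over that baseline is small
* We can also see that sleep/movement and the day of the week are slightly correlated as well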
