In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# This is for the plots not being cut off
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})



### Make a Prediction with Scikit-Learn
Steps:
1. Load data
1. Normalize
1. Train kNN classifier with training set
1. Test kNN classifier on test instance

In [2]:
def predict(df, predicting_attribute, test_data ):
    X_train = df.drop(predicting_attribute,axis=1)
    y_train = df[predicting_attribute]

    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_normalized = scaler.transform(X_train) # often combined into one step, using fit_transform()


    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train_normalized, y_train)

    X_test = pd.Series(test_data, index=df.columns.drop(predicting_attribute))
    X_test = scaler.transform([X_test])
    y_test_prediction = neigh.predict(X_test)

    return(y_test_prediction)

data = [[7, 7, "Bad"], [7, 4, "Bad"], [3, 4, "Good"], [1, 4, "Good"]]
df = pd.DataFrame(data, columns=["Acid durability (seconds)", "Strength (kg/square meter)", "Classification"])

result1 = predict(df, "Classification",pd.Series([1,7], index=df.columns.drop("Classification")))
result2 = predict(df, "Classification",pd.Series([9,7], index=df.columns.drop("Classification")))

print(result1)
print(result2)

['Good']
['Bad']


In [3]:
# return the date of the week of a given date in the format of 'YYYY-MM-DD'
def get_weekday(date):
    return pd.to_datetime(date).strftime('%A')

audio_df = pd.read_csv("data/cleaned_puryear_audio.csv",index_col=0)
movement_df = pd.read_csv("data/cleaned_puryear_movement.csv",index_col=0)
sleep_df = pd.read_csv("data/cleaned_puryear_sleep.csv",index_col=0)

In [4]:
def return_dataframe_with_weekday(df):
    df = df.reset_index()
    weekday_df = pd.DataFrame()
    weekday_df['Weekday'] = df['Date'].apply(get_weekday)
    df = df.join(weekday_df)
    df.set_index("Date",inplace=True)

    return df

weekday_audio_df = return_dataframe_with_weekday(audio_df)
weekday_movement_df = return_dataframe_with_weekday(movement_df)
weekday_sleep_df = return_dataframe_with_weekday(sleep_df)
print(weekday_audio_df)


            Headphone sound levels(dBASPL)    Weekday
Date                                                 
2020-11-22                          66.064     Sunday
2020-11-23                          61.650     Monday
2020-11-24                          54.927    Tuesday
2020-11-25                          57.506  Wednesday
2020-11-26                          39.405   Thursday
...                                    ...        ...
2021-11-17                          65.809  Wednesday
2021-11-18                          62.816   Thursday
2021-11-19                          57.094     Friday
2021-11-20                          59.618   Saturday
2021-11-21                          60.938     Sunday

[363 rows x 2 columns]


In [5]:
total = 0
correct = 0
for i in weekday_audio_df.index:
    result = predict(weekday_audio_df, "Weekday",pd.Series(weekday_audio_df["Headphone sound levels(dBASPL)"][i], index=weekday_audio_df.columns.drop("Weekday")))
    if(result == weekday_audio_df["Weekday"][i]):
        correct += 1
    total += 1

print(correct / total)


0.45179063360881544


In [6]:
total = 0
correct = 0
for i in weekday_sleep_df.index:
    result = predict(weekday_sleep_df, "Weekday",pd.Series(weekday_sleep_df["Time in bed(hr)"][i], index=weekday_sleep_df.columns.drop("Weekday")))
    if(result == weekday_sleep_df["Weekday"][i]):
        correct += 1
    total += 1

print(correct / total)

0.49169435215946844
