In [381]:
import seaborn as sns
import numpy as np
import pandas as pd
import altair as alt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, log_loss

In [382]:
df = sns.load_dataset("taxis")
df.dropna(inplace=True)

In [383]:
df.columns

Index(['pickup', 'dropoff', 'passengers', 'distance', 'fare', 'tip', 'tolls',
       'total', 'color', 'payment', 'pickup_zone', 'dropoff_zone',
       'pickup_borough', 'dropoff_borough'],
      dtype='object')

In [384]:
df["hour"] = pd.to_datetime(df.pickup).dt.hour

In [385]:
df["minute"] = pd.to_datetime(df.pickup).dt.minute

In [386]:
df["day"] = pd.to_datetime(df.pickup).dt.day_name()

In [387]:
df["weekend"] = pd.to_datetime(df.pickup).dt.day_name().isin(["Sunday","Saturday"])

In [388]:
scaler = StandardScaler()

In [389]:
cols = ["fare","distance","tip","hour","minute","weekend"]

In [404]:
df[cols] = scaler.fit_transform(df[cols])

In [405]:
X_train, X_test, y_train, y_test = train_test_split(
    df[cols], df["payment"], test_size = 0.2)

In [406]:
def get_scores(k):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    train_error = log_loss(y_train, clf.predict_proba(X_train),labels=sorted(y_train.unique()))
    test_error = log_loss(y_test, clf.predict_proba(X_test),labels=sorted(y_train.unique()))
    return (train_error, test_error)

In [407]:
get_scores(1)

(9.992007221626413e-16, 3.102774238786303)

In [408]:
get_scores(2)

(0.0684674166917463, 1.6245799658825395)

In [409]:
get_scores(len(X_train))

(0.595512045669738, 0.5971386847053295)

In [410]:
get_scores(90)

(0.24189282568196083, 0.23265047885023055)

In [411]:
df_scores = pd.DataFrame({"k":list(range(6,100,5)),"train_error":np.nan,"test_error":np.nan})

In [412]:
df_scores["kinv"] = 1/df_scores.k

In [413]:
for i in df_scores.index:
    df_scores.loc[i,["train_error","test_error"]] = get_scores(df_scores.loc[i,"k"])

In [414]:
ctrain = alt.Chart(df_scores).mark_line().encode(
    x = "kinv",
    y = "train_error"
)

In [415]:
ctest = alt.Chart(df_scores).mark_line(color="orange").encode(
    x = "kinv",
    y = "test_error"
)

In [416]:
ctrain+ctest

In [417]:
df_scores.test_error.min()

0.18454047213544372