In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
d2024 = pd.read_csv("pppub24.csv")
d2023 = pd.read_csv("pppub23.csv")
d2022 = pd.read_csv("pppub22.csv")
d2021 = pd.read_csv("pppub21.csv")

data = pd.concat([d2024, d2023, d2022, d2021], ignore_index=True)
data = data.sample(frac=0.1, random_state=42)

rfe_features100 = data[['HHDFMX', 'POTHVAL', 'RESNSS1', 'RESNSS2', 'RESNSSI1', 'RESNSSI2', 'RETCB_VAL', 'RETCB_YN', 'RINT_SC1', 'RINT_SC2', 'RINT_VAL1', 'RINT_VAL2', 'RINT_YN', 'RNT_VAL', 'RNT_YN', 'SRVS_VAL', 'SS_VAL', 'SS_YN', 'SSI_VAL', 'SSI_YN', 'STRKUC', 'SUBUC', 'SUR_SC1', 'SUR_SC2', 'SUR_VAL1', 'SUR_VAL2', 'SUR_YN', 'TRDINT_VAL', 'TSURVAL1', 'TSURVAL2', 'UC_VAL', 'UC_YN', 'VET_QVA', 'VET_TYP1', 'VET_TYP2', 'VET_TYP3', 'VET_TYP4', 'VET_TYP5', 'VET_VAL', 'VET_YN', 'WC_TYPE', 'WC_VAL', 'WC_YN', 'PAW_MON', 'PAW_TYP', 'PAW_VAL', 'PAW_YN', 'PENINCL', 'PENPLAN', 'WICYN', 'CHCARE_YN', 'CHELSEW_YN', 'CHSP_VAL', 'CHSP_YN', 'CSP_VAL', 'CSP_YN', 'ACTC_CRD', 'CTC_CRD', 'DEP_STAT', 'EIT_CRED', 'FEDTAX_AC', 'FEDTAX_BC', 'FICA', 'FILESTAT', 'MARG_TAX', 'PRSWKXPNS', 'STATETAX_A', 'STATETAX_B', 'PERLIS', 'POV_UNIV', 'COV', 'COV_CYR', 'COV_MULT_CYR', 'NOCOV_CYR', 'NOW_COV', 'NOW_PUB', 'PUB', 'PUB_CYR', 'NOW_DEPPRIV', 'NOW_OUTPRIV', 'NOW_OWNPRIV', 'NOW_PRIV', 'OUTPRIV', 'OWNPRIV', 'PRIV', 'PRIV_CYR', 'DEPGRP', 'GRPFTYP', 'GRPFTYP2', 'MRK', 'NOW_CAID', 'NOW_PCHIP', 'NOW_MCARE', 'NOW_IHSFLG', 'NOW_VACARE', 'PECOULD', 'ESIOFFER']]
rfe_features50 = data[['CHCARE_YN', 'CHELSEW_YN', 'CHSP_VAL', 'CHSP_YN', 'CSP_VAL', 'CSP_YN', 'ACTC_CRD', 'CTC_CRD', 'DEP_STAT', 'EIT_CRED', 'FEDTAX_AC', 'FEDTAX_BC', 'FICA', 'FILESTAT', 'MARG_TAX', 'PRSWKXPNS', 'STATETAX_A', 'STATETAX_B', 'PERLIS', 'POV_UNIV', 'COV', 'COV_CYR', 'COV_MULT_CYR', 'NOCOV_CYR', 'NOW_COV', 'NOW_PUB', 'PUB', 'DEPPRIV', 'NOW_DEPPRIV', 'NOW_OUTPRIV', 'NOW_OWNPRIV', 'NOW_PRIV', 'OUTPRIV', 'OWNPRIV', 'PRIV', 'PRIV_CYR', 'GRP', 'GRPFTYP', 'GRPFTYP2', 'NOW_MCAID', 'NOW_CAID', 'NOW_PCHIP', 'NOW_MCARE', 'NOW_IHSFLG', 'ESICOULD']]
rfe_features20 = data[['CSP_VAL', 'CSP_YN', 'FEDTAX_AC', 'FEDTAX_BC', 'FICA', 'FILESTAT', 'PRSWKXPNS', 'STATETAX_A', 'STATETAX_B', 'PERLIS', 'COV_MULT_CYR', 'DEPPRIV', 'NOW_OWNPRIV', 'OWNPRIV', 'GRPFTYP2', 'NOW_MCARE']]

In [3]:
X = rfe_features100
y = data['PTOT_R']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', KNeighborsClassifier(n_neighbors=5))
])

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Training accuracy:", clf.score(X_train, y_train))
print("Test accuracy:", clf.score(X_test, y_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

#precision - percent of correct predictions
#recall - percent of samples of class that were correct
#f1 - balance between precision and recall

Training accuracy: 0.8014176046719729
Test accuracy: 0.7163892093840998

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3699
           1       0.94      0.99      0.96      2477
           2       0.70      0.65      0.68       274
           3       0.70      0.70      0.70       325
           4       0.70      0.70      0.70       322
           5       0.72      0.80      0.76       556
           6       0.73      0.71      0.72       395
           7       0.72      0.71      0.72       491
           8       0.62      0.65      0.63       354
           9       0.61      0.73      0.67       505
          10       0.56      0.53      0.54       356
          11       0.59      0.62      0.61       508
          12       0.46      0.45      0.46       295
          13       0.52      0.61      0.56       535
          14       0.40      0.38      0.39       215
          15       0.55      0.56      

In [None]:
X = rfe_features100
y = data['PTOT_R']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

K = []
training = []
test = []
scores = {}

for k in range(2, 15):
    clf = KNeighborsClassifier(n_neighbors = k)
    clf.fit(X_train, y_train)

    training_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    K.append(k)

    training.append(training_score)
    test.append(test_score)
    scores[k] = [training_score, test_score]

for keys, values in scores.items():
    print(keys, ':', values)

# 20 features - 5 min
#2 : [0.7403098949748034, 0.5978242953683863]
#3 : [0.7080252437243912, 0.608922586671062]
#4 : [0.6908585692083078, 0.6129882973462997]
#5 : [0.6815570103141336, 0.6137574858524257]
#6 : [0.6753402722177743, 0.6135377177078183]
#7 : [0.6708190081476946, 0.6125487610570848]
#8 : [0.6648377525549851, 0.6150211526839184]
#9 : [0.6607403569914755, 0.6154057469369815]
# 100 features - 10 min
#2 : [0.8579098572976028, 0.7014999175869457]
#3 : [0.8240710215231009, 0.709246744684358]
#4 : [0.8095417510478972, 0.7134772814680512]
#5 : [0.8014176046719729, 0.7163892093840998]
#6 : [0.7907737955069938, 0.7151804845887588]
#7 : [0.7850044741675694, 0.7135322235042031]
#8 : [0.7795412800828898, 0.713587165540355]
#9 : [0.7723826119719305, 0.7118290203834954]
#10 : [0.7683323129091508, 0.7116641942750398]
#11 : [0.7644703998492912, 0.7117740783473435]
#12 : [0.7606555832901616, 0.7093566287566617]
#13 : [0.756558187726652, 0.7085324982143838]
#14 : [0.7527904676682522, 0.7074885995274985]

2 : [0.8579098572976028, 0.7014999175869457]
3 : [0.8240710215231009, 0.709246744684358]
4 : [0.8095417510478972, 0.7134772814680512]
5 : [0.8014176046719729, 0.7163892093840998]
6 : [0.7907737955069938, 0.7151804845887588]
7 : [0.7850044741675694, 0.7135322235042031]
8 : [0.7795412800828898, 0.713587165540355]
9 : [0.7723826119719305, 0.7118290203834954]
10 : [0.7683323129091508, 0.7116641942750398]
11 : [0.7644703998492912, 0.7117740783473435]
12 : [0.7606555832901616, 0.7093566287566617]
13 : [0.756558187726652, 0.7085324982143838]
14 : [0.7527904676682522, 0.7074885995274985]
