In [1]:
# imports basics
import pandas as pd
import numpy as np

# ml
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer

# models
from sklearn.neighbors import KNeighborsClassifier

# variables
train_path = '/kaggle/input/playground-series-s4e10/train.csv'
test_path = '/kaggle/input/playground-series-s4e10/test.csv'
subm_path = '/kaggle/input/playground-series-s4e10/sample_submission.csv'
original_path = '/kaggle/input/total-dataset-wonans/total_data.csv'

seed = 1
kf = KFold(n_splits=2)
kfs = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

clf = KNeighborsClassifier()
#clf2 = FaissKNeighborsRegressor(n_neighbors=250, weights=lambda x:x**-p)

In [2]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path).drop('id', axis=1)
df_subm = pd.read_csv(subm_path)
original = pd.read_csv(original_path)
original.tail()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
91221,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
91222,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
91223,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
91224,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.1,N,26,0
91225,66,42000,RENT,2.0,MEDICAL,B,6475,9.99,0.15,N,30,0


In [3]:
df_test.tail()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
39093,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.1,N,4
39094,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3
39095,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25
39096,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4
39097,31,45000,RENT,6.0,DEBTCONSOLIDATION,B,19450,9.91,0.44,N,9


In [4]:
original.tail()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
91221,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
91222,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
91223,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
91224,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.1,N,26,0
91225,66,42000,RENT,2.0,MEDICAL,B,6475,9.99,0.15,N,30,0


In [5]:
# df_train_total = pd.concat([original, df_train])
# df_train_total = df_train_total.fillna(0).drop('id', axis=1)
# df_train_total

In [6]:
#X = df_train_total.iloc[:, 1:-1] # features
#y = df_train_total.iloc[:, -1] # target

X = original.drop('loan_status', axis=1)
y = original['loan_status']

# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=seed)

In [7]:
pipe = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore')), ('estimator', KNeighborsClassifier())]) # pipe
pipe

**what happens if you change the cross-val model?**

In [9]:
# k-fold cv
scores = cross_val_score(
    pipe, X_train, y_train, cv=kfs, scoring='roc_auc', verbose=10)

[CV] START .....................................................................
[CV] END ................................ score: (test=0.819) total time=  32.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   32.8s


[CV] END ................................ score: (test=0.832) total time=  29.9s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.848) total time=  30.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.830) total time=  29.3s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:  2.0min


[CV] END ................................ score: (test=0.826) total time=  29.9s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.829) total time=  30.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.841) total time=  30.1s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   7 tasks      | elapsed:  3.5min


[CV] END ................................ score: (test=0.832) total time=  29.7s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.837) total time=  28.9s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.829) total time=  30.0s


**hpo**

In [11]:
## gridsearch 
knn_params = {
    'estimator__n_neighbors':[250, 500]
}

random_search = RandomizedSearchCV(
    pipe, param_distributions=knn_params, random_state=seed, cv=kfs, scoring='roc_auc', verbose=100, n_iter=3)

random_search.fit(X_train, y_train)



Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV 1/10; 1/2] START estimator__n_neighbors=250.................................
[CV 1/10; 1/2] END ..estimator__n_neighbors=250;, score=0.872 total time=  30.7s
[CV 2/10; 1/2] START estimator__n_neighbors=250.................................
[CV 2/10; 1/2] END ..estimator__n_neighbors=250;, score=0.875 total time=  31.1s
[CV 3/10; 1/2] START estimator__n_neighbors=250.................................
[CV 3/10; 1/2] END ..estimator__n_neighbors=250;, score=0.890 total time=  29.6s
[CV 4/10; 1/2] START estimator__n_neighbors=250.................................
[CV 4/10; 1/2] END ..estimator__n_neighbors=250;, score=0.872 total time=  30.3s
[CV 5/10; 1/2] START estimator__n_neighbors=250.................................
[CV 5/10; 1/2] END ..estimator__n_neighbors=250;, score=0.878 total time=  31.8s
[CV 6/10; 1/2] START estimator__n_neighbors=250.................................
[CV 6/10; 1/2] END ..estimator__n_neighbors=250;

In [13]:
print(random_search.best_score_)
print('###')
print(random_search.best_estimator_)
print('###')
print(random_search.best_params_)

0.8785974007531665
###
Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('estimator', KNeighborsClassifier(n_neighbors=250))])
###
{'estimator__n_neighbors': 250}


**predictions**

In [14]:
y_hat  = random_search.predict_proba(df_test)[:, 1]
y_hat

array([0.412, 0.08 , 0.36 , ..., 0.016, 0.396, 0.232])

In [17]:
df_subm['loan_status'] = y_hat
df_subm.to_csv('knn2.1TotalData.csv', index=False)
df_subm.tail()

Unnamed: 0,id,loan_status
39093,97738,0.06
39094,97739,0.028
39095,97740,0.016
39096,97741,0.396
39097,97742,0.232
