In [3]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegression, ElasticNet, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, VotingRegressor

import warnings
warnings.filterwarnings('ignore')

In [5]:
from ISLP import load_data
credit = load_data('Credit')
credit.drop("ID", axis = 1, inplace = True)


In [7]:
credit.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [9]:
x = credit.drop("Balance", axis=1)
y = credit["Balance"]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.3, 
                                                   random_state = 24)

In [68]:
# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude='category')),
    (ohe, make_column_selector(dtype_include='category')),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

dtc = DecisionTreeRegressor()
knn = KNeighborsRegressor()
eln = ElasticNet()

vor = VotingRegressor([("DTC", dtc), ("KNN", knn), ("ELN", eln)])


pipe = Pipeline([("OHE", trans_ohe), ("SCL", scl_mm), ("VOR", vor)])


# _____________________________________________________________________________________
# GCV

params = {
    'VOR__DTC__max_depth' : [None, 3, 5],
    'VOR__DTC__min_samples_leaf' : [1, 5, 10],
    'VOR__DTC__min_samples_split' : [2, 5, 10],
     # 'VOR__KNN__algorithm' : ['auto', 'brute'],
     # 'VOR__KNN__metric' : ['cosine', 'l2', 'l1', 'manhattan', 'euclidean'],
     'VOR__KNN__n_neighbors' : [2, 5, 7], 
     'VOR__ELN__alpha' : np.linspace(0.001, 3, 3),
     'VOR__ELN__l1_ratio' : np.linspace(0.001, 1, 3),
    'VOR__weights' : [[6, 3, 6], [7, 4, 7], [9, 2, 9]]
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "r2",
                  cv = kfold,
                  verbose = 3)


# gcv.fit(x, y)

In [70]:
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
[CV 1/5] END VOR__DTC__max_depth=None, VOR__DTC__min_samples_leaf=1, VOR__DTC__min_samples_split=2, VOR__ELN__alpha=0.001, VOR__ELN__l1_ratio=0.001, VOR__KNN__n_neighbors=2, VOR__weights=[6, 3, 6];, score=0.874 total time=   0.0s
[CV 2/5] END VOR__DTC__max_depth=None, VOR__DTC__min_samples_leaf=1, VOR__DTC__min_samples_split=2, VOR__ELN__alpha=0.001, VOR__ELN__l1_ratio=0.001, VOR__KNN__n_neighbors=2, VOR__weights=[6, 3, 6];, score=0.930 total time=   0.0s
[CV 3/5] END VOR__DTC__max_depth=None, VOR__DTC__min_samples_leaf=1, VOR__DTC__min_samples_split=2, VOR__ELN__alpha=0.001, VOR__ELN__l1_ratio=0.001, VOR__KNN__n_neighbors=2, VOR__weights=[6, 3, 6];, score=0.953 total time=   0.0s
[CV 4/5] END VOR__DTC__max_depth=None, VOR__DTC__min_samples_leaf=1, VOR__DTC__min_samples_split=2, VOR__ELN__alpha=0.001, VOR__ELN__l1_ratio=0.001, VOR__KNN__n_neighbors=2, VOR__weights=[6, 3, 6];, score=0.937 total time=   0.0s
[CV 5/5] END V

In [72]:
print(gcv.best_score_)
print(gcv.best_params_)

0.9514012411309487
{'VOR__DTC__max_depth': None, 'VOR__DTC__min_samples_leaf': 1, 'VOR__DTC__min_samples_split': 2, 'VOR__ELN__alpha': 0.001, 'VOR__ELN__l1_ratio': 1.0, 'VOR__KNN__n_neighbors': 2, 'VOR__weights': [9, 2, 9]}


In [76]:
y_pred = gcv.predict(x_test)
r2_score(y_test, y_pred)

0.9388293693955004

In [24]:
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude='category')),
    (ohe, make_column_selector(dtype_include='category')),
    verbose_feature_names_out=False
).set_output(transform='pandas')

dtc = DecisionTreeRegressor()
knn = KNeighborsRegressor()
eln = ElasticNet()

pipe_dtc = Pipeline([("OHE", trans_ohe), ("DTC", dtc)])
pipe_knn = Pipeline([("OHE", trans_ohe), ("KNN", knn)])
pipe_eln = Pipeline([("OHE", trans_ohe), ("ELN", eln)])

voting = VotingRegressor([("PIPE_DTC", pipe_dtc), ("PIPE_KNN", pipe_knn), ("PIPE_ELN", pipe_eln)])



In [26]:
voting.fit(x_train, y_train)

In [32]:
y_pred = voting.predict(x_test)
r2_score(y_test, y_pred)

0.9115187340406501

In [36]:
pipe_dtc.fit(x_train, y_train)
y_pred_dtc = pipe_dtc.predict(x_test)
r2_score(y_test, y_pred_dtc)

0.8988895126911057

In [38]:
pipe_knn.fit(x_train, y_train)
y_pred_knn = pipe_knn.predict(x_test)
r2_score(y_test, y_pred_knn)

0.7899310404076199

In [40]:
pipe_eln.fit(x_train, y_train)
y_pred_eln = pipe_eln.predict(x_test)
r2_score(y_test, y_pred_eln)

0.8940622886507

In [52]:
voting = VotingRegressor([("PIPE_DTC", pipe_dtc), ("PIPE_KNN", pipe_knn), ("PIPE_ELN", pipe_eln)],
                        weights = [6, 2, 6])
voting.fit(x_train, y_train)
y_pred = voting.predict(x_test)
r2_score(y_test, y_pred)

0.9286833784569565