In [None]:
# Mount Google Drive at /content/drive so the dataset files can be read (Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# import library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')
import joblib

from scipy import stats

In [None]:
# Dataset folder on the mounted Drive.
# `orig` and `train` are two independent reads of the same training file.
path = '/content/drive/MyDrive/dacon/2023.11.23/dataset'

train_csv = path + '/train.csv'
test_csv = path + '/test.csv'

orig , train = pd.read_csv(train_csv) , pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Dataset overview: column dtypes and non-null counts

train.info()

In [None]:
# Descriptive statistics, rendered with a colour gradient

train.describe().style.background_gradient(cmap = 'summer_r')

데이터 정리

 0   user_id                            10000 non-null  object -> 식별자

 1   subscription_duration              10000 non-null  int64 -> 월

 2   recent_login_time                  10000 non-null  int64 -> 일

 3   average_login_time                 10000 non-null  float64 -> 로그인 시간 (연속)

 4   average_time_per_learning_session  10000 non-null  float64 -> 분 (연속)

 5   monthly_active_learning_days       10000 non-null  int64 -> 일수

 6   total_completed_courses            10000 non-null  int64 -> 코스의 수

 7   recent_learning_achievement        10000 non-null  float64 -> 성취도 (연속)

 8   abandoned_learning_sessions        10000 non-null  int64 -> 중단된 코스의 수

 9   community_engagement_level         10000 non-null  int64 -> 커뮤니티 참여도

 10  preferred_difficulty_level         10000 non-null  object -> 범주형

 11  subscription_type                  10000 non-null  object -> 범주형

 12  customer_inquiry_history           10000 non-null  int64 -> 고객 문의 이력

 13  payment_pattern                    10000 non-null  int64 -> 3개월간 패턴을 10진수로 표현한 값

 14  target                             10000 non-null  int64 -> 종속변수

## EDA

### 이상치

In [None]:
# Split the training frame by dtype for the EDA below
# NOTE(review): 'int' / 'float' are resolved by pandas to concrete dtypes; this
# presumably matches the int64 / float64 columns listed above — confirm on the target platform.

float_data = train.select_dtypes('float')
int_data = train.select_dtypes('int')
obj_data = train.select_dtypes('object')

In [None]:
# Label column, used as `hue` in the distribution plots
y = train['target']

In [None]:
# Continuous features: histogram + KDE per column, split by target class

fig , ax = plt.subplots(1 , 3 , figsize = (13 , 5))

for i in range(3):
    column = float_data.columns[i]
    panel = ax[i]

    sns.histplot(data = float_data , x = column , kde = True , ax = panel , hue = y)
    panel.set_title(f"{column} Distribution" , fontsize = 10)

# plt.savefig('float_plt.png')

In [None]:
# Skewness of each continuous feature

for col in float_data.columns:
    print(f"{col} Skew : {float_data[col].skew()}")

In [None]:
# Discrete (integer) features: count plot per column on a 3 x 3 grid, split by target class

fig , ax = plt.subplots(3 , 3 , figsize = (17 , 13))
plt.subplots_adjust(hspace=0.5)

for idx , name in enumerate(int_data.columns):
    # fill the grid row by row
    panel = ax[idx // 3][idx % 3]

    sns.countplot(x = int_data[name] , ax = panel , hue = y)
    panel.set_title(f"{name} Distribution" , fontsize = 10)

# plt.savefig('int_plt.png')

In [None]:
# Categorical features

# user_id is excluded: the loop starts at object column 1

fig , ax = plt.subplots(1 , 2 , figsize = (13 , 5))

for i in range(1 , 3):
    sns.countplot(
        x = obj_data[obj_data.columns[i]] ,
        ax = ax[i - 1] ,
        hue = y
    )

    ax[i - 1].set_title(f"{obj_data.columns[i]} Distribution" , fontsize = 10)

# plt.savefig('obj_plt.png')

In [None]:
# Outlier check with box plots

fig , ax = plt.subplots(1 , 3 , figsize = (13 , 8))

# float columns 0 and 2 share the first panel
sns.boxplot(
    float_data.iloc[: , 0:3:2] ,
    ax = ax[0]
)

# float column 1 gets its own panel
sns.boxplot(
    float_data.iloc[: , 1] ,
    ax = ax[1]
)

# all integer columns
sns.boxplot(
    int_data ,
    ax = ax[2]
)

In [None]:
# Helper that extracts the rows flagged as outliers by the 1.5 * IQR rule

def outlier_process(df , col):
    """Return the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, with both quartiles
    taken from ``np.percentile`` over the whole column.

    Parameters
    ----------
    df : DataFrame that holds the column.
    col : label of the numeric column to screen.

    Returns
    -------
    DataFrame with the columns of ``df``, restricted to the outlier rows.
    """
    values = df[col]

    first_quartile = np.percentile(values , 25)
    third_quartile = np.percentile(values , 75)
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker

    return df[too_low | too_high]

In [None]:
# Share of IQR outliers for every numeric column (float and int)
for col in list(float_data.columns) + list(int_data.columns):
    data = outlier_process(train , col)

    print(f"{col} Outlier Percentage : {len(data) / len(train) * 100}%")

In [None]:
# average_time_per_learning_session
# Keep only the rows that are outliers in this column and split them by dtype

data = outlier_process(train , 'average_time_per_learning_session')

outlier_float = data.select_dtypes('float')
outlier_int = data.select_dtypes('int')
outlier_obj = data.select_dtypes('object')

In [None]:
# Continuous features of the outlier rows: histogram + KDE per column

fig , ax = plt.subplots(1 , 3 , figsize = (13 , 5))

for i in range(3):
    column = outlier_float.columns[i]
    panel = ax[i]

    sns.histplot(data = outlier_float , x = column , kde = True , ax = panel)
    panel.set_title(f"{column} Distribution" , fontsize = 10)

# plt.savefig('float_plt.png')

In [None]:
# Discrete (integer) features of the outlier rows: count plot per column on a 3 x 3 grid

fig , ax = plt.subplots(3 , 3 , figsize = (17 , 13))
plt.subplots_adjust(hspace=0.5)

for idx , name in enumerate(outlier_int.columns):
    # fill the grid row by row
    panel = ax[idx // 3][idx % 3]

    sns.countplot(x = outlier_int[name] , ax = panel)
    panel.set_title(f"{name} Distribution" , fontsize = 10)

# plt.savefig('int_plt.png')

In [None]:
# Categorical features of the outlier rows

# user_id is excluded: the loop starts at object column 1

fig , ax = plt.subplots(1 , 2 , figsize = (13 , 5))

for i in range(1 , 3):
    sns.countplot(
        x = outlier_obj[outlier_obj.columns[i]] ,
        ax = ax[i - 1]
    )

    ax[i - 1].set_title(f"{outlier_obj.columns[i]} Distribution" , fontsize = 10)

# plt.savefig('obj_plt.png')

### 상관 분석

In [None]:
# Clustered heat map of the correlation matrix of the continuous features
float_corr = float_data.corr()

sns.clustermap(float_corr , annot = True , fmt = '.2f' , cmap = 'summer_r')

In [None]:
# Variance inflation factor (VIF) of the continuous features

from statsmodels.stats.outliers_influence import variance_inflation_factor

data = train.select_dtypes(['float'])

# variance_inflation_factor regresses column i on the remaining columns of the
# matrix exactly as given and adds no intercept itself. A constant column is
# therefore appended; without it, features with a non-zero mean get an inflated VIF.
design = data.copy()
design['const'] = 1.0

vif = pd.DataFrame()
# 'const' is the last column, so indices 0 .. data.shape[1] - 1 are the real features
vif['VIF Factor'] = [variance_inflation_factor(design.values , i) for i in range(data.shape[1])]
vif['features'] = data.columns

# Highest VIF first, with a clean 0..n-1 index
vif = vif.sort_values(by = 'VIF Factor' , ascending = False)
vif = vif.reset_index().drop(columns = 'index')
vif

In [None]:
# Scatter plot of recent_learning_achievement against average_login_time
sns.scatterplot(
    data = float_data ,
    x = 'recent_learning_achievement' ,
    y = 'average_login_time'
)

In [None]:
# 크래머 계수

from scipy.stats import chi2_contingency

def cramers_V(var1 , var2):
    """Cramér's V association between two categorical series.

    Builds the contingency table of ``var1`` against ``var2``, takes the
    chi-square statistic from ``chi2_contingency`` and returns
    ``sqrt(chi2 / (n * (min(rows, cols) - 1)))`` where ``n`` is the total count.

    Parameters
    ----------
    var1 , var2 : equally long, aligned series of category values.

    Returns
    -------
    The association strength as a float.
    """
    table = pd.crosstab(var1 , var2 , rownames = None , colnames = None).to_numpy()

    chi2 = chi2_contingency(table)[0]
    total = table.sum()
    smaller_dim = min(table.shape) - 1

    return np.sqrt(chi2 / (total * smaller_dim))

In [None]:
# Pairwise Cramér's V between all integer columns, rounded to 2 decimals

cramers_result = np.array([
    [round(cramers_V(int_data[var1] , int_data[var2]) , 2) for var2 in int_data]
    for var1 in int_data
])
df = pd.DataFrame(cramers_result , columns = int_data.columns , index = int_data.columns)

df

In [None]:
# Clustered heat map of the Cramér's V matrix, leaving out its first row and column
sns.clustermap(df.iloc[1: , 1:] , annot = True , fmt = '.2f' , cmap = 'summer_r')

In [None]:
# Pairwise Cramér's V between the object columns (column 0, user_id, is skipped)

rows = []
for var1 in obj_data.iloc[: , 1:]:
    col = []
    for var2 in obj_data.iloc[: , 1:]:
        cramers = cramers_V(obj_data[var1] , obj_data[var2])
        col.append(round(cramers , 2))
    rows.append(col)

cramers_result = np.array(rows)
df = pd.DataFrame(cramers_result , columns = obj_data.columns[1:] , index = obj_data.columns[1:])

df

In [None]:
# Clustered heat map of the matrix currently held in `df`
sns.clustermap(
    df ,
    annot = True ,
    fmt = '.2f' ,
    cmap = 'summer_r'
)

In [None]:
# Spearman correlation between the ordinal features

# community_engagement_level , preferred_difficulty_level , subscription_type

# Rank correlation needs the categories in their real order. The raw string labels
# cannot express that order (at best they would be ranked alphabetically), so the
# string columns are replaced by the ordinal codes this notebook uses for encoding.
ordinal_codes = {
    'preferred_difficulty_level' : {'Low' : 0 , 'Medium' : 1 , 'High' : 2} ,
    'subscription_type' : {'Basic' : 0 , 'Premium' : 1}
}

sunso = train[['community_engagement_level' , 'preferred_difficulty_level' , 'subscription_type']].copy()

for name , codes in ordinal_codes.items():
    # a column that is already numeric (cell re-run after encoding) is left untouched
    if not pd.api.types.is_numeric_dtype(sunso[name]):
        sunso[name] = sunso[name].map(codes)

rows = []
for var1 in sunso:
    cols = []
    for var2 in sunso:
        val = stats.spearmanr(sunso[var1] , sunso[var2]).correlation
        cols.append(val)
    rows.append(cols)


spearman_result = np.array(rows)
df = pd.DataFrame(spearman_result , columns = sunso.columns , index = sunso.columns)

In [None]:
# Clustered heat map of the matrix currently held in `df`
sns.clustermap(
    df ,
    annot = True ,
    fmt = '.2f' ,
    cmap = 'summer_r'
)

In [None]:
# Spearman correlation between the ordinal features and the first two continuous features

col = ['community_engagement_level' , 'preferred_difficulty_level' , 'subscription_type'] + list(float_data.columns[:2])

data = train[col].copy()

# Rank correlation needs the categories in their real order. The raw string labels
# cannot express that order (at best they would be ranked alphabetically), so the
# string columns are replaced by the ordinal codes this notebook uses for encoding.
ordinal_codes = {
    'preferred_difficulty_level' : {'Low' : 0 , 'Medium' : 1 , 'High' : 2} ,
    'subscription_type' : {'Basic' : 0 , 'Premium' : 1}
}

for name , codes in ordinal_codes.items():
    # a column that is already numeric (cell re-run after encoding) is left untouched
    if not pd.api.types.is_numeric_dtype(data[name]):
        data[name] = data[name].map(codes)

rows = []
for var1 in data:
    cols = []
    for var2 in data:
        cols.append(stats.spearmanr(data[var1] , data[var2]).correlation)
    rows.append(cols)

rows = np.array(rows)
df = pd.DataFrame(rows , columns = data.columns , index = data.columns)

df

In [None]:
# Clustered heat map of the matrix currently held in `df`
sns.clustermap(
    df ,
    annot = True ,
    fmt = '.2f' ,
    cmap = 'summer_r'
)

## 전처리

In [None]:
# Remove the user_id identifier column from both frames

# Rebinding instead of an in-place drop, and ignoring an already-missing column,
# keeps this cell re-runnable: a second execution of the in-place version raised KeyError.
train = train.drop(columns = ['user_id'] , errors = 'ignore')
test = test.drop(columns = ['user_id'] , errors = 'ignore')

In [None]:
"""
최종 Drop 컬럼

recent_learning_achievement
community_engagement_level
"""

In [None]:
# Ordinal encoding of the two categorical features, same mapping for train and test

encodings = [
    ('preferred_difficulty_level' , {'Low' : 0 , 'Medium' : 1 , 'High' : 2}) ,
    ('subscription_type' , {'Basic' : 0 , 'Premium' : 1})
]

for column , mapp in encodings:
    for frame in (train , test):
        # labels that are not in the mapping become NaN
        frame[column] = frame[column].map(mapp)

In [None]:
# Reduce the skew of average_time_per_learning_session with log1p

data = train[['average_time_per_learning_session']]

# Preview the transformed distribution and its skewness before applying it
sns.histplot(
    np.log1p(data) ,
    kde = True
)

print(np.log1p(data).skew())

# Apply the same transform to train and test
train['average_time_per_learning_session'] = np.log1p(train['average_time_per_learning_session'])
test['average_time_per_learning_session'] = np.log1p(test['average_time_per_learning_session'])

In [None]:
# Separate the label from the features; 'target' is dropped from train in place
y = train['target']
train.drop(['target'] , axis = 1 , inplace = True)

### 전처리 추가 (BaselineModel 이후)

In [None]:
# binning
# Count-like columns that are discretised below, taken from train and test

binning_train = train[['total_completed_courses' , 'abandoned_learning_sessions' , 'customer_inquiry_history']]
binning_test = test[['total_completed_courses' , 'abandoned_learning_sessions' , 'customer_inquiry_history']]

In [None]:
# Inspect the value ranges: the three equal-width intervals pd.cut produces per column

for col in binning_train.columns:
    print(f"{col} Range : {pd.cut(binning_train[col] , 3).unique()}\n")

In [None]:
# total_completed_courses -> ordinal bins: [0, 10] -> 0, (10, 18] -> 1, anything else -> 2

courses = train['total_completed_courses']
in_low_bin = (courses >= 0) & (courses <= 10)
in_mid_bin = (courses > 10) & (courses <= 18)

train['total_completed_courses'] = np.where(in_low_bin , 0 , np.where(in_mid_bin , 1 , 2))

In [None]:
# abandoned_learning_sessions -> ordinal bins: [0, 4] -> 0, (4, 8] -> 1, anything else -> 2

abandoned = train['abandoned_learning_sessions']
in_low_bin = (abandoned >= 0) & (abandoned <= 4)
in_mid_bin = (abandoned > 4) & (abandoned <= 8)

train['abandoned_learning_sessions'] = np.where(in_low_bin , 0 , np.where(in_mid_bin , 1 , 2))

In [None]:
# customer_inquiry_history -> ordinal bins: [0, 3] -> 0, (3, 6] -> 1, anything else -> 2

inquiries = train['customer_inquiry_history']
in_low_bin = (inquiries >= 0) & (inquiries <= 3)
in_mid_bin = (inquiries > 3) & (inquiries <= 6)

train['customer_inquiry_history'] = np.where(in_low_bin , 0 , np.where(in_mid_bin , 1 , 2))

In [None]:
# Same range inspection for the test columns
for col in binning_test.columns:
    print(f"{col} Range : {pd.cut(binning_test[col] , 3).unique()}\n")

In [None]:
# total_completed_courses (test)

# The test set has to be encoded with the cut points used for the training set:
# [0, 10] -> 0, (10, 18] -> 1, anything else -> 2. The upper edge used to be 19,
# so a raw value of 19 became bin 1 here while it is bin 2 in train.
test['total_completed_courses'] = np.where(
    (0 <= test['total_completed_courses']) & (test['total_completed_courses'] <= 10) , 0 , np.where(
        (10 < test['total_completed_courses']) & (test['total_completed_courses'] <= 18) , 1 , 2
    )
)

In [None]:
# abandoned_learning_sessions (test)

# The test set has to be encoded with the cut points used for the training set:
# [0, 4] -> 0, (4, 8] -> 1, anything else -> 2. The upper edge used to be 7,
# so a raw value of 8 became bin 2 here while it is bin 1 in train.
test['abandoned_learning_sessions'] = np.where(
    (0 <= test['abandoned_learning_sessions']) & (test['abandoned_learning_sessions'] <= 4) , 0 , np.where(
        (4 < test['abandoned_learning_sessions']) & (test['abandoned_learning_sessions'] <= 8) , 1 , 2
    )
)

In [None]:
# customer_inquiry_history (test)

# The test set has to be encoded with the cut points used for the training set:
# [0, 3] -> 0, (3, 6] -> 1, anything else -> 2. The upper edge used to be 7,
# so a raw value of 7 became bin 1 here while it is bin 2 in train.
test['customer_inquiry_history'] = np.where(
    (0 <= test['customer_inquiry_history']) & (test['customer_inquiry_history'] <= 3) , 0 , np.where(
        (3 < test['customer_inquiry_history']) & (test['customer_inquiry_history'] <= 6) , 1 , 2
    )
)

In [None]:
# Re-select the count columns so the views hold the binned values
binning_train = train[['total_completed_courses' , 'abandoned_learning_sessions' , 'customer_inquiry_history']]
binning_test = test[['total_completed_courses' , 'abandoned_learning_sessions' , 'customer_inquiry_history']]

In [None]:
# Time binning

# subscription_duration recent_login_time monthly_active_learning_days

t_data = train[['subscription_duration' , 'recent_login_time' , 'monthly_active_learning_days']]
t_test_data = test[['subscription_duration' , 'recent_login_time' , 'monthly_active_learning_days']]

# Three equal-width intervals per training column, as produced by pd.cut
for col in t_data.columns:
    print(f"{col} Range : {pd.cut(t_data[col] , 3).unique()}\n")

In [None]:
# Same range inspection for the test columns
for col in t_test_data.columns:
    print(f"{col} Range : {pd.cut(t_test_data[col] , 3).unique()}\n")

In [None]:
# Time-like features -> three ordinal bins, with identical cut points for train and test.
# For cut points (a, b): [0, a] -> 0, (a, b] -> 1, anything else -> 2.

time_cut_points = {
    'subscription_duration' : (8 , 16) ,
    'recent_login_time' : (10 , 20) ,
    'monthly_active_learning_days' : (9 , 16)
}

for frame in (train , test):
    for name , (first_edge , second_edge) in time_cut_points.items():
        values = frame[name]
        frame[name] = np.where(
            (0 <= values) & (values <= first_edge) , 0 , np.where(
                (first_edge < values) & (values <= second_edge) , 1 , 2
            )
        )

In [None]:
# Re-select the time columns so the views hold the binned values
t_data = train[['subscription_duration' , 'recent_login_time' , 'monthly_active_learning_days']]
t_test_data = test[['subscription_duration' , 'recent_login_time' , 'monthly_active_learning_days']]

In [None]:
# Bin frequencies after discretisation: train on the top row, test on the bottom row

fig , ax = plt.subplots(2 , 3 , figsize = (12 , 8))

fig.subplots_adjust(wspace = 0.5)

for row , data in enumerate([binning_train , binning_test]):
    for col , name in enumerate(data.columns):
        sns.countplot(data = data , x = name , ax = ax[row][col])

In [None]:
# Bin frequencies of the time columns: train on the top row, test on the bottom row
fig , ax = plt.subplots(2 , 3 , figsize = (12 , 8))

fig.subplots_adjust(wspace = 0.5)

row = 0

for data in[t_data , t_test_data]:
    for col in range(len(data.columns)):
        sns.countplot(
            data = data ,
            x = data.columns[col] ,
            ax = ax[row][col]
        )
    row += 1

In [None]:
# Standardise the training features; the fitted scaler is reused for the test set

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_train = pd.DataFrame(scaler.fit_transform(train) , columns = train.columns)

In [None]:
# Apply the scaler fitted on train to the test features
scaled_test = pd.DataFrame(scaler.transform(test) , columns = test.columns)

In [None]:
# Handle the class imbalance with SMOTE oversampling

from imblearn.over_sampling import SMOTE

# Fixed seed: SMOTE draws synthetic samples at random, so without it every run
# produces a different training set and different downstream scores.
smote = SMOTE(random_state = 42)

print(f"SMOTE 전 데이터 크기 : {scaled_train.shape}")

# NOTE: `y` is replaced by the resampled labels, so this cell must run exactly
# once after `y` has been extracted from train.
# NOTE(review): oversampling happens before the train/validation split further
# down, so validation rows can be synthetic — confirm this is intended.
smote_train , y = smote.fit_resample(scaled_train , y)

print(f"SMOTE 후 데이터 크기 : {smote_train.shape}")

### PCA

In [None]:
from sklearn.decomposition import PCA

# Keep every component for the variance analysis. The count is derived from the
# data instead of being hard-coded, so the component count and the column labels
# below can never disagree when the feature set changes.
n_features = smote_train.shape[1]

pca = PCA(n_components = n_features)
pca_array = pca.fit_transform(smote_train)

pca_df = pd.DataFrame(
    pca_array ,
    index = smote_train.index ,
    columns = [f"factor{i}" for i in range(1 , n_features + 1)]
)

In [None]:
# Per-component variance table, indexed pca1 .. pcaN
pca_analysis = pd.DataFrame(
    {
        # first column holds pca.explained_variance_, second pca.explained_variance_ratio_
        '설명 가능한 분산 비율' : pca.explained_variance_ ,
        '기여율' : pca.explained_variance_ratio_
    } ,
    index = np.array([f"pca{i}" for i in range(1 , smote_train.shape[1] + 1)])
)

# Running total of the variance ratio column
pca_analysis['누적 기여율'] = pca_analysis['기여율'].cumsum()
pca_analysis

In [None]:
# Scree Plot
# x is the 0-based component position, y the explained variance ratio

sns.lineplot(
    x = [i for i in range(len(pca_analysis.index))] ,
    y = pca_analysis['기여율'] ,
    marker = 'o'
)

plt.title('Scree plot')

In [None]:
# Project the training and test data onto the leading principal components.
# The PCA is fitted on the (resampled) training data only and then applied to test.

n_components = 10

pca = PCA(n_components = n_components)
pca_train = pca.fit_transform(smote_train)
# The PCA is fitted on standardised features, so the test set has to be the
# standardised frame as well; the raw `test` frame is on a different scale and
# would be projected to meaningless coordinates.
pca_test = pca.transform(scaled_test)

pca_columns = [f"PCA {i}" for i in range(1 , n_components + 1)]

pca_train = pd.DataFrame(
    pca_train , index = smote_train.index ,
    columns = pca_columns
)
pca_test = pd.DataFrame(
    pca_test , index = test.index ,
    columns = pca_columns
)

## BaselineModel 제작

In [None]:
# RandomForestClssifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score , recall_score , precision_score , confusion_matrix

from xgboost import XGBClassifier

import lightgbm

In [None]:
# Helper: fit a classifier and report its validation metrics

def model_test(classifier , trainX , trainY , validX , validY):
    """Fit ``classifier`` on the training split and print its scores.

    Prints the macro-averaged F1, recall and precision of the predictions for
    ``validX`` against ``validY``, followed by ``classifier.score`` on the
    training and on the validation split. Nothing is returned; the classifier
    object passed in is the one that gets fitted.
    """
    classifier.fit(trainX , trainY)
    prediction = classifier.predict(validX)

    macro_metrics = (
        ('macro f1' , f1_score) ,
        ('recall' , recall_score) ,
        ('precision' , precision_score)
    )
    for label , metric in macro_metrics:
        print(f"{label} : {metric(validY , prediction , average = 'macro')}")

    print(f"Train Score : {classifier.score(trainX , trainY)}")
    print(f"Valid Score : {classifier.score(validX , validY)}")

In [None]:
# Hold out 20 % of the PCA features for validation (fixed seed)
trainX , validX , trainY , validY = train_test_split(pca_train , y , test_size = 0.2 , random_state = 42)

In [None]:
# Baseline model: logistic regression with default settings
lg_model = LogisticRegression()
lg_model.fit(trainX , trainY)

In [None]:
# prediction
# Baseline predictions on the validation split

prediction = lg_model.predict(validX)

In [None]:
# Validation: macro-averaged F1, recall and precision of the baseline

print(f"macro f1 : {f1_score(validY , prediction , average = 'macro')}")
print(f"recall : {recall_score(validY , prediction , average = 'macro')}")
print(f"precision : {precision_score(validY , prediction , average = 'macro')}")

In [None]:
# Baseline `score` on the training and validation splits
print(f"Train Score : {lg_model.score(trainX , trainY)}")
print(f"Valid Score : {lg_model.score(validX , validY)}")

In [None]:
# Logistic regression coefficients per PCA feature, largest first

coef_data = pd.Series(lg_model.coef_[0] , index = trainX.columns).sort_values(ascending = False)
coef_data

In [None]:
# Bar chart of the sorted logistic regression coefficients

sns.barplot(
    x = coef_data.values ,
    y = coef_data.index
)

In [None]:
# submission
# Load the sample submission file as the template to fill in

submission = pd.read_csv(path + '/sample_submission.csv')

In [None]:
# Baseline predictions for the PCA-transformed test set
submission_prediction = lg_model.predict(pca_test)

In [None]:
# Write the predictions into the template's target column
submission['target'] = submission_prediction

In [None]:
# Save as a csv file (no index column)

submission.to_csv(path + '/submission.csv' , index = False)

### Feature Selection

In [None]:
!pip install eli5

In [None]:
import eli5

from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier

# Permutation importance of the baseline logistic model, computed on the validation split
permutation = PermutationImportance(lg_model , scoring = 'accuracy' , random_state = 42).fit(validX , validY)
eli5.show_weights(permutation , feature_names = validX.columns.tolist())

## 모델링

In [None]:
# model import

from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier , ExtraTreesClassifier , VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , cross_val_score , StratifiedKFold , learning_curve

In [None]:
# 10-fold stratified splitter shared by the cross-validation and grid-search cells
kfold = StratifiedKFold(n_splits = 10)

### 교차검증 & 모델선택

In [None]:
# Cross-validation & model selection: compare the candidate classifiers by accuracy

random_state = 42

# (display label, estimator) pairs; the order defines the order of the bars
candidates = [
    ('SVC' , SVC(random_state = random_state)) ,
    ('DecisionTree' , DecisionTreeClassifier(random_state = random_state)) ,
    ('AdaBoost' , AdaBoostClassifier(random_state = random_state)) ,
    ('RandomForest' , RandomForestClassifier(random_state = random_state)) ,
    ('ExtraTrees' , ExtraTreesClassifier(random_state = random_state)) ,
    ('GradientBoosting' , GradientBoostingClassifier(random_state = random_state)) ,
    ('MultipleLayerPerceptron' , MLPClassifier(random_state = random_state)) ,
    ('KNeighboors' , KNeighborsClassifier()) ,
    ('LogisticRegression' , LogisticRegression(random_state = random_state)) ,
    ('LGBMClassifier' , lightgbm.LGBMClassifier(random_state = random_state)) ,
    ('XGBClassifier' , XGBClassifier(random_state = random_state))
]
classifiers = [model for _ , model in candidates]

# one array of fold accuracies per classifier
cv_results = [
    cross_val_score(classifier , trainX , y = trainY , scoring = 'accuracy' , cv = kfold , n_jobs = 4)
    for classifier in classifiers
]

cv_means = [scores.mean() for scores in cv_results]
cv_std = [scores.std() for scores in cv_results]

cv_res = pd.DataFrame(
    {
        "CrossValMeans" : cv_means ,
        "CrossValErrors" : cv_std ,
        "Algorithm" : [label for label , _ in candidates]
    }
)

# horizontal bars with the fold standard deviation as error bars
g = sns.barplot(x = "CrossValMeans" , y = "Algorithm" , data = cv_res , palette = 'Set3' , orient = 'h' , xerr = cv_std)
plt.xlabel("Mean Accuracy")
plt.title("Cross Validation Scores")

In [None]:
# Gradient Boosting: grid search over the boosting hyper-parameters

# Fixed seed: max_features < 1 makes the trees subsample features at random.
GBC = GradientBoostingClassifier(random_state = 42)

gb_param_grid = {
    # 'deviance' is the former name of the logistic loss. scikit-learn renamed it
    # to 'log_loss' and later removed the old spelling, which makes every fit fail.
    'loss' : ['log_loss'] ,
    'n_estimators' : [100 , 200 , 300] ,
    'learning_rate' : [0.1 , 0.05 , 0.01] ,
    'max_depth' : [4 , 8] ,
    'min_samples_leaf' : [100 , 150] ,
    'max_features' : [0.3 , 0.1]
}

gsGBC = GridSearchCV(GBC , param_grid = gb_param_grid , cv = kfold , scoring = 'accuracy' ,
                     n_jobs = 4 , verbose = 1)

gsGBC.fit(trainX , trainY)

# Estimator refitted on all of trainX with the best parameter combination
GBC_best = gsGBC.best_estimator_

In [None]:
# Best mean cross-validated accuracy found by the Gradient Boosting search
gsGBC.best_score_

In [None]:
# KNeighborsClassifier: grid search over neighbours, weighting and distance metric

KN = KNeighborsClassifier()

kn_param_grid = {
    'n_neighbors' : [3 , 5 , 7 , 9 , 11 , 13] ,
    'weights' : ['uniform' , 'distance'] ,
    # NOTE(review): 'minkowski' with the default power is presumably the same
    # distance as 'euclidean', which would make one of them redundant — confirm.
    'metric' : ['minkowski' , 'euclidean' , 'manhattan']
}

gsKN = GridSearchCV(KN , param_grid = kn_param_grid , cv = kfold , scoring = 'accuracy' ,
                    n_jobs = 4 , verbose = 1)

gsKN.fit(trainX , trainY)

# Estimator refitted with the best parameter combination
KN_best = gsKN.best_estimator_

In [None]:
# Best mean cross-validated accuracy found by the KNeighbors search
gsKN.best_score_

In [None]:
# Logistic Regression: grid search over penalty, strength, solver and iteration budget

# Fixed seed: the sag / saga / liblinear solvers shuffle the data.
LR = LogisticRegression(random_state = 42)

lr_C = np.logspace(-4, 4, 20)
lr_max_iter = [100, 1000, 2500, 5000]

# Only penalty / solver pairs that LogisticRegression supports are listed.
# One cross-product of all penalties and all solvers contains many unsupported
# pairs (e.g. l1 with lbfgs); those fits fail, waste time and produce no score.
lr_param_grid = [
    {
        'penalty' : ['l2'] ,
        'C' : lr_C ,
        'solver' : ['lbfgs' , 'newton-cg' , 'liblinear' , 'sag' , 'saga'] ,
        'max_iter' : lr_max_iter
    } ,
    {
        'penalty' : ['l1'] ,
        'C' : lr_C ,
        'solver' : ['liblinear' , 'saga'] ,
        'max_iter' : lr_max_iter
    } ,
    {
        # elastic net is only available with saga and needs an explicit l1_ratio
        'penalty' : ['elasticnet'] ,
        'l1_ratio' : [0.25 , 0.5 , 0.75] ,
        'C' : lr_C ,
        'solver' : ['saga'] ,
        'max_iter' : lr_max_iter
    } ,
    {
        # unpenalised model: spelled None (the string 'none' is deprecated);
        # C has no effect without a penalty, so it is not searched here
        'penalty' : [None] ,
        'solver' : ['lbfgs' , 'newton-cg' , 'sag' , 'saga'] ,
        'max_iter' : lr_max_iter
    }
]

gsLR = GridSearchCV(LR , param_grid = lr_param_grid , cv = kfold , scoring = 'accuracy' ,
                    n_jobs = 4 , verbose = 1)

gsLR.fit(trainX , trainY)

# Estimator refitted with the best parameter combination
LR_best = gsLR.best_estimator_

In [None]:
# Best mean cross-validated accuracy found by the Logistic Regression search
gsLR.best_score_

In [None]:
# XGBClassifier: grid search over the boosting hyper-parameters

XGB = XGBClassifier(random_state = 42)

xgb_param_grid = {
    'n_estimators' : [100 , 200 , 300] ,
    'learning_rate' : [0.001 , 0.01 , 0.1 , 0.2] ,
    'min_child_weight' : [1 , 3 , 5 , 7] ,
    'max_depth' : [3 , 5 , 7 , 9] ,
    # gamma is the minimum loss reduction needed for a split and must be >= 0;
    # the former value -1 is rejected by XGBoost, so every such fit failed.
    'gamma' : [0 , 0.5 , 1]
}

gsXGB = GridSearchCV(XGB , param_grid = xgb_param_grid , cv = kfold , scoring = 'accuracy' ,
                     n_jobs = 4 , verbose = 1)

gsXGB.fit(trainX , trainY)

# Estimator refitted with the best parameter combination
XGB_best = gsXGB.best_estimator_

In [None]:
# Best mean cross-validated accuracy found by the XGBoost search
gsXGB.best_score_

In [None]:
# lightGBM: grid search over the boosting hyper-parameters

l_gbm = lightgbm.LGBMClassifier(random_state = 42)

l_gbm_param_grid = {
    'n_estimators' : [100 , 200 , 300] ,
    'learning_rate' : [0.001 , 0.01 , 0.1 , 0.2] ,
    'min_child_weight' : [1 , 3 , 5 , 7] ,
    'max_depth' : [3 , 5 , 7 , 9] ,
    # LightGBM has no 'gamma' parameter (that is the XGBoost name). Its
    # counterpart is min_split_gain, which must be >= 0. Searching 'gamma'
    # only tripled the number of fits without changing the model.
    'min_split_gain' : [0 , 0.5 , 1]
}

gsL = GridSearchCV(l_gbm , param_grid = l_gbm_param_grid , cv = kfold , scoring = 'accuracy' ,
                   n_jobs = 4 , verbose = 1)

gsL.fit(trainX , trainY)

# Estimator refitted with the best parameter combination
L_best = gsL.best_estimator_

In [None]:
# Best mean cross-validated accuracy found by the LightGBM search
gsL.best_score_

In [None]:
import joblib

joblib.dump(GBC_best , '/content/drive/MyDrive/dacon/gradient.h5')
joblib.dump(KN_best , '/content/drive/MyDrive/dacon/KN.h5')
joblib.dump(LR_best , '/content/drive/MyDrive/dacon/LR.h5')
joblib.dump(XGB_best , '/content/drive/MyDrive/dacon/XGB.h5')

In [None]:
# model import (tuning model)

GBC_best = joblib.load('/content/drive/MyDrive/dacon/gradient.h5')
KN_best = joblib.load('/content/drive/MyDrive/dacon/KN.h5')
LR_best = joblib.load('/content/drive/MyDrive/dacon/LR.h5')
XGB_best = joblib.load('/content/drive/MyDrive/dacon/XGB.h5')

In [None]:
# Learning-curve visualisation

def plot_learning_curve(estimator , title , X , y , ylim = None , cv = None ,
                        n_jobs = -1 , train_sizes = np.linspace(.1 , 1.0 , 5)):
    """Draw the training and cross-validation score curves of ``estimator``.

    ``learning_curve`` is evaluated on (X, y) at the requested ``train_sizes``.
    For each curve the mean score per size is drawn as a line and +/- one
    standard deviation as a shaded band (red: training, green: cross-validation).

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    title : title of the new figure.
    X , y : data handed to ``learning_curve``.
    ylim : optional (low, high) pair for the y axis.
    cv , n_jobs , train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, with the newly created figure as the current figure.
    """
    plt.figure()
    plt.title(title)

    if ylim is not None:
        plt.ylim(*ylim)

    plt.xlabel('Training examples')
    plt.ylabel('Score')

    train_sizes , train_scores , test_scores = learning_curve(
        estimator , X , y , cv = cv , n_jobs = n_jobs , train_sizes = train_sizes
    )

    # (mean, std, colour, legend label) per curve, training curve first
    curves = [
        (np.mean(scores , axis = 1) , np.std(scores , axis = 1) , colour , label)
        for scores , colour , label in (
            (train_scores , 'r' , 'Training score') ,
            (test_scores , 'g' , 'Cross-validation score')
        )
    ]

    plt.grid()

    # bands first, then the lines, so the artists are created in the usual order
    for mean , std , colour , _ in curves:
        plt.fill_between(train_sizes , mean - std , mean + std , alpha = 0.1 , color = colour)

    for mean , _ , colour , label in curves:
        plt.plot(train_sizes , mean , 'o-' , color = colour , label = label)

    plt.legend(loc = 'best')

    return plt

# One learning-curve figure per tuned model, all on the training split with the shared splitter
g = plot_learning_curve(GBC_best , "GradientBoosting learning curves" , trainX , trainY , cv = kfold)
g = plot_learning_curve(KN_best , "KNeighbors learning curves" , trainX , trainY , cv = kfold)
g = plot_learning_curve(LR_best , "Logistic learning curves" , trainX , trainY , cv = kfold)
g = plot_learning_curve(XGB_best , "XGB learning curves" , trainX , trainY , cv = kfold)
g = plot_learning_curve(L_best , "lightGBM learning curves" , trainX , trainY , cv = kfold)

### Ensemble modeling

In [None]:
# Soft-voting ensemble of the tuned Gradient Boosting and Logistic Regression models
votingC = VotingClassifier(
    estimators = [
        ('GBC' , GBC_best) ,
        ('LR' , LR_best)
    ] ,
    voting = 'soft' , n_jobs = 4
)

In [None]:
# fitting
# Train the ensemble on the training split

votingC.fit(trainX , trainY)

In [None]:
# Save the fitted votingC model with joblib

joblib.dump(votingC , '/content/drive/MyDrive/dacon/2023.11.23/votingC_ver3.h5')

In [None]:
# Ensemble predictions on the validation split
prediction = votingC.predict(validX)

In [None]:
# Macro-averaged F1, recall and precision of the ensemble on the validation split
print(f"macro f1 : {f1_score(validY , prediction , average = 'macro')}")
print(f"recall : {recall_score(validY , prediction , average = 'macro')}")
print(f"precision : {precision_score(validY , prediction , average = 'macro')}")

In [None]:
# Ensemble `score` on the training split and on the validation split
# (the second line is labelled "test" but is evaluated on validX / validY)
print(f"Voting model {votingC.score(trainX , trainY)}")
print(f"Voting model (test) {votingC.score(validX , validY)}")

In [None]:
# Confusion matrix of the ensemble predictions on the validation split
cm = confusion_matrix(validY , prediction)

# Annotated heat map with integer cell labels
sns.heatmap(
    cm ,
    annot = True ,
    fmt = ".0f" ,
    cmap = 'summer_r'
)

plt.title("Confusion Matrix")

### Submission

In [None]:
# submission
# Load the sample submission file as the template to fill in

submission = pd.read_csv(path + '/sample_submission.csv')

In [None]:
# Ensemble predictions for the PCA-transformed test set
submission_prediction = votingC.predict(pca_test)

In [None]:
# Write the predictions into the template's target column
submission['target'] = submission_prediction

In [None]:
# Save as a csv file (no index column)
# NOTE: same output path as the baseline submission written earlier, which is overwritten

submission.to_csv(path + '/submission.csv' , index = False)