In [None]:
from google.colab import drive , files
# Mount Google Drive (Colab only) so the files under /content/drive are readable.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score , recall_score , precision_score , f1_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

import missingno as msno
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas / sklearn — consider a narrower filter.
warnings.filterwarnings('ignore')

# Global seaborn theme applied to all figures in this notebook.
sns.set(style = 'whitegrid' , context = 'notebook' , palette = 'deep')

In [None]:
def metrics(y_true , y_pred):
    """Print accuracy plus macro-averaged recall, precision and F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The four scores are written to stdout, one per line.
    """
    macro = {'average' : 'macro'}
    report = (
        ('Accuracy' , accuracy_score , {}) ,
        ('Recall' , recall_score , macro) ,
        ('Precision' , precision_score , macro) ,
        ('f1' , f1_score , macro) ,
    )

    # Each score is computed right before it is printed, in the listed order.
    for label , scorer , extra in report:
        print(f'{label} : {scorer(y_true , y_pred , **extra)}')

# 데이터 분석 Competition (Signate)

- 1. 데이터 파악

- 2. EDA + 전처리

- 3. 베이스라인 모델 설계

- 4. Feature Engineering

- 5. 예측 모델 구현

- 6. 튜닝작업

In [None]:
# Drive folder holding the competition CSV files. File names are appended by
# plain string concatenation, so the trailing slash is required.
path = '/content/drive/MyDrive/signate_beginner_challenge/2023.06.07/'

In [None]:
# Load the training and test sets from the Drive folder.
train_df , test_df = (pd.read_csv(path + csv_name) for csv_name in ("train.csv" , "test.csv"))

## 1. 데이터 파악

In [None]:
# Peek at the first three training rows.
train_df.head(3)

In [None]:
# Column dtypes and non-null counts, a blank separator line, then summary statistics.
train_df.info()
print()
train_df.describe()

In [None]:
# Skewness and kurtosis of the numeric columns only. The categorical columns
# (sex, smoker, region) are not encoded until later in the notebook, and
# pandas >= 2.0 raises on non-numeric columns unless numeric_only is set.
print(train_df.skew(numeric_only = True) , "\n")
print(train_df.kurtosis(numeric_only = True))

In [None]:
# Missing-value overview of the training frame (one column per feature).
msno.matrix(train_df , color = (0.27, 0.52, 1.0))

In [None]:
# Inspect the target variable: class shares (pie) next to class counts (bars).

f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))
pie_ax , bar_ax = ax

charge_counts = train_df['charges'].value_counts()
charge_counts.plot.pie(explode = [0 , 0.1 , 0.2] , autopct = '%1.1f%%' ,
                       ax = pie_ax , shadow = True)
pie_ax.set_title('charges')
pie_ax.set_ylabel('')

sns.countplot(data = train_df , x = 'charges' , ax = bar_ax)
bar_ax.set_title('charges')

plt.show()

In [None]:
# Numeric feature: BMI histogram with a KDE overlay (left) and BMI box plot (right).
f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))
hist_ax , box_ax = ax

sns.histplot(data = train_df , x = 'bmi' , kde = True , ax = hist_ax)
hist_ax.set_title('BMI')

sns.boxplot(data = train_df , x = 'bmi' , ax = box_ax)
box_ax.set_title('BMI boxplot')

In [None]:
# Categorical features: one count plot per column on a 2x2 grid.
f , ax = plt.subplots(2 , 2 , figsize = (18 , 10))

# ax.flat walks the grid row by row, which matches the column order below.
for column , panel in zip(['sex' , 'children' , 'smoker' , 'region'] , ax.flat):
    sns.countplot(data = train_df , x = column , ax = panel)
    panel.set_title(column)

In [None]:
# Age distribution in ten bins with a KDE overlay.
age_hist_ax = sns.histplot(data = train_df , x = 'age' , bins = 10 , kde = True)
age_hist_ax.set_title('age')

## 2. EDA + 전처리

In [None]:
# Class counts split by region (left) and by smoker status (right).

f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))

for feature , panel in zip(('region' , 'smoker') , ax):
    sns.countplot(data = train_df , x = feature , hue = 'charges' , ax = panel)
    panel.set_title('{} and charges'.format(feature))

In [None]:
# Class counts by sex (left) and age histogram coloured by class (right).

f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))
sex_ax , age_ax = ax

sns.countplot(data = train_df , x = 'sex' , hue = 'charges' , ax = sex_ax)
sex_ax.set_title('sex and charges')

sns.histplot(data = train_df , x = 'age' , hue = 'charges' , bins = 10 , ax = age_ax)
age_ax.set_title('age and charges')

In [None]:
# Explore BMI against age, sex, smoker status and region.

f , ax = plt.subplots(2 , 2 , figsize = (18 , 8))
(age_ax , sex_ax) , (smoker_ax , region_ax) = ax

# Top-left: BMI as a function of age.
sns.lineplot(data = train_df , x = 'age' , y = 'bmi' , ax = age_ax)
age_ax.set_title('age and bmi line plot')

# Top-right: mean BMI per sex.
sex_bmi_mean = train_df.groupby('sex').agg({'bmi' : 'mean'})
sns.barplot(data = sex_bmi_mean , x = sex_bmi_mean.index , y = 'bmi' , ax = sex_ax)
sex_ax.set_title('sex and bmi mean')

# Bottom-left: BMI spread for smokers vs. non-smokers.
sns.boxplot(data = train_df , x = 'smoker' , y = 'bmi' , ax = smoker_ax)
smoker_ax.set_title('smoker and bmi boxplot')

# Bottom-right: mean BMI per region.
bmi_groupby_region = train_df.groupby(['region']).agg({'bmi' : 'mean'})
sns.barplot(data = bmi_groupby_region , x = bmi_groupby_region.index , y = 'bmi' , ax = region_ax)
region_ax.set_title('bmi_groupby_region(mean)')

plt.subplots_adjust(hspace = 0.5)

In [None]:
# Correlation heatmap over the numeric columns. The categorical columns are
# still unencoded here, and pandas >= 2.0 raises on them unless numeric_only
# is set (the keyword is available for corr() from pandas 1.5).
sns.heatmap(train_df.corr(numeric_only = True) , cmap = 'bone' , annot = True)

In [None]:
# Preprocessing: encode the string columns `sex` and `smoker` as integers.
#
# The encoder is fitted on the training column only and then applied to the
# test column, so both frames share one label -> code mapping. Fitting a second
# time on the test set can silently produce a different mapping (for example
# when a category is missing there); `transform` instead fails loudly on a
# label that was never seen in training.

le = LabelEncoder()

for column in ['sex' , 'smoker']:
    train_df[column] = le.fit_transform(train_df[column])
    test_df[column] = le.transform(test_df[column])

In [None]:
# One-hot encode `region`. The test dummies are re-indexed to the training
# dummy columns so both frames end up with the same region columns in the same
# order, even if a region is absent from (or only present in) the test set.
one_hot_train = pd.get_dummies(train_df['region'])
one_hot_test = pd.get_dummies(test_df['region']).reindex(columns = one_hot_train.columns , fill_value = 0)

# Replace the original string column with its dummy columns.
train_df = train_df.drop(['region'] , axis = 1)
test_df = test_df.drop(['region'] , axis = 1)

train_df = pd.concat([train_df , one_hot_train] , axis = 1)
test_df = pd.concat([test_df , one_hot_test] , axis = 1)

In [None]:
# Quartile-bin the training ages into a temporary 'group' column and display the frame.
train_df['group'] = pd.qcut(train_df['age'] , 4)
train_df

In [None]:
# Quartile-bin the test ages into a temporary 'group' column and display the frame.
test_df['group'] = pd.qcut(test_df['age'] , 4)
test_df

In [None]:
# Number of training rows that fall into each age quartile.
train_df.groupby(['group'])[['age']].count()

In [None]:
# Number of test rows that fall into each age quartile.
test_df.groupby(['group'])[['age']].count()

In [None]:
# Bin age into four ordinal codes (0-3).
#
# Vectorised replacement for the former row loop, which read rows by position
# (`.iloc[idx]`) but wrote them by label (`.loc[idx]`) and was therefore only
# correct on a default RangeIndex. Ages outside (17, 64] keep their original
# value, exactly as before.
train_age = train_df['age']

train_df['age'] = np.select(
    [
        (train_age > 17) & (train_age <= 27) ,
        (train_age > 27) & (train_age <= 40) ,
        (train_age > 40) & (train_age <= 50) ,
        (train_age > 50) & (train_age <= 64) ,
    ] ,
    [0 , 1 , 2 , 3] ,
    default = train_age ,
)

In [None]:
# Remove the temporary quartile helper column.
train_df = train_df.drop(['group'] , axis = 1)

In [None]:
# Bin the test ages with the SAME edges the training set uses (27 / 40 / 50).
#
# The previous version split the first two bins at 28 (a test-set quantile),
# so a 28-year-old was coded 1 in training but 0 in the test set. The outer
# bins are open-ended so every test age receives a code even if it lies
# outside the 17-64 range. Vectorised with np.select; the first matching
# condition wins.
test_age = test_df['age']

test_df['age'] = np.select(
    [
        test_age <= 27 ,
        test_age <= 40 ,
        test_age <= 50 ,
        test_age > 50 ,
    ] ,
    [0 , 1 , 2 , 3] ,
    default = test_age ,  # only reached for missing values
)

In [None]:
# Remove the temporary quartile helper column.
test_df = test_df.drop(['group'] , axis = 1)

In [None]:
# Quartile-bin the training BMI values into a temporary 'group' column and display the frame.
train_df['group'] = pd.qcut(train_df['bmi'] , 4)
train_df

In [None]:
# Number of training rows that fall into each BMI quartile.
train_df.groupby(['group'])[['bmi']].count()

In [None]:
# Bin BMI into four ordinal codes (0-3).
#
# Vectorised replacement for the former row loop, which read rows by position
# (`.iloc[idx]`) but wrote them by label (`.loc[idx]`) and was therefore only
# correct on a default RangeIndex. Values outside (20.627, 47.291] keep their
# original value, exactly as before.
train_bmi = train_df['bmi']

train_df['bmi'] = np.select(
    [
        (train_bmi > 20.627) & (train_bmi <= 28.634) ,
        (train_bmi > 28.634) & (train_bmi <= 32.269) ,
        (train_bmi > 32.269) & (train_bmi <= 37.07) ,
        (train_bmi > 37.07) & (train_bmi <= 47.291) ,
    ] ,
    [0 , 1 , 2 , 3] ,
    default = train_bmi ,
)

In [None]:
# Quartile-bin the test BMI values into a temporary 'group' column and display the frame.
test_df['group'] = pd.qcut(test_df['bmi'] , 4)
test_df

In [None]:
# Number of test rows that fall into each BMI quartile.
test_df.groupby(['group'])[['bmi']].count()

In [None]:
# Bin the test BMI values with the SAME inner edges the training set uses
# (28.634 / 32.269 / 37.07) instead of edges derived from the test quantiles,
# so a given BMI maps to the same code in both frames.
#
# The outer bins are open-ended: previously a test value outside the
# hard-coded range kept its raw BMI and was later truncated by the integer
# cast into a bogus category (e.g. 47). Vectorised with np.select; the first
# matching condition wins.
test_bmi = test_df['bmi']

test_df['bmi'] = np.select(
    [
        test_bmi <= 28.634 ,
        test_bmi <= 32.269 ,
        test_bmi <= 37.07 ,
        test_bmi > 37.07 ,
    ] ,
    [0 , 1 , 2 , 3] ,
    default = test_bmi ,  # only reached for missing values
)

In [None]:
# Drop the temporary quartile column and cast the BMI column to integer in both frames.
train_df = train_df.drop(columns = ['group']).astype({'bmi' : 'int'})
test_df = test_df.drop(columns = ['group']).astype({'bmi' : 'int'})

In [None]:
# The 'id' column is removed from both frames before modelling.
train_df = train_df.drop(columns = ['id'])
test_df = test_df.drop(columns = ['id'])

## 3. 베이스라인 모델 설계

In [None]:
# Separate the target column from the feature frame.
y , train_df = train_df['charges'] , train_df.drop(columns = ['charges'])

In [None]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is reproducible.
trainX , validX , trainY , validY = train_test_split(train_df , y , test_size = 0.3 , random_state = 42)

In [None]:
# Baseline: a seeded random forest trained on the training split.
rf_params = dict(
    n_estimators = 100 ,
    max_depth = 10 ,
    min_samples_leaf = 1 ,
    n_jobs = -1 ,
    random_state = 42 ,
)

model = RandomForestClassifier(**rf_params)
model.fit(trainX , trainY)

In [None]:
# Accuracy on the training split vs. the held-out validation split.
# (The second line used to be labelled "Train Score" although it reports the
# validation score.)
print("Train Score : {}".format(round(model.score(trainX , trainY) , 3)))
print("Valid Score : {}".format(round(model.score(validX , validY) , 3)))

In [None]:
# Predict the validation split with the baseline model and print the metric report.
y_pred = model.predict(validX)

metrics(validY , y_pred)

## 4. Feature Engineering

In [None]:
# Feature importances of the baseline model, one horizontal bar per feature.
feature_importance = model.feature_importances_

importance_ax = sns.barplot(x = feature_importance , y = trainX.columns)
importance_ax.set_title('feature_importance')

In [None]:
# Data augmentation: oversample the full feature frame with a seeded SMOTE.

smote = SMOTE(random_state = 42)

train_df_over , y_over = smote.fit_resample(train_df , y)

In [None]:
# Class balance after oversampling: shares (pie) next to counts (bars).
f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))
pie_ax , bar_ax = ax

pie_source = pd.DataFrame(y_over , columns = ['charges'])
pie_source['charges'].value_counts().plot.pie(explode = [0.05 , 0.05 , 0.05] , autopct = '%1.1f%%' ,
                                              ax = pie_ax , shadow = True)
pie_ax.set_title('charges')
pie_ax.set_ylabel('')

bar_source = pd.DataFrame(y_over , columns = ['charges'])
sns.countplot(data = bar_source , x = 'charges' , ax = bar_ax)
bar_ax.set_title('charges')

plt.show()

In [None]:
# Split first, then oversample only the training part.
#
# Splitting an already-oversampled data set lets synthetic rows interpolated
# from validation samples end up in the training fold, which makes the
# validation scores optimistic. Here the validation fold keeps the original,
# un-resampled rows and SMOTE only ever sees the training fold.
trainX , validX , trainY , validY = train_test_split(train_df , y , test_size = 0.3 , random_state = 42)
trainX , trainY = SMOTE(random_state = 42).fit_resample(trainX , trainY)

## 5. 예측 모델 구현

In [None]:
# Untuned boosting estimators that serve as the base estimators for the grid searches.
model_lgb = LGBMClassifier()
model_xgb = XGBClassifier()
model_gbm = GradientBoostingClassifier()

In [None]:
# Hyper-parameter grids, one per estimator.

# LightGBM: 3 x 3 x 3 x 3 = 81 candidate combinations.
param_lgb = {
    "learning_rate" : [0.01,0.1,0.3],
    "max_depth" : [25, 50, 75],
    "num_leaves" : [100, 500 , 1200],
    "n_estimators" : [100,300,800]
}


# XGBoost: 3 x 4 x 4 = 48 candidate combinations.
param_xgb = {
    "max_depth": [10,30,50],
    "min_child_weight" : [1,3,6,10],
    "n_estimators": [200,300,500,1000]
}

# sklearn GradientBoosting: 7 x 6 x 4 = 168 candidate combinations.
param_gbm = {
    "max_depth" : [4,5,6,7,8,9,10],
    "learning_rate" : [0.01,0.1,0.2,0.3,0.4,0.5],
    "n_estimators" : [100,200,300,500]
}

In [None]:
# 3-fold grid searches, one per booster.
#
# The target has more than two classes, so the binary-only 'f1' scorer fails
# on it (raising, or scoring every candidate as NaN, depending on the sklearn
# version) and the search cannot rank the candidates. 'f1_macro' is the
# multiclass variant and matches the macro-averaged F1 printed by metrics().
gscv_lgb = GridSearchCV(estimator = model_lgb , param_grid = param_lgb , scoring = 'f1_macro' , cv = 3 , n_jobs = -1 , verbose = True)
gscv_xgb = GridSearchCV(estimator = model_xgb , param_grid = param_xgb , scoring = 'f1_macro' , cv = 3 , n_jobs = -1 , verbose = True)
gscv_gbm = GridSearchCV(estimator = model_gbm , param_grid = param_gbm , scoring = 'f1_macro' , cv = 3 , n_jobs = -1 , verbose = True)

In [None]:
# Run the LightGBM grid search on the training split and keep the winning parameter set.
gscv_lgb.fit(trainX , trainY)
best_params_lgb = gscv_lgb.best_params_

In [None]:
# Run the XGBoost grid search on the training split and keep the winning parameter set.
gscv_xgb.fit(trainX , trainY)
best_params_xgb = gscv_xgb.best_params_

In [None]:
# Run the GradientBoosting grid search on the training split and keep the winning parameter set.
gscv_gbm.fit(trainX , trainY)
best_params_gbm = gscv_gbm.best_params_

In [None]:
# Report the winning parameter set of each grid search.
search_results = (
    ('LGBMClassifier' , best_params_lgb) ,
    ('XGBClassifier' , best_params_xgb) ,
    ('Gradient Boosting Classifier' , best_params_gbm) ,
)

for estimator_label , best_params in search_results:
    print(f'{estimator_label} best params : {best_params}')

In [None]:
# Fresh (unfitted) estimators configured with the parameters found by the grid searches.
tuning_model_lgb = LGBMClassifier(**best_params_lgb)
tuning_model_xgb = XGBClassifier(**best_params_xgb)
tuning_model_gbm = GradientBoostingClassifier(**best_params_gbm)

In [None]:
# Soft-voting ensemble over the three tuned boosters.

base_learners = [
    ('lgb' , tuning_model_lgb) ,
    ('xgb' , tuning_model_xgb) ,
    ('gbm' , tuning_model_gbm) ,
]

voting_model = VotingClassifier(estimators = base_learners , voting = 'soft' , n_jobs = -1)

In [None]:
# Train the voting ensemble on the training split.

voting_model.fit(trainX , trainY)

In [None]:
# Predict the validation split with the ensemble and print the metric report.
pred = voting_model.predict(validX)

metrics(validY , pred)

## 결과 저장

In [None]:
# The sample submission is read without a header row, so its columns are labelled 0, 1, ...
submission = pd.read_csv(path + "sample_submit.csv" , header = None)

In [None]:
# Predict the class of every test row with the fitted ensemble.
submit_prediction = voting_model.predict(test_df)

In [None]:
# Share of each predicted class in the submission.
result_df = pd.DataFrame(submit_prediction , columns = ['result'])
result_counts = result_df['result'].value_counts()

result_ax = result_counts.plot.pie(explode = [0.05 , 0.05 , 0.05] , autopct = '%1.1f%%' , shadow = True)
result_ax.set_ylabel('')

In [None]:
# Write the predictions into column 1 (column 0 is presumably the row id — confirm against sample_submit.csv).
submission[1] = submit_prediction

In [None]:
# sample_submit.csv was read with header = None, i.e. the expected format has
# no header row. Write the submission the same way; with the default
# header = True pandas would emit a spurious "0,1" first line.
submission.to_csv(path + 'submission.csv' , index = False , header = False)

In [None]:
# Trigger a browser download of the saved submission file (Colab only).
files.download(path + 'submission.csv')