In [None]:
# Colab-only helpers: `drive` mounts Google Drive, `files` is used at the end to download the submission.
from google.colab import drive , files
# Mount Google Drive under /content/drive so the data folder used below is readable.
drive.mount('/content/drive')

## Signate Competition Data Analysis Project

### 1. 불러오기 and 데이터 체크
- 데이터 불러오기
- 통계값 측정하기
- 이상치 확인하기
- 결측값 확인하기

### 2. 특성 탐색
- 수치형 데이터
- 범주형 데이터

### 3. 데이터 전처리

### 4. Feature Engineering

### 5. 모델 설계
- CV 모델
- 하이퍼 패러미터 튜닝
- Ensemble modeling
- prediction

In [None]:
# 라이브러리 불러오기

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier , ExtraTreesClassifier , VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , cross_val_score , StratifiedKFold , learning_curve
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score , f1_score

sns.set(style = 'white' , context = 'notebook' , palette = 'deep')

import missingno as msno
import warnings
warnings.filterwarnings('ignore')

### 1. 불러오기 and 데이터 체크

In [None]:
# Data folder on the mounted Drive. Later cells build file names with `path + '<file>'`,
# so the trailing slash is required.
path = "/content/drive/MyDrive/signate_beginner_challenge/2023.06.07/"

In [None]:
# Load the training and test tables from the data folder
train_df , test_df = (pd.read_csv(path + file_name) for file_name in ('train.csv' , 'test.csv'))

In [None]:
# Peek at the first three training rows.
train_df.head(3)

In [None]:
# info() prints column dtypes and non-null counts; describe() is the cell's displayed
# output and summarises the numeric columns.
train_df.info()
train_df.describe()

In [None]:
# Correlation heatmap of the raw training data.
# The frame presumably still holds string columns here (sex / smoker / region are only
# encoded further down), and DataFrame.corr() raises on non-numeric columns with
# pandas >= 2.0, so restrict the computation to numeric columns explicitly.
sns.heatmap(train_df.select_dtypes(include = 'number').corr() , cmap = 'summer_r' , annot = True)

In [None]:
# Check for missing values

# msno.matrix draws the missing-value matrix; the returned object is used to set the title.
g = msno.matrix(train_df)
g.set_title('Missing Value in train_df' , fontsize = 25)

In [None]:
# Check for outliers: one horizontal boxplot per numeric feature
f , ax = plt.subplots(3 , 1 , figsize = (18 , 8))
plt.subplots_adjust(hspace = 0.5)

for panel , column in zip(ax , ['age' , 'bmi' , 'children']):
    sns.boxplot(x = train_df[column] , data = train_df , ax = panel)
    panel.set_title(column + ' boxplot' , fontsize = 15)
    # The title already names the feature, so the x label is blanked.
    panel.set_xlabel('')

### 2. 특성 탐색

##### 2.1 수치형 데이터

###### 2.1.1 클래스 데이터

In [None]:
f , ax = plt.subplots(1 , 2 , figsize = (18 , 9))
pie_ax , count_ax = ax

# Left: share of each target class. `explode` has three entries, so this
# expects exactly three distinct `charges` values.
charge_counts = train_df['charges'].value_counts()
charge_counts.plot.pie(explode = [0 , 0.1 , 0.1] , autopct = '%.1f%%' , ax = pie_ax)
pie_ax.set_title('Percent charges' , fontsize = 20)
pie_ax.set_ylabel('')

# Right: absolute count of each target class.
sns.countplot(x = 'charges' , data = train_df , ax = count_ax)
count_ax.set_title('Count charges' , fontsize = 20)

###### 2.1.2 나이

In [None]:
# Numeric feature subset used by the distribution plots below.
num_data = train_df[['age' , 'bmi' , 'children']]

In [None]:
# Age distribution: histogram on the left, filled kernel density estimate on the right.
f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))

sns.histplot(x = 'age' , data = num_data , ax = ax[0])
ax[0].set_title('histplot AGE' , fontsize = 20)

# `fill` replaces the `shade` argument, which seaborn deprecated in 0.11.
sns.kdeplot(x = 'age' , data = num_data , ax = ax[1] , color = 'Green' , fill = True)
ax[1].set_title('kdeplot AGE' , fontsize = 20)

In [None]:
# One age histogram (with KDE overlay) per target class, side by side.
g = sns.FacetGrid(train_df , col = 'charges').map(sns.histplot , 'age' , kde = True)

In [None]:
# Overlay the age density of each target class on one shared axis.
# - `fill` replaces the `shade` argument, which seaborn deprecated in 0.11.
# - Each curve carries its own label, so the legend no longer depends on the
#   order in which artists happen to be registered on the axis.
g = None
for charge_class , color in [(0 , 'Red') , (1 , 'Green') , (2 , 'Blue')]:
    # ax = None on the first pass draws on the current axis; later passes reuse it.
    g = sns.kdeplot(train_df['age'][train_df['charges'] == charge_class] , color = color ,
                    fill = True , label = str(charge_class) , ax = g)

g.legend()

###### 2.1.3 BMI

In [None]:
# BMI distribution with a KDE overlay.
sns.histplot(x = 'bmi' , data = num_data , kde = True)

###### 2.1.4 children

In [None]:
# Number of rows per `children` value.
sns.countplot(x = 'children' , data = num_data)

##### 2.2 범주형 데이터

In [None]:
# Categorical feature subset used by the pie/count plots below.
category_data = train_df[['sex' , 'smoker' , 'region']]

###### 2.2.1 sex

In [None]:
f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))

# Left: share of each `sex` value.
sex_counts = category_data['sex'].value_counts()
sex_counts.plot.pie(autopct = '%.1f%%' , ax = ax[0])
ax[0].set(title = 'percentage sex' , ylabel = '')

# Right: absolute counts of the same column.
sns.countplot(x = 'sex' , data = category_data , ax = ax[1])
ax[1].set(title = 'barplot sex')

###### 2.2.2 smoker

In [None]:
f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))

# Left: share of each `smoker` value.
smoker_counts = category_data['smoker'].value_counts()
smoker_counts.plot.pie(autopct = '%.1f%%' , ax = ax[0])
ax[0].set(title = 'percentage smoker' , ylabel = '')

# Right: absolute counts of the same column.
sns.countplot(x = 'smoker' , data = category_data , ax = ax[1])
ax[1].set(title = 'barplot smoker')

###### 2.2.3 region

In [None]:
f , ax = plt.subplots(1 , 2 , figsize = (18 , 8))

# Left: share of each `region` value.
region_counts = category_data['region'].value_counts()
region_counts.plot.pie(autopct = '%.1f%%' , ax = ax[0])
ax[0].set(title = 'percentage region' , ylabel = '')

# Right: absolute counts of the same column.
sns.countplot(x = 'region' , data = category_data , ax = ax[1])
ax[1].set(title = 'barplot region')

### 3. 데이터 전처리

##### 3.1 문자열 데이터 처리

In [None]:
# Separate the label from the features: pop() returns the `charges` column
# and removes it from train_df in place.
# NOTE: not re-runnable -- a second execution raises KeyError because the column is gone.
target = train_df.pop('charges')

In [None]:
# sex and smoker: replace each category with an integer code
le = LabelEncoder()

for column in ('sex' , 'smoker'):
    # fit_transform re-fits the shared encoder on every column, so after the
    # loop `le` only remembers the classes of the last column ('smoker').
    train_df[column] = le.fit_transform(train_df[column])

In [None]:
# Sanity plots after label encoding: class counts of the two encoded columns
# plus a correlation heatmap of the numeric features.
f , ax = plt.subplots(2 , 2 , figsize = (18 , 8))

sns.countplot(x = 'sex' , data = train_df , ax = ax[0 , 0])
ax[0 , 0].set_title('countplot sex after LabelEncoding')

sns.countplot(x = 'smoker' , data = train_df , ax = ax[0 , 1])
ax[0 , 1].set_title('countplot smoker after LabelEncoding')

# `region` has not been one-hot encoded yet at this point, and DataFrame.corr()
# raises on non-numeric columns with pandas >= 2.0 -- keep numeric columns only.
sns.heatmap(train_df.select_dtypes(include = 'number').corr() , cmap = 'summer_r' , ax = ax[1 , 0] , annot = True)
ax[1 , 0].set_title('feature corr')

# The fourth panel is unused; hide its empty frame.
ax[1 , 1].axis('off')

In [None]:
# region: replace the column with one indicator column per region value,
# appended after the existing features
region_onehot = pd.get_dummies(train_df['region'])
train_df = pd.concat([train_df.drop(columns = ['region']) , region_onehot] , axis = 1)

In [None]:
# Correlation heatmap of all features after encoding, on a large square canvas.
plt.figure(figsize = (15 , 15))
sns.heatmap(train_df.corr() , cmap = 'summer_r' , annot = True)

##### 3.2 불균형 데이터 처리

In [None]:
# Oversample the minority classes with SMOTE.
# A fixed seed makes the synthetic rows, and therefore every downstream score, reproducible.
# NOTE(review): the `id` column is still part of train_df here (it is only dropped in the
# modelling section), so it takes part in SMOTE's neighbour search.
# NOTE(review): resampling happens before the train/test split further down, so the
# hold-out set contains synthetic rows derived from training rows -- hold-out scores
# are likely optimistic. Consider splitting first and resampling only the training part.
smote = SMOTE(random_state = 42)

train_df , target = smote.fit_resample(train_df , target)

In [None]:
# Class counts of the target after resampling.
sns.countplot(x = 'charges' , data = pd.DataFrame({'charges' : target}))

##### 3.3 데이터 구간화

In [None]:
# children: count of each `children` value, split by target class
charges_frame = pd.DataFrame({'charges' : target})
data = pd.concat([train_df['children'] , charges_frame] , axis = 1)

sns.countplot(x = 'children' , hue = 'charges' , data = data)

In [None]:
# Collapse `children` to a binary flag: 0 = no children, 1 = at least one.
# Vectorised replacement for the former row-by-row loop, which mixed positional
# (`iloc`) and label-based (`loc`) indexing and was slow on larger frames.
train_df['children'] = (train_df['children'] != 0).astype(int)

In [None]:
# Distribution of the binarised `children` flag.
sns.countplot(x = 'children' , data = train_df)

In [None]:
# age

# Split the observed age range into 3 equal-width intervals. The result is only
# used for inspection; the interval edges are hard-coded in the binning cell below.
age_cut = pd.cut(train_df['age'] , 3)

In [None]:
# Number of rows falling into each age interval.
sns.countplot(x = 'cut' , data = pd.DataFrame({'cut' : age_cut}))

In [None]:
# Replace `age` by its bin index: 0 = up to 33.333, 1 = (33.333, 48.667], 2 = above.
# np.select picks the first matching condition, so the second test only sees
# values that failed the first one.
# Changes from the former row-by-row loop:
#   * vectorised (no per-row .loc writes);
#   * values at or below the old lower edge (17.954) now land in bin 0 rather than
#     falling through to the top bin.
# NOTE: not re-runnable -- a second execution would re-bin the bin indices themselves.
age_values = train_df['age']
train_df['age'] = np.select([age_values <= 33.333 , age_values <= 48.667] , [0 , 1] , default = 2)

In [None]:
# Distribution of the binned `age` feature.
sns.countplot(x = 'age' , data = train_df)

In [None]:
# bmi

# Split the observed BMI range into 3 equal-width intervals. The result is only
# used for inspection; the interval edges are hard-coded in the binning cell below.
bmi_cut = pd.cut(train_df['bmi'] , 3)

In [None]:
# Number of rows falling into each BMI interval.
sns.countplot(x = 'bmi' , data = pd.DataFrame({'bmi' : bmi_cut}))

In [None]:
# Replace `bmi` by its bin index: 0 = up to 29.515, 1 = (29.515, 38.403], 2 = above.
# np.select picks the first matching condition, so the second test only sees
# values that failed the first one.
# Changes from the former row-by-row loop:
#   * vectorised (no per-row .loc writes);
#   * values at or below the old lower edge (20.601) now land in bin 0 rather than
#     falling through to the top bin.
# NOTE: not re-runnable -- a second execution would re-bin the bin indices themselves.
bmi_values = train_df['bmi']
train_df['bmi'] = np.select([bmi_values <= 29.515 , bmi_values <= 38.403] , [0 , 1] , default = 2)

In [None]:
# Store the BMI bin index as an integer column.
train_df = train_df.astype({'bmi' : 'int'})

### 4. Feature Engineering

##### 4.1 Smoker + Age

In [None]:
# Interaction features between smoking status and the binned age:
#   'No smoke age' = smoker + age + 1 for rows with smoker == 0, otherwise 0
#   'smoke age'    = smoker + age     for all other rows,        otherwise 0
# Vectorised replacement for the former row-by-row loop. The columns are now always
# created in the same order ('No smoke age', then 'smoke age'); in the loop the
# order depended on the smoker value of the first row, which could make the train
# and test column order disagree.
non_smoker = train_df['smoker'] == 0
smoker_plus_age = train_df['smoker'] + train_df['age']

train_df['No smoke age'] = np.where(non_smoker , smoker_plus_age + 1 , 0)
train_df['smoke age'] = np.where(non_smoker , 0 , smoker_plus_age)

In [None]:
# Store both engineered columns as integers (single astype call for both).
train_df = train_df.astype({'smoke age' : 'int' , 'No smoke age' : 'int'})

### 5. 모델 설계

In [None]:
# The row identifier is not a predictive feature; remove it before modelling.
train_df = train_df.drop(columns = ['id'])

In [None]:
# Hold out 30% of the (resampled) rows for the final evaluation; fixed seed for reproducibility.
X_train , X_test , Y_train , Y_test = train_test_split(
    train_df , target ,
    test_size = 0.3 ,
    random_state = 42 ,
)

In [None]:
# 10-fold stratified splitter, passed as `cv` to every cross-validation and grid search below.
kfold = StratifiedKFold(n_splits = 10)

In [None]:
# Baseline comparison: cross-validated accuracy of nine untuned classifiers.
random_state = 42

# Estimators that accept a seed are built from one list; KNeighborsClassifier
# is constructed without one.
seeded_estimators = [
    SVC , DecisionTreeClassifier , AdaBoostClassifier , RandomForestClassifier ,
    ExtraTreesClassifier , GradientBoostingClassifier , MLPClassifier
]
classifiers = [estimator(random_state = random_state) for estimator in seeded_estimators]
classifiers += [KNeighborsClassifier() , LogisticRegression(random_state = random_state)]

# One array of per-fold accuracies per classifier, in the same order as `classifiers`.
cv_results = [
    cross_val_score(classifier , X_train , y = Y_train , scoring = 'accuracy' , cv = kfold , n_jobs = 4)
    for classifier in classifiers
]

cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

# The display names must stay in the same order as `classifiers`.
algorithm_names = [
    'SVC' , 'DecisionTree' , 'AdaBoost' , 'RandomForest' , 'ExtraTrees' , 'GradientBoosting' ,
    'MultipleLayerPerceptron' , 'KNeighboors' , 'LogisticRegression'
]
cv_res = pd.DataFrame({
    "CrossValMeans" : cv_means ,
    "CrossValerrors" : cv_std ,
    "Algorithm" : algorithm_names
})

# Horizontal bars of mean accuracy with the fold standard deviation as error bars.
g = sns.barplot(x = 'CrossValMeans' , y = 'Algorithm' , data = cv_res , palette = 'Set3' , orient = 'h' , xerr = cv_std)
plt.xlabel('Mean Accuracy')
plt.title('Cross validation scores')

##### 5.1 하이퍼 패러미터 튜닝

In [None]:
# Grid search for logistic regression.
# A seed is set because the 'sag' / 'saga' solvers are stochastic.
LR = LogisticRegression(random_state = 42)

# The grid is split by penalty because, of the solvers searched here, only 'saga'
# supports the l1 penalty. The previous single grid also paired l1 with
# 'newton-cg' / 'sag' / 'lbfgs'; those fits fail, so the set of usable
# candidates is unchanged.
C_values = np.logspace(-3 , 3 , 7)
LR_param_grid = [
    {'penalty' : ['l2'] , 'solver' : ['newton-cg' , 'sag' , 'saga' , 'lbfgs'] , 'C' : C_values} ,
    {'penalty' : ['l1'] , 'solver' : ['saga'] , 'C' : C_values} ,
]

gsLR = GridSearchCV(LR , param_grid = LR_param_grid , cv = kfold , scoring = 'accuracy' ,
                    n_jobs = 4 , verbose = 1)

gsLR.fit(X_train , Y_train)

# Best parameter combination, refitted on all of X_train.
LR_best = gsLR.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best logistic-regression candidate.
gsLR.best_score_

In [None]:
# Grid search for AdaBoost on top of a decision tree.
DTC = DecisionTreeClassifier()

adaDTC = AdaBoostClassifier(DTC , random_state = 7)

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` parameter to `estimator`.
# Hard-coded `base_estimator__*` keys fail on newer releases, so derive the
# prefix from the installed version's parameter names.
tree_param = 'estimator' if 'estimator' in adaDTC.get_params(deep = False) else 'base_estimator'

ada_param_grid = {
    tree_param + '__criterion' : ['gini' , 'entropy'] ,
    tree_param + '__splitter' : ['best' , 'random'] ,
    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
    # confirm it is still accepted by the installed version.
    'algorithm' : ['SAMME' , 'SAMME.R'] ,
    'n_estimators' : [1 , 3] ,
    'learning_rate' : [0.0001 , 0.001 , 0.01 , 0.1 , 0.2 , 0.3 , 1.5]
}

gsadaDTC = GridSearchCV(adaDTC , param_grid = ada_param_grid , cv = kfold , scoring = 'accuracy' ,
                        n_jobs = 4 , verbose = 1)

gsadaDTC.fit(X_train , Y_train)

# Best parameter combination, refitted on all of X_train.
ada_best = gsadaDTC.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best AdaBoost candidate.
gsadaDTC.best_score_

In [None]:
# Grid search for extremely randomised trees.
ExtC = ExtraTreesClassifier()

# Only the feature-subset size, the split/leaf sizes and the forest size are
# actually searched; the remaining entries are pinned to a single value.
ex_param_grid = dict(
    max_depth = [None] ,
    max_features = [1 , 3 , 10] ,
    min_samples_split = [2 , 3 , 10] ,
    min_samples_leaf = [1 , 3 , 10] ,
    bootstrap = [False] ,
    n_estimators = [100 , 300] ,
    criterion = ['gini'] ,
)

gsExtC = GridSearchCV(
    ExtC ,
    param_grid = ex_param_grid ,
    cv = kfold ,
    scoring = 'accuracy' ,
    n_jobs = 4 ,
    verbose = 1 ,
)
gsExtC.fit(X_train , Y_train)

# Best parameter combination, refitted on all of X_train.
ExtC_best = gsExtC.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best ExtraTrees candidate.
gsExtC.best_score_

In [None]:
# Grid search for the random forest (same grid as the ExtraTrees search).
RFC = RandomForestClassifier()

# Only the feature-subset size, the split/leaf sizes and the forest size are
# actually searched; the remaining entries are pinned to a single value.
rf_param_grid = dict(
    max_depth = [None] ,
    max_features = [1 , 3 , 10] ,
    min_samples_split = [2 , 3 , 10] ,
    min_samples_leaf = [1 , 3 , 10] ,
    bootstrap = [False] ,
    n_estimators = [100 , 300] ,
    criterion = ['gini'] ,
)

gsRFC = GridSearchCV(
    RFC ,
    param_grid = rf_param_grid ,
    cv = kfold ,
    scoring = 'accuracy' ,
    n_jobs = 4 ,
    verbose = 1 ,
)
gsRFC.fit(X_train , Y_train)

# Best parameter combination, refitted on all of X_train.
RFC_best = gsRFC.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best RandomForest candidate.
gsRFC.best_score_

In [None]:
# Grid search for gradient boosting.
GBC = GradientBoostingClassifier()

# The former `'loss' : ['deviance']` entry was dropped: 'deviance' was renamed to
# 'log_loss' and removed in scikit-learn 1.3, where it makes every fit fail.
# It was the estimator's default loss anyway, so omitting it keeps the same
# objective on every version.
gb_param_grid = {
    'n_estimators' : [100 , 200 , 300] ,
    'learning_rate' : [0.1 , 0.05 , 0.01] ,
    'max_depth' : [4 , 8] ,
    'min_samples_leaf' : [100 , 150] ,
    'max_features' : [0.3 , 0.1]
}

gsGBC = GridSearchCV(GBC , param_grid = gb_param_grid , cv = kfold , scoring = 'accuracy' ,
                     n_jobs = 4 , verbose = 1)

gsGBC.fit(X_train , Y_train)

# Best parameter combination, refitted on all of X_train.
GBC_best = gsGBC.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best GradientBoosting candidate.
gsGBC.best_score_

In [None]:
# Grid search for an RBF support vector classifier.
# probability = True enables predict_proba, which the soft-voting ensemble below needs.
SVMC = SVC(probability = True)

svc_param_grid = dict(
    kernel = ['rbf'] ,
    gamma = [0.001 , 0.01 , 0.1 , 1] ,
    C = [1 , 10 , 50 , 100 , 200 , 300 , 1000] ,
)

gsSVMC = GridSearchCV(
    SVMC ,
    param_grid = svc_param_grid ,
    cv = kfold ,
    scoring = 'accuracy' ,
    n_jobs = 4 ,
    verbose = 1 ,
)
gsSVMC.fit(X_train , Y_train)

# Best parameter combination, refitted on all of X_train.
SVMC_best = gsSVMC.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best SVC candidate.
gsSVMC.best_score_

##### 5.2 Plot learning curves

In [None]:
def plot_learning_curve(estimator , title , X , y , ylim = None , cv = None ,
                        n_jobs = -1 , train_sizes = np.linspace(.1 , 1.0 , 5)):
    """Draw training and cross-validation score curves for one estimator.

    Opens a new pyplot figure, runs sklearn's ``learning_curve`` and plots the
    mean score per training-set size with a +/- one standard deviation band
    (red = training, green = cross-validation).

    Parameters
    ----------
    estimator : estimator passed through to ``learning_curve``.
    title : figure title.
    X , y : features and labels passed through to ``learning_curve``.
    ylim : optional ``(low , high)`` pair forwarded to ``plt.ylim``.
    cv , n_jobs , train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')

    sizes , fit_scores , cv_scores = learning_curve(
        estimator , X , y , cv = cv , n_jobs = n_jobs , train_sizes = train_sizes
    )

    plt.grid()

    curves = (
        (fit_scores , 'r' , 'Training score') ,
        (cv_scores , 'g' , 'Cross-validation score') ,
    )

    # Bands first, then the lines on top -- same drawing order as before.
    for scores , colour , _ in curves:
        centre = np.mean(scores , axis = 1)
        spread = np.std(scores , axis = 1)
        plt.fill_between(sizes , centre - spread , centre + spread , alpha = 0.1 , color = colour)

    for scores , colour , label in curves:
        plt.plot(sizes , np.mean(scores , axis = 1) , 'o-' , color = colour , label = label)

    plt.legend(loc = 'best')

    return plt

# One learning-curve figure per tuned model.
# Fixes the title typo 'RF mearning curves' -> 'RF learning curves'.
tuned_models = [
    (gsRFC , 'RF learning curves') ,
    (gsExtC , 'ExtraTrees learning curves') ,
    (gsSVMC , "SVC learning curves") ,
    (gsadaDTC , 'AdaBoost learning curves') ,
    (gsGBC , 'GradientBoosting learning curves') ,
    (gsLR , 'LogisticRegression learning curves') ,
]

for search , curve_title in tuned_models:
    g = plot_learning_curve(search.best_estimator_ , curve_title , X_train , Y_train , cv = kfold)

In [None]:
# Feature importances of the four tree-based tuned models, one panel each.
nrows = ncols = 2
f , ax = plt.subplots(nrows = nrows , ncols = ncols , sharex = 'all' , figsize = (15 , 15))

names_classifiers = [('AdaBoosting' , ada_best) , ('ExtraTrees' , ExtC_best) , ('RandomForest' , RFC_best) ,
                     ('GradientBoosting' , GBC_best)]

# ax.ravel() walks the panels row by row, the same order the nested row/col loop used.
for (name , classifier) , panel in zip(names_classifiers , ax.ravel()):

    importances = classifier.feature_importances_
    # Most important first, capped at 40 features.
    indices = np.argsort(importances)[::-1][:40]

    g = sns.barplot(y = X_train.columns[indices] , x = importances[indices] , orient = 'h' , ax = panel)
    # Label fixes: 'immportance' typo, and the missing space between model name and suffix.
    g.set_xlabel('Relative importance' , fontsize = 12)
    g.set_ylabel('Features' , fontsize = 12)
    g.tick_params(labelsize = 9)
    g.set_title(name + ' feature importance')

##### 5.3 Ensemble modeling

In [None]:
# Soft-voting ensemble over the six tuned models.
ensemble_members = [
    ('rfc' , RFC_best) ,
    ('extc' , ExtC_best) ,
    ('adac' , ada_best) ,
    ('gbc' , GBC_best) ,
    ('svc' , SVMC_best) ,
    ('lr' , LR_best) ,
]

votingC = VotingClassifier(estimators = ensemble_members , voting = 'soft' , n_jobs = 4)
votingC = votingC.fit(X_train , Y_train)

##### 5.4 Prediction

In [None]:
# Predict the held-out 30% split with the ensemble.
prediction = votingC.predict(X_test)

In [None]:
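# Scores recorded from an earlier run (presumably accuracy, then macro F1 -- the order printed by the next cell; verify):
# 0.8417329796640142
# 0.8366978618416522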

In [None]:
# Hold-out accuracy and macro-averaged F1.
# scikit-learn metrics take (y_true , y_pred); the arguments were previously swapped.
# Both of these scores are symmetric, so the printed numbers do not change, but the
# correct order matters as soon as an asymmetric metric (precision, recall) is added.
print(accuracy_score(Y_test , prediction))
print(f1_score(Y_test , prediction , average = 'macro'))

##### 5.5 test data process

In [None]:
# sex and smoker: replace each category with an integer code
# NOTE(review): the encoder is re-fitted on the test data here, so the codes only
# match the training codes if both sets contain the same category values -- verify.
le = LabelEncoder()

for column in ('sex' , 'smoker'):
    test_df[column] = le.fit_transform(test_df[column])

In [None]:
# Sanity plots for the test set after label encoding: class counts of the two
# encoded columns plus a correlation heatmap of the numeric features.
f , ax = plt.subplots(2 , 2 , figsize = (18 , 8))

sns.countplot(x = 'sex' , data = test_df , ax = ax[0 , 0])
ax[0 , 0].set_title('countplot sex after LabelEncoding')

sns.countplot(x = 'smoker' , data = test_df , ax = ax[0 , 1])
ax[0 , 1].set_title('countplot smoker after LabelEncoding')

# `region` has not been one-hot encoded yet at this point, and DataFrame.corr()
# raises on non-numeric columns with pandas >= 2.0 -- keep numeric columns only.
sns.heatmap(test_df.select_dtypes(include = 'number').corr() , cmap = 'summer_r' , ax = ax[1 , 0] , annot = True)
ax[1 , 0].set_title('feature corr')

# The fourth panel is unused; hide its empty frame.
ax[1 , 1].axis('off')

In [None]:
# region: replace the column with one indicator column per region value,
# appended after the existing features
# NOTE(review): the indicator columns are derived from the test data alone; a region
# value missing from the test set would leave the test frame one column short.
region_onehot = pd.get_dummies(test_df['region'])
test_df = pd.concat([test_df.drop(columns = ['region']) , region_onehot] , axis = 1)

In [None]:
# Correlation heatmap of all test features after encoding, on a large square canvas.
plt.figure(figsize = (15 , 15))
sns.heatmap(test_df.corr() , cmap = 'summer_r' , annot = True)

In [None]:
# Collapse `children` to a binary flag: 0 = no children, 1 = at least one.
# Vectorised replacement for the former row-by-row loop, which mixed positional
# (`iloc`) and label-based (`loc`) indexing and was slow on larger frames.
test_df['children'] = (test_df['children'] != 0).astype(int)

In [None]:
# Distribution of the binarised `children` flag in the test set.
sns.countplot(x = 'children' , data = test_df)

In [None]:
# age

# Split the observed test age range into 3 equal-width intervals (inspection only;
# the binning cell below uses hard-coded edges).
age_cut = pd.cut(test_df['age'] , 3)

In [None]:
# Number of test rows falling into each age interval.
sns.countplot(x = 'cut' , data = pd.DataFrame({'cut' : age_cut}))

In [None]:
# Replace `age` by its bin index, using the same edges as the training data:
# 0 = up to 33.333, 1 = (33.333, 48.667], 2 = above.
# np.select picks the first matching condition, so the second test only sees
# values that failed the first one.
# Changes from the former row-by-row loop:
#   * vectorised (no per-row .loc writes);
#   * values at or below the old lower edge (17.954) now land in bin 0 rather than
#     falling through to the top bin.
# NOTE: not re-runnable -- a second execution would re-bin the bin indices themselves.
age_values = test_df['age']
test_df['age'] = np.select([age_values <= 33.333 , age_values <= 48.667] , [0 , 1] , default = 2)

In [None]:
# Distribution of the binned `age` feature in the test set.
sns.countplot(x = 'age' , data = test_df)

In [None]:
# bmi

# Split the observed test BMI range into 3 equal-width intervals (inspection only).
bmi_cut = pd.cut(test_df['bmi'] , 3)

In [None]:
# Number of test rows falling into each BMI interval.
sns.countplot(x = 'bmi' , data = pd.DataFrame({'bmi' : bmi_cut}))

In [None]:
# Replace `bmi` by its bin index using the TRAINING bin edges (29.515 and 38.403):
# 0 = up to 29.515, 1 = (29.515, 38.403], 2 = above.
# This cell previously used edges derived from the test data (21.824 / 30.196 / 38.543),
# so the same BMI value could get a different bin at training and at prediction time
# (e.g. 30.0 -> bin 1 in training but bin 0 in test). A model can only be applied to
# features encoded exactly as it saw them during fitting.
# Also vectorised, and values below the lowest edge now land in bin 0 rather than
# falling through to the top bin.
# NOTE: not re-runnable -- a second execution would re-bin the bin indices themselves.
bmi_values = test_df['bmi']
test_df['bmi'] = np.select([bmi_values <= 29.515 , bmi_values <= 38.403] , [0 , 1] , default = 2)

In [None]:
# Store the BMI bin index as an integer column.
test_df = test_df.astype({'bmi' : 'int'})

In [None]:
# Interaction features between smoking status and the binned age (same rule as for training):
#   'No smoke age' = smoker + age + 1 for rows with smoker == 0, otherwise 0
#   'smoke age'    = smoker + age     for all other rows,        otherwise 0
# Vectorised replacement for the former row-by-row loop. The columns are now always
# created in the same order ('No smoke age', then 'smoke age'); in the loop the
# order depended on the smoker value of the first row, which could make the train
# and test column order disagree.
non_smoker = test_df['smoker'] == 0
smoker_plus_age = test_df['smoker'] + test_df['age']

test_df['No smoke age'] = np.where(non_smoker , smoker_plus_age + 1 , 0)
test_df['smoke age'] = np.where(non_smoker , 0 , smoker_plus_age)

In [None]:
# Store both engineered columns as integers (single astype call for both).
test_df = test_df.astype({'smoke age' : 'int' , 'No smoke age' : 'int'})

In [None]:
# The row identifier is not a model feature; remove it before predicting.
test_df = test_df.drop(columns = ['id'])

##### 5.6 submission

In [None]:
# Load the submission template. header=None means the file is read without a header row,
# so its columns are addressed by position (0, 1, ...).
submission = pd.read_csv(path + 'sample_submit.csv' , header=None)

In [None]:
# Predict the competition test set.
# The test frame is first aligned to the exact columns (and column order) the ensemble
# was fitted on. The train and test frames are preprocessed in separate cells, so a
# different column order or a missing one-hot column would otherwise either raise in
# scikit-learn or silently feed features into the wrong positions.
# Columns absent from the test frame are filled with 0, which is the correct value for
# a one-hot indicator of a category that never occurs in the test data.
test_features = test_df.reindex(columns = X_train.columns , fill_value = 0)
prediction = votingC.predict(test_features)

In [None]:
# Write the predicted classes into the column labelled 1 of the template
# (integer labels because the template was read with header=None).
submission[1] = prediction

In [None]:
# Save the submission in the same shape as the template.
# The template was read with header=None, so its column labels are just the integers
# 0 and 1; without header=False pandas would write them as a spurious first row "0,1".
submission.to_csv(path+'submission.csv' , index=False , header=False)

In [None]:
# Download the saved submission file through the Colab `files` helper.
files.download(path + 'submission.csv')