In [1]:
from google.colab import drive , files
drive.mount('/content/drive')

Mounted at /content/drive


## Final HW 데이터 분석

### 순서

- 1. 데이터 살펴보기
    
    - 1. 데이터 불러오기
    - 2. 데이터 정보 확인
    - 3. 통계적 측정
    - 4. 특성 선택

- 2. EDA

    - 1. Target 데이터
    - 2. 수치형 데이터
    - 3. 범주형 데이터
    - 4. 관계성

- 3. 최종 모델 설계하기

    - 1. GridSearchCV
    - 2. prediction

In [2]:
# import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split , cross_val_score , StratifiedKFold , learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier , AdaBoostClassifier , ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score , f1_score , precision_score , recall_score

import missingno as msno
import warnings

pio.templates.default = 'plotly_dark'
warnings.filterwarnings('ignore')

In [3]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [4]:
def plot_learning_curve(estimator , title , X , y , ylim = None , cv = None ,
                        n_jobs = -1 , train_sizes = np.linspace(.1 , 1.0 , 5)):

                        plt.figure()
                        plt.title(title)

                        if ylim is not None:
                            plt.ylim(*ylim)

                        plt.xlabel('Training examples')
                        plt.ylabel('Score')

                        train_sizes , train_scores , test_scores = learning_curve(
                            estimator , X , y , cv = cv , n_jobs = n_jobs , train_sizes = train_sizes
                        )
                        train_scores_mean = np.mean(train_scores , axis = 1)
                        train_scores_std = np.std(train_scores , axis = 1)
                        test_scores_mean = np.mean(test_scores , axis = 1)
                        test_scores_std = np.std(test_scores , axis = 1)

                        plt.grid()

                        plt.fill_between(train_sizes , train_scores_mean - train_scores_std ,
                                         train_scores_mean + train_scores_std , alpha = 0.1 ,
                                         color = 'r')
                        plt.fill_between(train_sizes , test_scores_mean - test_scores_std ,
                                         test_scores_mean + test_scores_std , alpha = 0.1 ,
                                         color = 'g')

                        plt.plot(train_sizes , train_scores_mean , 'o-' , color = 'r' ,
                                 label = 'Training score')

                        plt.plot(train_sizes , test_scores_mean , 'o-' , color = 'g' ,
                                 label = 'Cross-validation score')

                        plt.legend(loc = 'best')

                        return plt

In [5]:
def metrics(y_predict , y_valid):

    print(f"Accuracy Score : {accuracy_score(y_predict , y_valid)}")
    print(f"F1 Score : {f1_score(y_predict , y_valid)}")
    print(f"Precision Score : {precision_score(y_predict , y_valid)}")
    print(f"Recall Score : {recall_score(y_predict , y_valid)}")

## 1. 데이터 살펴보기

#### 1. 데이터 불러오기

In [64]:
# path
path = '/content/drive/MyDrive/GCI/Final_HW/dataset/ICorporation/'

In [65]:
df = pd.read_csv(path + 'data.csv')

In [66]:
org_data = pd.read_csv(path + 'data.csv')

#### 2. 데이터 정보 확인

In [9]:
# drop data 확인하기

for col in df.columns:

    length = len(df[col].value_counts())

    if length < 2:

        print(f'drop column : {col}')

drop column : EmployeeCount
drop column : Over18
drop column : StandardHours


In [10]:
# EmployeeCount , Over18 , StandardHours , EmployeeNumber

df.drop(['EmployeeCount' , 'Over18' , 'StandardHours' , 'EmployeeNumber'] , axis = 1 , inplace = True)

In [11]:
df.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyAchievement,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,HowToEmploy,Incentive,RemoteWork
0,25,No,Travel_Rarely,1280,Research & Development,7,1,Medical,4,Male,...,2,2,3,2,2,2,1,intern,0,4
1,27,No,Travel_Rarely,1167,Research & Development,4,2,Life Sciences,1,Male,...,5,2,3,5,3,0,3,intern,0,1
2,25,Yes,Travel_Rarely,240,Sales,5,3,Marketing,3,Male,...,6,1,3,6,4,0,3,agent_A,0,2


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyAchievement          1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  HourlyAchievement         1470 non-null   int64 
 11  JobInvolvement            1470 non-null   int64 
 12  JobLevel                  1470 non-null   int64 
 13  JobRole                   1470 non-null   object
 14  JobSatisfaction         

#### 3. 통계적 측정

In [13]:
df.describe()

Unnamed: 0,Age,DailyAchievement,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyAchievement,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Incentive,RemoteWork
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931293,...,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129,1183.386395,2.82449
std,9.135373,403.5091,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,4707.956783,...,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136,1429.687536,1.229521
min,18.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,2911.0,...,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,4919.0,...,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0,727.5,3.0
75%,43.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,8379.0,...,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0,1817.0,4.0
max,60.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,19999.0,...,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0,8584.0,5.0


In [14]:
# 왜도 , 첨도 확인

numerical_col = list(df.describe())
numerical_col

for col in numerical_col:

    print("{} Skewness : {:.3f} \n".format(col , df[col].skew()))

Age Skewness : 0.413 

DailyAchievement Skewness : -0.004 

DistanceFromHome Skewness : 0.958 

Education Skewness : -0.290 

EnvironmentSatisfaction Skewness : -0.322 

HourlyAchievement Skewness : -0.032 

JobInvolvement Skewness : -0.498 

JobLevel Skewness : 1.025 

JobSatisfaction Skewness : -0.330 

MonthlyIncome Skewness : 1.370 

MonthlyAchievement Skewness : 0.019 

NumCompaniesWorked Skewness : 1.026 

PercentSalaryHike Skewness : 0.821 

PerformanceRating Skewness : -0.065 

RelationshipSatisfaction Skewness : -0.303 

StockOptionLevel Skewness : 0.969 

TotalWorkingYears Skewness : 1.117 

TrainingTimesLastYear Skewness : 0.553 

WorkLifeBalance Skewness : -0.552 

YearsAtCompany Skewness : 1.765 

YearsInCurrentRole Skewness : 0.917 

YearsSinceLastPromotion Skewness : 1.984 

YearsWithCurrManager Skewness : 0.833 

Incentive Skewness : 1.640 

RemoteWork Skewness : -0.105 



In [15]:
for col in numerical_col:

    print("{} Kurtosis : {:.3f} \n".format(col , df[col].kurtosis()))

Age Kurtosis : -0.404 

DailyAchievement Kurtosis : -1.204 

DistanceFromHome Kurtosis : -0.225 

Education Kurtosis : -0.559 

EnvironmentSatisfaction Kurtosis : -1.203 

HourlyAchievement Kurtosis : -1.196 

JobInvolvement Kurtosis : 0.271 

JobLevel Kurtosis : 0.399 

JobSatisfaction Kurtosis : -1.222 

MonthlyIncome Kurtosis : 1.005 

MonthlyAchievement Kurtosis : -1.215 

NumCompaniesWorked Kurtosis : 0.010 

PercentSalaryHike Kurtosis : -0.301 

PerformanceRating Kurtosis : -1.032 

RelationshipSatisfaction Kurtosis : -1.185 

StockOptionLevel Kurtosis : 0.365 

TotalWorkingYears Kurtosis : 0.918 

TrainingTimesLastYear Kurtosis : 0.495 

WorkLifeBalance Kurtosis : 0.419 

YearsAtCompany Kurtosis : 3.936 

YearsInCurrentRole Kurtosis : 0.477 

YearsSinceLastPromotion Kurtosis : 3.613 

YearsWithCurrManager Kurtosis : 0.171 

Incentive Kurtosis : 2.896 

RemoteWork Kurtosis : -0.513 



#### 4. Feature Selection

In [69]:
# feature selection

# Target 데이터 분리

target = df['Attrition']
df.drop(['Attrition'] , axis = 1 , inplace = True)


df = df[['Age' , 'DailyAchievement' , 'DistanceFromHome' , 'EnvironmentSatisfaction' , 'HourlyAchievement' ,
         'JobLevel' , 'MonthlyIncome' , 'MonthlyAchievement' , 'NumCompaniesWorked' , 'OverTime' , 'TotalWorkingYears' ,
         'Incentive' , 'RemoteWork']]

df.head(3)

Unnamed: 0,Age,DailyAchievement,DistanceFromHome,EnvironmentSatisfaction,HourlyAchievement,JobLevel,MonthlyIncome,MonthlyAchievement,NumCompaniesWorked,OverTime,TotalWorkingYears,Incentive,RemoteWork
0,25,1280,7,4,64,1,2889,26897,1,No,2,0,4
1,27,1167,4,1,76,1,2517,3208,1,No,5,0,1
2,25,240,5,3,46,2,5744,26959,1,Yes,6,0,2


In [17]:
# 상관성 체크

fig = px.imshow(
    df.corr() ,
    text_auto = True ,
    aspect = 'auto' ,
    color_continuous_scale = 'Viridis'
)

fig.update_layout(
    title = '<b>Correlation coefficient</b>' ,
    title_font_size = 20
)

fig.show()

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

data = df.drop(['OverTime'] , axis = 1)
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(data.values , i) for i in range(data.shape[1])]
vif['features'] = data.columns
vif = vif.sort_values('VIF Factor').reset_index(drop = True)
vif

Unnamed: 0,VIF Factor,features
0,2.140902,Incentive
1,2.409221,NumCompaniesWorked
2,3.041625,DistanceFromHome
3,4.641783,DailyAchievement
4,5.572396,MonthlyAchievement
5,6.476189,EnvironmentSatisfaction
6,7.995399,RemoteWork
7,9.129413,HourlyAchievement
8,10.591245,TotalWorkingYears
9,23.811103,Age


In [19]:
data = df.drop(['OverTime' , 'TotalWorkingYears' , 'JobLevel' , 'Age'] , axis = 1)
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(data.values , i) for i in range(data.shape[1])]
vif['features'] = data.columns
vif = vif.sort_values('VIF Factor').reset_index(drop = True)
vif

Unnamed: 0,VIF Factor,features
0,2.130083,Incentive
1,2.19301,NumCompaniesWorked
2,2.88368,MonthlyIncome
3,3.027496,DistanceFromHome
4,4.482958,DailyAchievement
5,5.299393,MonthlyAchievement
6,5.941172,EnvironmentSatisfaction
7,7.53074,RemoteWork
8,7.592188,HourlyAchievement


In [20]:
df.drop(['TotalWorkingYears' , 'JobLevel' , 'Age'] , axis = 1 , inplace = True)

In [21]:
df.head(3)

Unnamed: 0,DailyAchievement,DistanceFromHome,EnvironmentSatisfaction,HourlyAchievement,MonthlyIncome,MonthlyAchievement,NumCompaniesWorked,OverTime,Incentive,RemoteWork
0,1280,7,4,64,2889,26897,1,No,0,4
1,1167,4,1,76,2517,3208,1,No,0,1
2,240,5,3,46,5744,26959,1,Yes,0,2


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   DailyAchievement         1470 non-null   int64 
 1   DistanceFromHome         1470 non-null   int64 
 2   EnvironmentSatisfaction  1470 non-null   int64 
 3   HourlyAchievement        1470 non-null   int64 
 4   MonthlyIncome            1470 non-null   int64 
 5   MonthlyAchievement       1470 non-null   int64 
 6   NumCompaniesWorked       1470 non-null   int64 
 7   OverTime                 1470 non-null   object
 8   Incentive                1470 non-null   int64 
 9   RemoteWork               1470 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 115.0+ KB


## 2. EDA

#### 1. Target 데이터

In [75]:
# Target 데이터의 값 비율

fig = go.Figure()

fig.add_trace(
    go.Pie(
        labels = ['No' , 'Yes'] ,
        values = target.value_counts() ,
        hole = 0.5
    )
)

fig.update_layout(
    title_text = 'Target Data Pie plot' ,
    title_font_size = 20
)

fig.show()

#### 2. 수치형 데이터

In [24]:
df_int = df.select_dtypes(np.int)
df_obj = df.select_dtypes(np.object)

In [25]:
df_int.head(3)

Unnamed: 0,DailyAchievement,DistanceFromHome,EnvironmentSatisfaction,HourlyAchievement,MonthlyIncome,MonthlyAchievement,NumCompaniesWorked,Incentive,RemoteWork
0,1280,7,4,64,2889,26897,1,0,4
1,1167,4,1,76,2517,3208,1,0,1
2,240,5,3,46,5744,26959,1,0,2


In [26]:
# DailyAchievement

data = pd.concat([df_int['DailyAchievement'] , target] , axis = 1)
data.head(3)

Unnamed: 0,DailyAchievement,Attrition
0,1280,No
1,1167,No
2,240,Yes


In [27]:
fig = px.histogram(
    data ,
    x = 'DailyAchievement' ,
    color = 'Attrition' ,
    marginal = 'box'
)

fig.update_layout(
    title = '<b>DailyAchievement Histogram</b>' ,
    title_font_size = 20
)

fig.show()

In [28]:
# DistanceFromHome

data = pd.concat([df_int['DistanceFromHome'] , target] , axis = 1)
data = data.groupby(['DistanceFromHome' , 'Attrition']).size().reset_index(name = 'Count')

In [29]:
fig = px.line(
    data ,
    x = 'DistanceFromHome' ,
    y = 'Count' ,
    color = 'Attrition' ,
    markers = True
)

fig.show()

In [30]:
fig = px.histogram(df ,
                    x = 'DistanceFromHome' ,
                   marginal = 'box' ,
                    color = target)

fig.show()

In [31]:
# EnvironmentSatisfaction

data = pd.concat([df_int['EnvironmentSatisfaction'] , target] , axis = 1)

fig = go.Figure()

fig= make_subplots(rows=1, cols=2,
                  specs=[[{"type": "pie"}, {"type": "bar"}]])

fig.add_trace(
    go.Pie(
        labels = data['EnvironmentSatisfaction'].value_counts().index ,
        values = data['EnvironmentSatisfaction'].value_counts() ,
        hole = 0.5 ,
        legendgroup = '1'
    ) ,
    row = 1 ,
    col = 1
)

fig.update_layout(
    legend_yanchor = 'top' ,
    legend_y = 0.45
)

yes_data = data.loc[data['Attrition'] == 'Yes' , :]
no_data = data.loc[data['Attrition'] == 'No' , :]

fig.add_trace(
    go.Bar(
        x = yes_data['EnvironmentSatisfaction'].value_counts().index ,
        y = yes_data['EnvironmentSatisfaction'].value_counts() ,
        name = 'Yes' ,
        legendgroup = '2'
    ) ,
    row = 1 ,
    col = 2
)

fig.add_trace(
    go.Bar(
        x = no_data['EnvironmentSatisfaction'].value_counts().index ,
        y = no_data['EnvironmentSatisfaction'].value_counts() ,
        name = 'No' ,
        legendgroup = '2'
    ) ,
    row = 1 ,
    col = 2
)

fig.show()

In [32]:
pct_one = (len(data.loc[(data['Attrition'] == 'Yes') & (data['EnvironmentSatisfaction'] == 1)]) / len(data.loc[data['EnvironmentSatisfaction'] == 1])) * 100
pct_two = (len(data.loc[(data['Attrition'] == 'Yes') & (data['EnvironmentSatisfaction'] == 2)]) / len(data.loc[data['EnvironmentSatisfaction'] == 2])) * 100
pct_three = (len(data.loc[(data['Attrition'] == 'Yes') & (data['EnvironmentSatisfaction'] == 3)]) / len(data.loc[data['EnvironmentSatisfaction'] == 3])) * 100
pct_four = (len(data.loc[(data['Attrition'] == 'Yes') & (data['EnvironmentSatisfaction'] == 4)]) / len(data.loc[data['EnvironmentSatisfaction']] == 4)) * 100

In [33]:
data = [pct_one , pct_two , pct_three , pct_four]

for i in range(len(data)):

    print(f"EnvironmentSatisfaction \"{i + 1}\" Attrition percentage : {round(data[i] , 3)} %")

EnvironmentSatisfaction "1" Attrition percentage : 25.352 %
EnvironmentSatisfaction "2" Attrition percentage : 14.983 %
EnvironmentSatisfaction "3" Attrition percentage : 13.687 %
EnvironmentSatisfaction "4" Attrition percentage : 4.082 %


In [34]:
# HourlyAchievement

data = pd.concat([df_int['HourlyAchievement'] , target] , axis = 1)
data.head(3)

Unnamed: 0,HourlyAchievement,Attrition
0,64,No
1,76,No
2,46,Yes


In [35]:
fig = px.histogram(
    data ,
    x = 'HourlyAchievement' ,
    color = 'Attrition' ,
    marginal = 'box'
)

fig.update_layout(
    title = '<b>HourlyAchievement Histogram</b>' ,
    title_font_size = 20
)

fig.show()

In [36]:
# MonthlyIncome

data = pd.concat([df_int['MonthlyIncome'] , target] , axis = 1)
data.head(3)

Unnamed: 0,MonthlyIncome,Attrition
0,2889,No
1,2517,No
2,5744,Yes


In [37]:
fig = px.histogram(
    data ,
    x = 'MonthlyIncome' ,
    color = 'Attrition' ,
    marginal = 'box'
)

fig.update_layout(
    title = '<b>MonthlyIncome Histogram</b>' ,
    title_font_size = 20
)

fig.show()

In [38]:

# MonthlyAchievement

data = pd.concat([df_int['MonthlyAchievement'] , target] , axis = 1)
data.head(3)

Unnamed: 0,MonthlyAchievement,Attrition
0,26897,No
1,3208,No
2,26959,Yes


In [39]:
fig = px.histogram(
    data ,
    x = 'MonthlyAchievement' ,
    color = 'Attrition' ,
    marginal = 'box'
)

fig.update_layout(
    title_font_size = 20
)

fig.show()

In [40]:
# NumCompaniesWorked

data = pd.concat([df_int['NumCompaniesWorked'] , target] , axis = 1)
data.head(3)

Unnamed: 0,NumCompaniesWorked,Attrition
0,1,No
1,1,No
2,1,Yes


In [41]:
fig = px.histogram(
    data ,
    x = 'NumCompaniesWorked' ,
    color = 'Attrition' ,
    marginal = 'box'
)

fig.update_layout(
    title = '<b>NumCompaniesWorked Histogram</b>' ,
    title_font_size = 20
)

fig.show()

In [42]:
# Incentive

data = pd.concat([df_int['Incentive'] , target] , axis = 1)
data.head(3)

Unnamed: 0,Incentive,Attrition
0,0,No
1,0,No
2,0,Yes


In [43]:
fig = px.histogram(
    data ,
    x = 'Incentive' ,
    color = 'Attrition' ,
    marginal = 'box'
)

fig.update_layout(
    title = '<b>Incentive Histogram</b>' ,
    title_font_size = 20
)

fig.show()

In [44]:
# RemoteWork

data = pd.concat([df_int['RemoteWork'] , target] , axis = 1)
data.head(3)

Unnamed: 0,RemoteWork,Attrition
0,4,No
1,1,No
2,2,Yes


In [45]:
fig = go.Figure()

fig= make_subplots(rows=1, cols=2,
                  specs=[[{"type": "pie"}, {"type": "bar"}]])

fig.add_trace(
    go.Pie(
        labels = data['RemoteWork'].value_counts().index ,
        values = data['RemoteWork'].value_counts() ,
        hole = 0.5 ,
        legendgroup = '1'
    ) ,
    row = 1 ,
    col = 1
)

fig.update_layout(
    legend_yanchor = 'top' ,
    legend_y = 0.45
)

yes_data = data.loc[data['Attrition'] == 'Yes' , :]
no_data = data.loc[data['Attrition'] == 'No' , :]

fig.add_trace(
    go.Bar(
        x = yes_data['RemoteWork'].value_counts().index ,
        y = yes_data['RemoteWork'].value_counts() ,
        name = 'Yes' ,
        legendgroup = '2'
    ) ,
    row = 1 ,
    col = 2
)

fig.add_trace(
    go.Bar(
        x = no_data['RemoteWork'].value_counts().index ,
        y = no_data['RemoteWork'].value_counts() ,
        name = 'No' ,
        legendgroup = '2'
    ) ,
    row = 1 ,
    col = 2
)

fig.show()

In [46]:
pct_zero = (len(data.loc[(data['Attrition'] == 'Yes') & (data['RemoteWork'] == 0)]) / len(data.loc[data['RemoteWork'] == 0])) * 100
pct_one = (len(data.loc[(data['Attrition'] == 'Yes') & (data['RemoteWork'] == 1)]) / len(data.loc[data['RemoteWork'] == 1])) * 100
pct_two = (len(data.loc[(data['Attrition'] == 'Yes') & (data['RemoteWork'] == 2)]) / len(data.loc[data['RemoteWork'] == 2])) * 100
pct_three = (len(data.loc[(data['Attrition'] == 'Yes') & (data['RemoteWork'] == 3)]) / len(data.loc[data['RemoteWork']] == 3)) * 100
pct_four = (len(data.loc[(data['Attrition'] == 'Yes') & (data['RemoteWork'] == 4)]) / len(data.loc[data['RemoteWork']] == 4)) * 100
pct_five = (len(data.loc[(data['Attrition'] == 'Yes') & (data['RemoteWork'] == 5)]) / len(data.loc[data['RemoteWork']] == 5)) * 100

In [47]:
data = [pct_zero , pct_one , pct_two , pct_three , pct_four , pct_five]

for i in range(len(data)):

    print(f"RemoteWork \"{i}\" Attrition percentage : {round(data[i] , 3)} %")

RemoteWork "0" Attrition percentage : 50.0 %
RemoteWork "1" Attrition percentage : 22.642 %
RemoteWork "2" Attrition percentage : 16.279 %
RemoteWork "3" Attrition percentage : 4.082 %
RemoteWork "4" Attrition percentage : 3.197 %
RemoteWork "5" Attrition percentage : 0.612 %


#### 3. 범주형 데이터

In [48]:
# OverTime

data = pd.concat([df_obj['OverTime'] , target] , axis = 1)
data.head(3)

Unnamed: 0,OverTime,Attrition
0,No,No
1,No,No
2,Yes,Yes


In [49]:
fig = go.Figure()

fig= make_subplots(rows=1, cols=2,
                  specs=[[{"type": "pie"}, {"type": "bar"}]])

fig.add_trace(
    go.Pie(
        labels = data['OverTime'].value_counts().index ,
        values = data['OverTime'].value_counts() ,
        hole = 0.5 ,
        legendgroup = '1'
    ) ,
    row = 1 ,
    col = 1
)

fig.update_layout(
    legend_yanchor = 'top' ,
    legend_y = 0.45
)

yes_data = data.loc[data['Attrition'] == 'Yes' , :]
no_data = data.loc[data['Attrition'] == 'No' , :]

fig.add_trace(
    go.Bar(
        x = yes_data['OverTime'].value_counts().index ,
        y = yes_data['OverTime'].value_counts() ,
        name = 'Yes' ,
        legendgroup = '2'
    ) ,
    row = 1 ,
    col = 2
)

fig.add_trace(
    go.Bar(
        x = no_data['OverTime'].value_counts().index ,
        y = no_data['OverTime'].value_counts() ,
        name = 'No' ,
        legendgroup = '2'
    ) ,
    row = 1 ,
    col = 2
)

fig.show()

In [50]:
pct_no = (len(data.loc[(data['Attrition'] == 'Yes') & (data['OverTime'] == 'No')]) / len(data.loc[data['OverTime'] == 'No'])) * 100
pct_yes = (len(data.loc[(data['Attrition'] == 'Yes') & (data['OverTime'] == 'Yes')]) / len(data.loc[data['OverTime'] == 'Yes'])) * 100

In [51]:
data = [pct_no , pct_yes]
category = ['No' , 'Yes']

for i in range(len(data)):

    print(f"OverTime \"{category[i]}\" Attrition percentage : {round(data[i] , 3)} %")

OverTime "No" Attrition percentage : 10.436 %
OverTime "Yes" Attrition percentage : 30.529 %


#### 4. 관계성

In [70]:
# Monthly Income

data = pd.concat([df['MonthlyIncome'] ,
                  df['Incentive'] ,
                  target] , axis = 1)

data.head(3)

Unnamed: 0,MonthlyIncome,Incentive,Attrition
0,2889,0,No
1,2517,0,No
2,5744,0,Yes


In [73]:
fig = px.scatter(
    x = data['MonthlyIncome'] ,
    y = data['Incentive'] ,
    color = target
)

fig.update_xaxes(
    title = 'MonthlyIncome'
)

fig.update_yaxes(
    title = 'Incentive'
)

fig.update_layout(
    title = '<b>月収とインセンティブのscatterplot</b>' ,
    title_font_size = 20
)

fig.show()

In [54]:
data = pd.concat([df['MonthlyIncome'] , df['Incentive'] , df['OverTime']] , axis = 1)

fig = px.scatter(
    x = data['MonthlyIncome'] ,
    y = data['Incentive'] ,
    color = data['OverTime']
)

fig.update_xaxes(
    title = 'MonthlyIncome'
)

fig.update_yaxes(
    title = 'Incentive'
)

fig.show()

In [74]:
data = pd.concat([df['MonthlyIncome'] , df['Incentive'] , org_data['Age']] , axis = 1)

fig = px.scatter(
    x = data['MonthlyIncome'] ,
    y = data['Incentive'] ,
    color = data['Age']
)

fig.update_xaxes(
    title = 'MonthlyIncome' ,
    title_font_size = 15
)

fig.update_yaxes(
    title = 'Incentive' ,
    title_font_size = 15
)

fig.update_layout(
    title = '<b>月収とインセンティブの関係性（年齢）</b>' ,
    title_font_size = 20
)

fig.show()

In [56]:
### 위에서 보는 바와 같이 젊은 나이의 사람들이 많이 이직되며
### 그 원인으로는 monthlyincome , incentive에서 많이 보여지고 있으며
### satisfaction 부분에서는 그렇게까지 보이지 않고 있다.

### 회사 차원에서의 workshop같은 것도 괜찮을지도?

In [57]:
le = LabelEncoder()

df['OverTime'] = le.fit_transform(df['OverTime'])
target = le.fit_transform(target)

In [58]:
# SMOTE

smote = SMOTE()

x_train , y_train = df , target

x_train_resample , y_train_resample = smote.fit_resample(x_train , y_train)

df , target = x_train_resample , y_train_resample

In [59]:
x_train , x_valid , y_train , y_valid = train_test_split(df , target , test_size = 0.3 , random_state = 42)

## 3. 최종 모델 설계하기

In [60]:
kfold = StratifiedKFold(n_splits = 10)

In [61]:
GBC = GradientBoostingClassifier()

gb_param_grid = {
    'loss' : ['deviance'] ,
    'n_estimators' : [100 , 200 , 300] ,
    'learning_rate' : [0.1 , 0.05 , 0.01] ,
    'max_depth' : [4 , 8] ,
    'min_samples_leaf' : [100 , 150] ,
    'max_features' : [0.3 , 0.1]
}

gsGBC = GridSearchCV(GBC , param_grid = gb_param_grid , cv = kfold , scoring = 'accuracy' ,
                     n_jobs = 4 , verbose = 1)

gsGBC.fit(x_train , y_train)

GBC_best = gsGBC.best_estimator_

Fitting 10 folds for each of 72 candidates, totalling 720 fits


KeyboardInterrupt: ignored

In [None]:
gsGBC.best_score_

In [None]:
g = plot_learning_curve(gsGBC.best_estimator_ , 'GradientBoostingClassifier curves' , x_train , y_train , cv = kfold)

In [None]:
GBC_best.fit(x_train , y_train)

In [None]:
prediction = GBC_best.predict(x_valid)

In [None]:
print(f"Training Data Score : {GBC_best.score(x_train , y_train)}")
print(f"Test Data Score : {GBC_best.score(x_valid , y_valid)}")

In [None]:
metrics(prediction , y_valid)