# 연습
- 타이타닉 데이터에 KNN을 통해 분류 진행
- 스케일러(표준화, 로버스트, 최소-최대)별 결과 비교

#### 스케일링 종류
|종류|설명|
|-|-|
|표준화 스케일링 Standard Scaling|평균이 0이 되고, 표준편차가 1이 되도록 데이터를 고르게 분포시키는 데 사용|
|로버스트 스케일링 Robust Scaling|데이터에 아웃라이어가 존재할 때 사용. 중앙값과 사분위 범위(IQR)를 기준으로 스케일링하므로 아웃라이어가 스케일링 기준에 주는 영향이 작음(아웃라이어 값 자체는 변환 후에도 남음)|
|최소-최대 스케일링 Min-Max Scaling|데이터 분포의 특성을 최대한 그대로 유지하고 싶을 때 사용|
|정규화 스케일링 Normalizer Scaling|행 기준의 스케일링이 필요할 때 사용하나, 실제로 거의 사용하지 않음|

In [1]:
#@title Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{: .2f}'.format

In [2]:
# Base URL of the course data directory (note that it already ends with "/").
file_url = "https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/"
# Strip the trailing slash before joining so the request URL has no "//".
# index_col=0 turns the first CSV column (PassengerId) into the index.
sheep = pd.read_csv(f"{file_url.rstrip('/')}/titanic_train.csv", index_col=0)

In [3]:
# Preview the first five rows of the raw data.
sheep.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#@title Inspect the data
# Column dtypes and non-null counts.
sheep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


- 결측치 유무 확인
  - 존재함('Age', 'Cabin', 'Embarked')
- 타입 통일성 확인
  - object 존재함 ('Name', 'Sex', 'Ticket', 'Cabin', 'Embarked')

In [5]:
# Summary statistics of the numeric columns.
sheep.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.38,2.31,29.7,0.52,0.38,32.2
std,0.49,0.84,14.53,1.1,0.81,49.69
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.12,0.0,0.0,7.91
50%,0.0,3.0,28.0,0.0,0.0,14.45
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.33


- 이상치 확인
  - 존재하는 것 같음 ('Fare' 최대가 512.33 인데 최소가 0)

In [6]:
#@title Fill missing values
# Number of missing values per column.
sheep.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
# Distinct values of 'Embarked', including NaN if any are missing.
sheep['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [8]:
# Fill the missing embarkation ports with 'S'.
sheep['Embarked'] = sheep['Embarked'].fillna('S')

In [9]:
# Confirm that 'Embarked' no longer contains NaN.
sheep['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [10]:
# 'Cabin' has too many missing values to be useful, so remove it in place.
sheep.drop('Cabin', axis=1, inplace=True)
sheep

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,C
3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.92,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,S
5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.05,S
...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.00,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.00,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,S
890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.00,C


In [11]:
# Create a 'Title' column holding the honorific taken from 'Name': the first
# run of letters that is immediately followed by "." (e.g. "Mr", "Mrs").
# The pattern is a raw string so that "\." is a regex escape rather than an
# invalid Python string escape; expand=False returns a Series for the single
# capture group.
sheep['Title'] = sheep['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
sheep.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C,Mrs
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,S,Miss
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [12]:
# Distinct honorifics extracted from the names.
sheep['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [13]:
# How often each honorific occurs.
sheep['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

In [14]:
# Goal: merge the infrequent titles (Dr ... Jonkheer) into one 'Rare' value.
# A title counts as rare when it occurs fewer than 10 times.
# Counting once with value_counts() and mapping the counts back onto the rows
# avoids rescanning the whole column for every row.
title_counts = sheep['Title'].value_counts()
is_rare = sheep['Title'].map(title_counts) < 10
# One entry per matching row, in row order, so a title can repeat in the list.
rarelist = sheep.loc[is_rare, 'Title'].tolist()
rarelist

['Don',
 'Rev',
 'Rev',
 'Dr',
 'Rev',
 'Dr',
 'Mme',
 'Dr',
 'Ms',
 'Major',
 'Major',
 'Lady',
 'Sir',
 'Rev',
 'Dr',
 'Mlle',
 'Col',
 'Dr',
 'Col',
 'Mlle',
 'Capt',
 'Countess',
 'Dr',
 'Dr',
 'Jonkheer',
 'Rev',
 'Rev']

In [15]:
# Replace every title listed in rarelist with the single value 'Rare'.
sheep['Title'] = sheep['Title'].replace(rarelist, 'Rare')

In [16]:
# Title frequencies after merging the rare ones.
sheep['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Rare       27
Name: Title, dtype: int64

In [17]:
# Honorifics carry age information, so a passenger's age can be estimated
# from the title.
# Mean age per title, computed once and kept for filling the missing ages.
title_age_mean = sheep.groupby('Title')['Age'].mean()
title_age_mean

Title
Master     4.57
Miss      21.77
Mr        32.37
Mrs       35.90
Rare      42.38
Name: Age, dtype: float64

In [18]:
# Titles remaining after the merge; these are the keys of title_age_mean.
sheep['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], dtype=object)

In [19]:
# Fill each missing 'Age' with the mean age of that passenger's title.
# map() looks up the per-title mean for every row; fillna() only uses it
# where 'Age' is missing, so existing ages are left untouched.
sheep['Age'] = sheep['Age'].fillna(sheep['Title'].map(title_age_mean))


### 결측치 정리 끝

In [20]:
# Verify that no missing values remain.
sheep.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
Title       0
dtype: int64

In [21]:
#@title Drop the columns that are no longer needed and keep the result
# 'Name', 'Ticket' and 'Title' are removed; sheep itself is left unchanged.
fsheep = sheep.drop(['Name', 'Ticket', 'Title'], axis=1)
fsheep.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.28,C
3,1,3,female,26.0,0,0,7.92,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [22]:
# One-hot encode the categorical columns with get_dummies.
# drop_first=True drops the first level of each variable, leaving
# Sex_male, Embarked_Q and Embarked_S.
fsheep = pd.get_dummies(fsheep,
                          columns = ['Sex', 'Embarked'], drop_first=True)
fsheep.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,1,0,7.25,1,0,1
2,1,1,38.0,1,0,71.28,0,0,0
3,1,3,26.0,0,0,7.92,0,0,1
4,1,1,35.0,1,0,53.1,0,0,1
5,0,3,35.0,0,0,8.05,1,0,1


# 스케일링 해보기
#### 스케일링 종류
|종류|설명|
|-|-|
|표준화 스케일링 Standard Scaling|평균이 0이 되고, 표준편차가 1이 되도록 데이터를 고르게 분포시키는 데 사용|
|로버스트 스케일링 Robust Scaling|데이터에 아웃라이어가 존재할 때 사용. 중앙값과 사분위 범위(IQR)를 기준으로 스케일링하므로 아웃라이어가 스케일링 기준에 주는 영향이 작음(아웃라이어 값 자체는 변환 후에도 남음)|
|최소-최대 스케일링 Min-Max Scaling|데이터 분포의 특성을 최대한 그대로 유지하고 싶을 때 사용|
|정규화 스케일링 Normalizer Scaling|행 기준의 스케일링이 필요할 때 사용하나, 실제로 거의 사용하지 않음|

In [23]:
# 스케일링 import 하기
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [29]:
#@title Standard scaling
# NOTE: fsheep still contains the 'Survived' target column, so it is scaled
# together with the features here.
st_scaler = StandardScaler()
# Learn each column's mean/std and apply the transformation in one call.
sheep_st_scaled = st_scaler.fit_transform(fsheep)
sheep_st_scaled

array([[-0.78927234,  0.82737724, -0.58440196, ...,  0.73769513,
        -0.30756234,  0.61583843],
       [ 1.2669898 , -1.56610693,  0.62131336, ..., -1.35557354,
        -0.30756234, -1.62380254],
       [ 1.2669898 ,  0.82737724, -0.28297313, ..., -1.35557354,
        -0.30756234,  0.61583843],
       ...,
       [-0.78927234,  0.82737724, -0.60143475, ..., -1.35557354,
        -0.30756234,  0.61583843],
       [ 1.2669898 , -1.56610693, -0.28297313, ...,  0.73769513,
        -0.30756234, -1.62380254],
       [-0.78927234,  0.82737724,  0.16917012, ...,  0.73769513,
         3.25137334, -1.62380254]])

In [27]:
# Show three decimal places from here on.
pd.set_option('display.float_format', '{: .3f}'.format)
# Wrap the scaled array in a DataFrame, reusing fsheep's column names.
# No index is passed, so the rows get a fresh 0-based index instead of
# PassengerId.
sheep_st_df = pd.DataFrame(data=sheep_st_scaled, columns=fsheep.columns)
sheep_st_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,-0.789,0.827,-0.584,0.433,-0.474,-0.502,0.738,-0.308,0.616
1,1.267,-1.566,0.621,0.433,-0.474,0.787,-1.356,-0.308,-1.624
2,1.267,0.827,-0.283,-0.475,-0.474,-0.489,-1.356,-0.308,0.616
3,1.267,-1.566,0.395,0.433,-0.474,0.421,-1.356,-0.308,0.616
4,-0.789,0.827,0.395,-0.475,-0.474,-0.486,0.738,-0.308,0.616
...,...,...,...,...,...,...,...,...,...
886,-0.789,-0.369,-0.208,-0.475,-0.474,-0.387,0.738,-0.308,0.616
887,1.267,-1.566,-0.810,-0.475,-0.474,-0.044,-1.356,-0.308,0.616
888,-0.789,0.827,-0.601,0.433,2.009,-0.176,-1.356,-0.308,0.616
889,1.267,-1.566,-0.283,-0.475,-0.474,-0.044,0.738,-0.308,-1.624


In [None]:
# Summary statistics after standard scaling.
sheep_st_df.describe()

In [31]:
#@title Robust scaling
rb_scaler = RobustScaler()
# Fit and transform in one step.
# NOTE(review): "rb_scalded" looks like a typo for "rb_scaled"; the name is
# kept as is because other cells may reference it.
rb_scalded = rb_scaler.fit_transform(fsheep)
# Back to a DataFrame with the original column names.
sheep_rb_df = pd.DataFrame(rb_scalded, columns = fsheep.columns)
sheep_rb_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,0.0,-0.566,1.0,0.0,-0.312,0.0,0.0,0.0
1,1.0,-2.0,0.566,1.0,0.0,2.461,-1.0,0.0,-1.0
2,1.0,0.0,-0.283,0.0,0.0,-0.283,-1.0,0.0,0.0
3,1.0,-2.0,0.354,1.0,0.0,1.674,-1.0,0.0,0.0
4,0.0,0.0,0.354,0.0,0.0,-0.277,0.0,0.0,0.0


In [32]:
# Summary statistics after robust scaling.
sheep_rb_df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.384,-0.691,-0.017,0.523,0.382,0.769,-0.352,0.086,-0.275
std,0.487,0.836,0.94,1.103,0.806,2.152,0.478,0.281,0.447
min,0.0,-2.0,-2.094,0.0,0.0,-0.626,-1.0,0.0,-1.0
25%,0.0,-1.0,-0.582,0.0,0.0,-0.283,-1.0,0.0,-1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.418,1.0,0.0,0.717,0.0,0.0,0.0
max,1.0,0.0,3.54,8.0,6.0,21.563,0.0,1.0,0.0


In [33]:
#@title Min-max scaling
mm_scaler = MinMaxScaler()
# Fit and transform in one step.
mm_scaled = mm_scaler.fit_transform(fsheep)
# Back to a DataFrame with the original column names.
sheep_mm_df = pd.DataFrame(mm_scaled, columns = fsheep.columns)
sheep_mm_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.384,0.654,0.369,0.065,0.064,0.063,0.648,0.086,0.725
std,0.487,0.418,0.167,0.138,0.134,0.097,0.478,0.281,0.447
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.5,0.268,0.0,0.0,0.015,0.0,0.0,0.0
50%,0.0,1.0,0.372,0.0,0.0,0.028,1.0,0.0,1.0
75%,1.0,1.0,0.446,0.125,0.0,0.061,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
# Summary statistics after min-max scaling.
sheep_mm_df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.384,0.654,0.369,0.065,0.064,0.063,0.648,0.086,0.725
std,0.487,0.418,0.167,0.138,0.134,0.097,0.478,0.281,0.447
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.5,0.268,0.0,0.0,0.015,0.0,0.0,0.0
50%,0.0,1.0,0.372,0.0,0.0,0.028,1.0,0.0,1.0
75%,1.0,1.0,0.446,0.125,0.0,0.061,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
#@title Split into training and test sets

from sklearn.model_selection import train_test_split

# Features are every column except the 'Survived' target.
features = fsheep.drop(columns='Survived')
target = fsheep['Survived']
# Hold out 20% of the rows for testing; the seed is fixed at 100.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=100
)

In [36]:
#@title Min-max scaling applied separately to the train and test sets
# A new scaler replaces the mm_scaler fitted on the whole of fsheep earlier.
mm_scaler = MinMaxScaler()
# Fit on the training features only ...
X_train_scaled = mm_scaler.fit_transform(X_train)
# ... and reuse the fitted scaler to transform the test features.
X_test_scaled = mm_scaler.transform(X_test)

In [37]:
#@title Modeling, prediction and evaluation
from sklearn.neighbors import KNeighborsClassifier
# Classifier with default settings; fit() returns the estimator itself.
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
# Predicted labels for the scaled test features.
pred = knn.predict(X_test_scaled)
pred

array([1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1])

In [38]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions against the true test labels.
accuracy_score(y_test, pred)

0.8044692737430168

In [39]:
#@title Helper function for hyperparameter tuning
def tuning(n):
    """Return the test-set accuracy of a KNN classifier with ``n`` neighbors.

    Fits a fresh ``KNeighborsClassifier(n_neighbors=n)`` on the module-level
    ``X_train_scaled`` / ``y_train`` and scores its predictions for
    ``X_test_scaled`` against ``y_test`` with ``accuracy_score``.
    """
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(X_train_scaled, y_train)
    return accuracy_score(y_test, model.predict(X_test_scaled))

In [40]:
# Test accuracy with 7 neighbors.
tuning(7)

0.8212290502793296

In [41]:
# (n_neighbors, test accuracy) pairs for n_neighbors = 1 .. 20.
scores = [(k, tuning(k)) for k in range(1, 21)]
scores

[(1, 0.7653631284916201),
 (2, 0.7877094972067039),
 (3, 0.7821229050279329),
 (4, 0.7821229050279329),
 (5, 0.8044692737430168),
 (6, 0.8044692737430168),
 (7, 0.8212290502793296),
 (8, 0.8324022346368715),
 (9, 0.8379888268156425),
 (10, 0.8212290502793296),
 (11, 0.8379888268156425),
 (12, 0.8044692737430168),
 (13, 0.8100558659217877),
 (14, 0.8100558659217877),
 (15, 0.8100558659217877),
 (16, 0.8100558659217877),
 (17, 0.8044692737430168),
 (18, 0.8044692737430168),
 (19, 0.8044692737430168),
 (20, 0.8100558659217877)]