1. 데이터 분석
    1. 표로 결과를 만들기
    2. 데이터를 시각화하기
2. 전처리
    1. 결측치제거
    2. 결측치메우기
    3. 라벨인코딩
    4. 원핫인코딩
    5. 표준화
    6. 데이터 선택
3. 훈련 
    1. 퍼셉트론
    2. 로지스틱회귀
    3. KNN
    4. SVM
    5. 결정트리
    6. 랜덤포레스트
4. 모델평가
    1. 검증곡선 - 시각화
    2. k겹 교차검증
5. 최적의 파라미터 찾기
   


In [2]:
import pandas as pd
import numpy as np
import copy

In [3]:
# Load the raw train/test CSV files once and keep these frames untouched.
df_train_raw = pd.read_csv("train.csv")
df_test_raw = pd.read_csv("test.csv")

# Work on independent deep copies so that, if a later step goes wrong, the
# pristine *_raw frames can be reused without re-reading the files.
# DataFrame.copy(deep=True) states the deep copy explicitly instead of going
# through copy.copy(), whose name suggests a shallow copy.
df_train = df_train_raw.copy(deep=True)
df_test = df_test_raw.copy(deep=True)

In [4]:
# Preview the first rows of the training frame (head() shows 5 rows by default).
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Preview the first rows of the test frame (head() shows 5 rows by default).
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Report the number of missing values per training column, then the frame's
# shape on a single line, followed by a blank line.
print("---train set data---", df_train.isnull().sum(), sep="\n")
print("데이터 shape :", df_train.shape, sep='')
print()


---train set data---
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
데이터 shape :(891, 12)



In [7]:
# Report the number of missing values per test column, then the frame's
# shape on a single line, followed by a blank line.
print("---test set data---", df_test.isnull().sum(), sep="\n")
print("데이터 shape :", df_test.shape, sep='')
print()

---test set data---
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
데이터 shape :(418, 11)



### 결측치

#### 제거할 열들
- Age : 177, 86 (train, test) — 중요한 정보가 있는 것 같기는 하나, 결측치가 너무 많다.
- Cabin : 687, 327 (train, test) — 결측치가 너무 많다.

#### 제거할 행들
- test셋의 데이터들은 어떤 행도 제거되어서는 안 된다.
- train셋에서 Embarked가 결측인 2개의 행을 없애야 한다.

#### 채워넣을 열들
- test셋의 Fare(결측 1개)는 평균값으로 채워 넣는다.

In [8]:
# Remove the 'Age' and 'Cabin' columns from both the training and test frames.
df_train = df_train.drop(['Age', 'Cabin'], axis='columns')
df_test = df_test.drop(['Age', 'Cabin'], axis='columns')

In [9]:
# Discard every training row that still contains a missing value, then
# re-print the per-column missing counts and the resulting shape.
df_train = df_train.dropna(axis=0, how='any')
print("---train set data---", df_train.isnull().sum(), sep="\n")
print("데이터 shape :", df_train.shape, sep='')
print()

---train set data---
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
데이터 shape :(889, 10)



In [10]:
# Fill the missing test-set 'Fare' with the column mean so that no test row
# has to be dropped, then re-print the missing counts and the shape.
# NOTE(review): the mean is computed from the test set itself; confirm whether
# the training-set mean should be used instead.
df_test['Fare'] = df_test['Fare'].fillna(value=df_test['Fare'].mean())
print("---test set data---", df_test.isnull().sum(), sep="\n")
print("데이터 shape :", df_test.shape, sep='')
print()

---test set data---
PassengerId    0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
데이터 shape :(418, 9)



In [11]:
# Take an independent copy of the 'Embarked' column and show how often each
# category occurs.
df_embarked_raw1 = df_train['Embarked'].copy()
df_embarked_raw1.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

## 판다스의 mapping을 이용해서 라벨인코딩하기

#### 딕셔너리를 param으로 사용

In [12]:
# Hand-picked integer code for each 'Embarked' category label.
embarked_mapping = dict(S=1, C=2, Q=3)

In [13]:
# Label-encode by dictionary lookup: each value is replaced with its code
# from embarked_mapping.
df_embarked_raw1.map(embarked_mapping)

0      1
1      2
2      1
3      1
4      1
      ..
886    1
887    1
888    1
889    2
890    3
Name: Embarked, Length: 889, dtype: int64

#### 사용자 정의 함수를 파라미터로 사용해서 라벨인코딩

In [14]:
def embarked_mapping_F(x):
    """Return the integer code for a single 'Embarked' value.

    'S' maps to 1 and 'C' maps to 2. Every other value, including 'Q',
    falls through to 3.
    """
    for label, code in (('S', 1), ('C', 2)):
        if x == label:
            return code
    return 3

In [15]:
# Label-encode by calling embarked_mapping_F on every value of the column.
df_embarked_raw1.map(embarked_mapping_F)

0      1
1      2
2      1
3      1
4      1
      ..
886    1
887    1
888    1
889    2
890    3
Name: Embarked, Length: 889, dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder

# Build the encoder that turns category labels into integer codes.
label_encoder = LabelEncoder()

# Take a fresh copy of 'Embarked' and encode it; the cell displays the
# resulting integer array.
df_embarked_raw1 = df_train['Embarked'].copy()
label_encoder.fit_transform(df_embarked_raw1)

array([2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2,
       1, 2, 2, 2, 0, 2, 1, 2, 0, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0,
       1, 2, 1, 1, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       0, 2, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 0, 1, 2, 0, 2, 0, 2, 2,
       2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2,
       0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2, 1,
       2, 2, 2, 2, 0, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 0, 2, 2, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2,
       2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 0, 2,
       2, 2, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 1, 1,

## 판다스 멤버 함수로 원핫 인코딩하고 원래 데이터셋에 덧붙이기

In [17]:
# Take another independent copy of 'Embarked' for the one-hot examples and
# show how often each category occurs.
df_embarked_raw2 = df_train['Embarked'].copy()
df_embarked_raw2.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [18]:
# The dummy columns come out as True/False booleans, which is fine here.
pd.get_dummies(df_embarked_raw2)

Unnamed: 0,C,Q,S
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
886,False,False,True
887,False,False,True
888,False,False,True
889,True,False,False


## 사이킷런으로 원-핫 인코딩후, 원래 데이터프레임에 덧붙이기

In [19]:
from sklearn.preprocessing import OneHotEncoder
# NOTE: no multicollinearity guard is applied here. OneHotEncoder() is built
# without a `drop` argument, so one column per category (C, Q, S) is kept.
# Passing drop='first' would remove one column, but the labels taken from
# categories_[0] below would then no longer match the encoded width.

color_ohe = OneHotEncoder()
# Encode the values as a single-feature 2-D array and convert the result to a
# dense array.
encoded_array = color_ohe.fit_transform(df_embarked_raw2.values.reshape(-1, 1)).toarray()
# Reuse df_train's index so the encoded rows line up with the training rows.
encoded_df = pd.DataFrame(encoded_array, index=df_train.index, columns=color_ohe.categories_[0])

In [20]:
# Preview the first five rows of the one-hot encoded frame.
encoded_df.head(5)

Unnamed: 0,C,Q,S
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [21]:
# Work on an independent copy so that df_train itself is left unchanged.
df_train_copy = df_train.copy()

In [22]:
# Append the one-hot columns to the working copy. The column labels come from
# encoded_df itself rather than a hard-coded ['C', 'Q', 'S'] list, so they
# always follow the encoder's category order; rows are matched on the index.
df_train_copy[list(encoded_df.columns)] = encoded_df
df_train_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C,1.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S,0.0,0.0,1.0


In [23]:
# Take an independent copy of the 'Fare' column for the standardization example.
df_train_ex = df_train['Fare'].copy()

In [24]:
# Preview the first five 'Fare' values before standardization.
df_train_ex.head(5)

0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64

In [None]:
# Standardize 'Fare' to z-scores: subtract the mean, then divide by the
# standard deviation. Series.std() uses the sample standard deviation
# (ddof=1) by default.
df_train_ex = df_train_ex.sub(df_train_ex.mean()).div(df_train_ex.std())
df_train_ex.head(5)

0   -0.499958
1    0.788503
2   -0.486376
3    0.422623
4   -0.483861
Name: Fare, dtype: float64