# 머신러닝 복습


# 1.환경준비

* 라이브러리 Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import  MinMaxScaler

# 2.Classification

## (1) 데이터 전처리
* 데이터 준비
* 가변수화
* 스케일링(필요하다면)
* 데이터 분할

### 1) 데이터 준비

In [2]:
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.1.csv'
data = pd.read_csv(path, usecols = ['Survived','Pclass','Title','Sex','Age','Fare','Embarked','AgeGroup'])
data.loc[data['AgeGroup']== 'Age_80', 'AgeGroup'] = 'Age_70'
data.head()

Unnamed: 0,Survived,Pclass,Title,Sex,Age,Fare,Embarked,AgeGroup
0,0,3,Mr,male,22.0,7.25,S,Age_20
1,1,1,Mrs,female,38.0,71.2833,C,Age_30
2,1,3,Miss,female,26.0,7.925,S,Age_20
3,1,1,Mrs,female,35.0,53.1,S,Age_30
4,0,3,Mr,male,35.0,8.05,S,Age_30


In [3]:
target = 'Survived'
x = data.drop([target], axis=1)
y = data.loc[:, target]

### 2) 데이터분할

In [4]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.4, random_state = 10)

In [5]:
validation = x_val.copy()
validation['Survived'] = y_val

### 3) 가변수화

In [6]:
cat_cols = ['Title','Sex','Embarked','AgeGroup']
x_train = pd.get_dummies(x_train, columns = cat_cols, drop_first = True)
x_val = pd.get_dummies(x_val, columns = cat_cols, drop_first = True)

### 4) Scaling

In [7]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

## (2) 모델링
* 필요한 함수들 불러오기
* 모델 선언
* 학습
* 예측
* 성능 검증

In [8]:
# 함수들 불러오기
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

* 모델1

In [9]:
model1 = LogisticRegression()
model1.fit(x_train, y_train)
pred1 = model1.predict(x_val)

print(confusion_matrix(y_val, pred1))
print(classification_report(y_val, pred1))

[[185  44]
 [ 32  96]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       229
           1       0.69      0.75      0.72       128

    accuracy                           0.79       357
   macro avg       0.77      0.78      0.77       357
weighted avg       0.79      0.79      0.79       357



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


* 모델2

In [10]:
model2 = KNeighborsClassifier(n_neighbors = 3)
model2.fit(x_train_s, y_train)
pred2 = model2.predict(x_val_s)

print(confusion_matrix(y_val, pred2))
print(classification_report(y_val, pred2))

[[194  35]
 [ 32  96]]
              precision    recall  f1-score   support

           0       0.86      0.85      0.85       229
           1       0.73      0.75      0.74       128

    accuracy                           0.81       357
   macro avg       0.80      0.80      0.80       357
weighted avg       0.81      0.81      0.81       357



* 모델3

In [11]:
model3 = RandomForestClassifier(n_estimators = 200, max_features = 10, max_depth = 4)
model3.fit(x_train, y_train)
pred3 = model3.predict(x_val)

print(confusion_matrix(y_val, pred3))
print(classification_report(y_val, pred3))

[[202  27]
 [ 38  90]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       229
           1       0.77      0.70      0.73       128

    accuracy                           0.82       357
   macro avg       0.81      0.79      0.80       357
weighted avg       0.82      0.82      0.82       357



* 모델4

In [12]:
model4 = XGBClassifier()
model4.fit(x_train, y_train)
pred4 = model4.predict(x_val)

print(confusion_matrix(y_val, pred4))
print(classification_report(y_val, pred4))

[[187  42]
 [ 33  95]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       229
           1       0.69      0.74      0.72       128

    accuracy                           0.79       357
   macro avg       0.77      0.78      0.77       357
weighted avg       0.79      0.79      0.79       357



## (3) 결과 저장하고 csv로 내보내기 

In [13]:
pred = [pred1, pred2, pred3, pred4]
model = ['pred_LR','pred_KNN','pred_RF','pred_XGB']
for i in range(4) :
    colname = model[i]
    validation[colname] = pred[i]

validation.head() 

Unnamed: 0,Pclass,Title,Sex,Age,Fare,Embarked,AgeGroup,Survived,pred_LR,pred_KNN,pred_RF,pred_XGB
590,3,Mr,male,35.0,7.125,S,Age_30,0,0,0,0,0
131,3,Mr,male,20.0,7.05,S,Age_20,0,0,0,0,0
628,3,Mr,male,26.0,7.8958,S,Age_20,0,0,0,0,0
195,1,Miss,female,58.0,146.5208,C,Age_50,1,1,1,1,1
230,1,Mrs,female,35.0,83.475,S,Age_30,1,1,1,1,1


In [14]:
validation.to_csv('result.csv', index = False)