### 지도학습

#### 분류 - 이진분류

In [55]:
import pandas as pd

In [56]:
df_TFD = pd.read_csv('../../../datasets/TitanicFromDisaster_train.csv')
df_TFD[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [57]:
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [58]:
df_TFD_extract = df_TFD[['Survived', 'Pclass','Age']]
df_TFD_extract

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0
2,1,3,26.0
3,1,1,35.0
4,0,3,35.0
...,...,...,...
886,0,2,27.0
887,1,1,19.0
888,0,3,
889,1,1,26.0


In [59]:
df_TFD_extract.isnull().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [60]:
df_TFD_extract_preprocess = df_TFD_extract.dropna()
df_TFD_extract_preprocess

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0
2,1,3,26.0
3,1,1,35.0
4,0,3,35.0
...,...,...,...
885,0,3,39.0
886,0,2,27.0
887,1,1,19.0
889,1,1,26.0


#### 정형화 단계

In [61]:
from sklearn.model_selection import train_test_split
target = df_TFD_extract_preprocess['Survived']
features = df_TFD_extract_preprocess[['Pclass','Age']]

In [62]:
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((535, 2), (535,), (179, 2), (179,))

In [63]:
target_train = df_TFD_extract_preprocess['Survived']
features_train = df_TFD_extract_preprocess[['Pclass','Age']]
target_train.shape, features_train.shape

((714,), (714, 2))

#### Model학습

In [64]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(features_train,target_train)

In [65]:
model.coef_, model.intercept_

(array([[-1.22653571, -0.04149665]]), array([3.532956]))

#### 예측

In [66]:
df_TFD_extract_preprocess[10:15]

Unnamed: 0,Survived,Pclass,Age
11,1,1,58.0
12,0,3,20.0
13,0,3,39.0
14,0,3,14.0
15,1,2,55.0


In [77]:
df_TFD_extract_preprocess.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Age       714 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 22.3 KB


In [67]:
model.predict(features_train[10:20])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [68]:
model.predict_proba(features_train[10:20])

array([[0.52507531, 0.47492469],
       [0.72642991, 0.27357009],
       [0.85383733, 0.14616267],
       [0.67427932, 0.32572068],
       [0.768957  , 0.231043  ],
       [0.55716013, 0.44283987],
       [0.80737568, 0.19262432],
       [0.59206249, 0.40793751],
       [0.58200305, 0.41799695],
       [0.68332637, 0.31667363]])

#### 평가

In [69]:
target_train_predict = model.predict(features_train)
target_train_predict.shape # target_train.shape 동일

(714,)

In [70]:
from sklearn.metrics import accuracy_score # 정확도

In [71]:
accuracy_score(target_train,target_train_predict) # 교내 시험

0.696078431372549

In [72]:
target_test_predict = model.predict(features_test)
target_test_predict.shape # target_test.shape 동일

(179,)

In [73]:
accuracy_score(target_test, target_test_predict)

0.6703910614525139

In [74]:
from sklearn.metrics import classification_report

In [75]:
print(classification_report(target_train,target_train_predict))

              precision    recall  f1-score   support

           0       0.71      0.82      0.76       424
           1       0.66      0.52      0.58       290

    accuracy                           0.70       714
   macro avg       0.69      0.67      0.67       714
weighted avg       0.69      0.70      0.69       714



In [76]:
print(classification_report(target_test,target_test_predict))

              precision    recall  f1-score   support

           0       0.73      0.76      0.74       112
           1       0.56      0.52      0.54        67

    accuracy                           0.67       179
   macro avg       0.65      0.64      0.64       179
weighted avg       0.67      0.67      0.67       179

