### 지도학습

#### 분류 - 이진분류

In [1]:
import pandas as pd

In [2]:
# Read the Titanic training CSV (path is relative to this notebook's folder)
# and preview the first two rows.
df_TFD = pd.read_csv(filepath_or_buffer='../../../datasets/TitanicFromDisaster_train.csv')
df_TFD.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [3]:
# List the column labels of the raw frame before choosing which ones to keep.
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Keep only the label (Survived) and the two predictors used in this notebook.
df_TFD_extract = df_TFD.loc[:, ['Survived', 'Pclass', 'Age']]
df_TFD_extract

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0
2,1,3,26.0
3,1,1,35.0
4,0,3,35.0
...,...,...,...
886,0,2,27.0
887,1,1,19.0
888,0,3,
889,1,1,26.0


In [5]:
# Count the missing values in each selected column.
df_TFD_extract.isna().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [6]:
# Discard every row that contains a missing value; the result is a new frame,
# so df_TFD_extract itself is left unchanged.
df_TFD_extract_preprocess = df_TFD_extract.dropna(axis=0, how='any')
df_TFD_extract_preprocess

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0
2,1,3,26.0
3,1,1,35.0
4,0,3,35.0
...,...,...,...
885,0,3,39.0
886,0,2,27.0
887,1,1,19.0
889,1,1,26.0


#### Scaling & Encoding

##### Encoding with OneHotEncoding

In [7]:
# Count the rows per Pclass value to see which categories will be one-hot encoded.
df_TFD_extract_preprocess['Pclass'].value_counts()

3    355
1    186
2    173
Name: Pclass, dtype: int64

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Create a one-hot encoder with default settings and fit it on Pclass, passed
# as a one-column DataFrame (double brackets) rather than a Series.
oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(df_TFD_extract_preprocess[['Pclass']]) # learns the category values present in this column

In [10]:
# Keep the category values the encoder learned during fit.
# NOTE(review): columns_name is not referenced again in the visible cells —
# confirm whether it is still needed.
columns_name = oneHotEncoder.categories_

In [11]:
# Encode Pclass, then call .toarray() so the encoder output becomes a NumPy
# array that can be inspected directly and wrapped in a DataFrame.
pclass_onehot = oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']])
encoded_data = pclass_onehot.toarray()
encoded_data.shape

(714, 3)

In [12]:
# Wrap the encoded NumPy array in a DataFrame so it can be merged with the
# other columns; column labels come from the fitted encoder.
df_encoded_data = pd.DataFrame(
    encoded_data,
    columns=oneHotEncoder.get_feature_names_out(['Pclass']),
)
df_encoded_data.head(2)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0.0,0.0,1.0
1,1.0,0.0,0.0


In [13]:
# Inspect the index and shape of the encoded frame before merging it with the
# cleaned rows (the two frames are concatenated column-wise later).
df_encoded_data.index, df_encoded_data.shape

(RangeIndex(start=0, stop=714, step=1), (714, 3))

In [14]:
# Confirm that no missing values remain after dropna().
df_TFD_extract_preprocess.isna().sum()

Survived    0
Pclass      0
Age         0
dtype: int64

In [15]:
# df_encoded_data = pd.get_dummies(df_TFD_extract_preprocess['Pclass'],prefix='Pclass') # prefix='Pclass' 컬럼이름 추가
# df_encoded_data

In [16]:
# Join the cleaned rows with their one-hot columns side by side.
# Both indexes are reset to 0..n-1 because dropna() left gaps in the row labels
# while df_encoded_data carries a fresh RangeIndex; concat aligns on labels, so
# without the reset the rows would be mismatched and padded with NaN.
# Only the three base columns are taken from the left frame: this cell rebinds
# its own input name, and selecting the base columns keeps a re-run from
# appending a second copy of the Pclass_* columns.
base_columns = ['Survived', 'Pclass', 'Age']
df_TFD_extract_preprocess = pd.concat(
    [
        df_TFD_extract_preprocess[base_columns].reset_index(drop=True),
        df_encoded_data.reset_index(drop=True),
    ],
    axis=1,
)
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age,Pclass_1,Pclass_2,Pclass_3
0,0,3,22.0,0.0,0.0,1.0
1,1,1,38.0,1.0,0.0,0.0


In [17]:
# Check the column labels after the one-hot columns were appended.
df_TFD_extract_preprocess.columns

Index(['Survived', 'Pclass', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

In [18]:
# Label column for the binary classifier.
target = df_TFD_extract_preprocess['Survived']

In [19]:
# Predictors: everything except the label and the raw Pclass column, which is
# already represented by the Pclass_* one-hot columns.
features = df_TFD_extract_preprocess.drop(['Pclass', 'Survived'], axis='columns')

In [20]:
# Column order of the feature matrix; inputs built by hand for prediction must
# follow this same order.
features.columns

Index(['Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

#### MinMaxScaler

In [21]:
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Rescale every feature column with a MinMaxScaler (default settings).
# NOTE(review): the scaler is fitted on all rows before train_test_split runs,
# so the held-out rows influence the learned min/max — consider fitting on the
# training split only.
minMaxScaler = MinMaxScaler()  # instantiate
minMaxScaler.fit(features)
features = minMaxScaler.transform(features)  # rebinds features to the scaled values
features.shape

(714, 4)

#### 정형화 단계

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out part of the rows for evaluation; random_state pins the split so the
# same rows are selected on every run.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    random_state=111,
)
tuple(part.shape for part in (features_train, target_train, features_test, target_test))

((535, 4), (535,), (179, 4), (179,))

In [25]:
# target_train = df_TFD_extract_preprocess['Survived']
# features_train = df_TFD_extract_preprocess[['Pclass','Age']]
# target_train.shape, features_train.shape

#### Model학습

In [26]:
# Fit a logistic-regression classifier with default hyperparameters on the
# training split.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(features_train,target_train)

In [27]:
# Learned coefficients (one per feature column) and the intercept.
model.coef_, model.intercept_
#  (array([[-1.22653571, -0.04149665]]), array([3.532956])) values recorded before normalization

(array([[-2.59255724,  1.11141847,  0.02997489, -1.14146766]]),
 array([0.86447724]))

#### 예측

In [28]:
# Preview five rows (positions 10-14) of the merged frame.
df_TFD_extract_preprocess.iloc[10:15]

Unnamed: 0,Survived,Pclass,Age,Pclass_1,Pclass_2,Pclass_3
10,1,1,58.0,1.0,0.0,0.0
11,0,3,20.0,0.0,0.0,1.0
12,0,3,39.0,0.0,0.0,1.0
13,0,3,14.0,0.0,0.0,1.0
14,1,2,55.0,0.0,1.0,0.0


In [40]:
# Row and column count of the merged frame.
df_TFD_extract_preprocess.shape

(714, 6)

In [29]:
# Print dtypes and non-null counts for each column of the merged frame.
df_TFD_extract_preprocess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Age       714 non-null    float64
 3   Pclass_1  714 non-null    float64
 4   Pclass_2  714 non-null    float64
 5   Pclass_3  714 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 33.6 KB


In [30]:
# Predicted class for ten rows of the training split.
# NOTE(review): features_train comes out of train_test_split, which shuffles by
# default, so these are not the same passengers as rows 10-19 of
# df_TFD_extract_preprocess.
model.predict(features_train[10:20])

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [31]:
# Class probabilities for the same ten training rows; one row per sample, one
# column per class.
model.predict_proba(features_train[10:20])

array([[0.59715375, 0.40284625],
       [0.63564456, 0.36435544],
       [0.75219168, 0.24780832],
       [0.47500723, 0.52499277],
       [0.73984819, 0.26015181],
       [0.80785242, 0.19214758],
       [0.75821419, 0.24178581],
       [0.72711291, 0.27288709],
       [0.70050665, 0.29949335],
       [0.58929257, 0.41070743]])

#### 평가

In [32]:
# Predict on the rows the model was trained on, to measure the training score.
target_train_predict = model.predict(features_train)
target_train_predict.shape # same as target_train.shape

(535,)

In [33]:
from sklearn.metrics import accuracy_score # 정확도

In [34]:
# Accuracy on the training split.
accuracy_score(target_train,target_train_predict) # like an in-school practice exam: scored on data the model has already seen

0.708411214953271

In [35]:
# Predict on the held-out rows that were not used for fitting the model.
target_test_predict = model.predict(features_test)
target_test_predict.shape # same as target_test.shape

(179,)

In [36]:
# Accuracy on the held-out split; compare with the training accuracy to gauge
# how well the model generalizes.
accuracy_score(target_test, target_test_predict)

0.6480446927374302

In [37]:
from sklearn.metrics import classification_report

In [38]:
# Per-class precision / recall / F1 on the training split. print() is needed
# because classification_report returns a preformatted multi-line string.
print(classification_report(target_train,target_train_predict))

              precision    recall  f1-score   support

           0       0.72      0.82      0.77       312
           1       0.69      0.55      0.61       223

    accuracy                           0.71       535
   macro avg       0.70      0.69      0.69       535
weighted avg       0.71      0.71      0.70       535



In [39]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(target_test,target_test_predict))

              precision    recall  f1-score   support

           0       0.72      0.71      0.71       112
           1       0.53      0.55      0.54        67

    accuracy                           0.65       179
   macro avg       0.63      0.63      0.63       179
weighted avg       0.65      0.65      0.65       179



#### 서비스

In [42]:
# Pick a few raw rows (positions 100-102) to use as example service inputs.
df_TFD_extract.iloc[100:103]

Unnamed: 0,Survived,Pclass,Age
100,0,3,28.0
101,0,3,
102,0,1,21.0


In [45]:
# One-hot encode the passenger class of the example passenger (Pclass = 3).
# The value is passed as a one-column DataFrame named 'Pclass', matching how
# the encoder was fitted, so scikit-learn does not warn about missing feature
# names as it does for a bare nested list.
encoder_pclass_ = oneHotEncoder.transform(pd.DataFrame({'Pclass': [3]})).toarray()
# Show both the (1, 3) array and its flattened 1-D form used to build the input row.
encoder_pclass_, encoder_pclass_.flatten()



(array([[0., 0., 1.]]), array([0., 0., 1.]))

In [46]:
# Build one flat input row: the age (28.0) followed by the three one-hot
# Pclass values, i.e. [28.0, 0., 0., 1.].
import numpy as np
inputs_data = np.hstack(([28.0], encoder_pclass_.ravel()))
inputs_data

array([28.,  0.,  0.,  1.])

In [48]:
# The model was trained on MinMax-scaled features, so the raw input row must go
# through the same fitted scaler before prediction. Feeding the unscaled age
# (28.0) would be read as a value far outside the range seen during training.
model.predict(minMaxScaler.transform([inputs_data]))  # get the target variable (predicted Survived)

array([0], dtype=int64)