<a href="https://colab.research.google.com/github/DamnTT/deep-learning/blob/main/deep_hw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
# 繪圖相關套件
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec
import seaborn as sns
plt.style.use( 'ggplot' ) 
# 標籤編碼(Label)、獨熱編碼(OneHot)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
# confusion matrix
from sklearn import metrics
from IPython.display import display
import warnings
warnings.filterwarnings( 'ignore' )
# Upload the data files through the Colab file picker.
from google.colab import files
uploaded = files.upload()
# Read the three CSV files from the working directory.
# NOTE(review): these fixed names are read regardless of what `uploaded`
# contains — confirm the files on disk are the ones just uploaded.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
compare = pd.read_csv('gender_submission.csv')
# `compare` supplies the reference labels used for scoring further down, while
# `submit` has its 'Survived' column overwritten with predictions. Take a copy
# so that writing predictions can never alter the reference labels.
submit = compare.copy()

Saving gender_submission.csv to gender_submission (3).csv
Saving train.csv to train (3).csv
Saving test.csv to test (3).csv


In [None]:
# Column types
# Helper that reports the dtype of every column.
def Col_Types( Data ):
    """Return a table of column names and dtypes, ordered by dtype.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'ColumnName' and 'Type', one row per column of ``Data``.
        The row labels are those assigned before sorting.
    """
    dtype_table = Data.dtypes.to_frame().reset_index()
    dtype_table.columns = ['ColumnName', 'Type']
    return dtype_table.sort_values( by='Type' )

# Missing values
# Helper that tallies the missing values of every column.
def Missing_Counts( Data ) :
    """Summarise the columns of ``Data`` that contain missing values.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one missing value, ordered by
        ascending count, with columns 'ColumnName', 'MissingCount' and
        'Percentage(%)' (share of rows missing, rounded to two decimals).
    """
    total_rows = Data.shape[0]

    def _as_percent( count ):
        # Share of all rows, expressed in percent.
        return round( count / total_rows * 100, 2 )

    null_totals = Data.isnull().sum()
    null_totals = null_totals[ null_totals > 0 ].sort_values()

    Missing_Count = pd.DataFrame( { 'ColumnName': null_totals.index,
                                    'MissingCount': null_totals.values } )
    Missing_Count[ 'Percentage(%)' ] = Missing_Count[ 'MissingCount' ].apply( _as_percent )
    return Missing_Count

# Exploratory analysis
# Stack the training and test sets into one frame so feature engineering is
# applied to both at once. DataFrame.append no longer exists in pandas 2.x;
# pd.concat produces the same stacked frame (original row labels are kept).
df_data  = pd.concat( [ df_train, df_test ] )
# Second handle on the reference labels used later for scoring.
f_data = compare

# Number of passengers per value of 'Survived'
Survived_Counts = df_data['Survived'].value_counts().reset_index()
Survived_Counts.columns = ['Survived','Counts']


# Correlation between Survived and the other columns.
# Only numeric columns are correlated; recent pandas raises if text columns
# are passed to corr(), so they are filtered out explicitly.
Corr_Matrix = df_train.select_dtypes( include='number' ).corr()
# After the ascending sort the last entry is Survived's correlation with
# itself, so it is dropped.
Corr = Corr_Matrix.loc['Survived',:].sort_values()[:-1]
Corr = pd.DataFrame({ 'Survived':Corr })

# Survival rate per category
# Sex, Pclass (ticket class), Embarked (port of embarkation),
# SibSp (siblings/spouses aboard), Parch (parents/children aboard)
selected_cols = ['Sex','Pclass','Embarked','SibSp','Parch']

for col in selected_cols:
    l = ['Survived', col]
    # Mean of Survived within each category, rounded to four decimals.
    grouped_mean = df_data[l].groupby(by=col).mean().round(4)
    Survival_Rate = grouped_mean.reset_index()
    Survival_Rate.columns = [col,'Survival Rate(%)']
    # Convert the fraction to a percentage.
    Survival_Rate['Survival Rate(%)'] = Survival_Rate['Survival Rate(%)'] * 100
    # display( Survival_Rate )   

# New feature: family size (Family_Size) = SibSp + Parch + the passenger
df_data['Family_Size'] = df_data['SibSp'] + df_data['Parch'] + 1

# Survival rate (%) per family size. The string alias 'mean' is used because
# passing the np.mean callable to agg() is deprecated in recent pandas.
Survival_Rate = df_data[['Family_Size','Survived']].groupby(by=['Family_Size']).agg('mean')*100
Survival_Rate.columns = ['Survival Rate(%)']
Survival_Rate = Survival_Rate.reset_index()

# Bucket Family_Size into the coded feature Family_Class.
# Because of the "+ 1" above, Family_Size is at least 1 for non-negative
# SibSp/Parch, so the former `Family_Size == 0` rule could never match and
# has been removed.
# NOTE(review): that dead rule suggests the boundaries below may have been
# chosen for a family size that did not count the passenger — confirm the
# intended bucket edges.
df_data[ 'Family_Class' ] = np.nan
df_data.loc[ (df_data.Family_Size>=1) & (df_data.Family_Size<=3), 'Family_Class' ] = 3
df_data.loc[ (df_data.Family_Size>=4) & (df_data.Family_Size<=6), 'Family_Class' ] = 2
df_data.loc[ (df_data.Family_Size>=7), 'Family_Class' ] = 1

# Sex & Pclass: survival rate (%) per combination
Survival_Rate = df_data[['Sex','Pclass','Survived']].groupby(by=['Sex','Pclass']).agg('mean')*100
Survival_Rate.columns = ['Survival Rate(%)']
Survival_Rate = Survival_Rate.reset_index()

# Coded feature for each (Sex, Pclass) combination; rows matching none of the
# combinations keep NaN.
sex_pclass_codes = {
    ('female', 1): 2,
    ('female', 2): 3,
    ('female', 3): 3,
    ('male', 1): 1,
    ('male', 2): 1,
    ('male', 3): 2,
}
df_data[ 'Sex_Pclass' ] = np.nan
for (sex, pclass), code in sex_pclass_codes.items():
    df_data.loc[ (df_data.Sex==sex) & (df_data.Pclass==pclass), 'Sex_Pclass' ] = code

# Name -> Title
import re

# The title is the run of letters that follows a space and is terminated by a
# period. A raw string keeps "\." from being read as an (invalid) string
# escape, and the capture group yields the title without the surrounding
# space and period.
regex = re.compile( r' ([A-Za-z]+)\.' )
df_data['Title'] = df_data.Name.map( lambda x:regex.search(x).group(1) )

# Collapse the listed titles into 'Rare' and fold spelling variants into
# 'Miss' / 'Mrs'.
rare_titles = ['Don','Rev','Dr','Major','Lady','Sir','Col','Capt','Countess','Jonkheer','Dona']
df_data['Title'] = df_data.Title.replace( rare_titles, 'Rare' )
df_data['Title'] = df_data.Title.replace( ['Ms','Mlle'], 'Miss' )
df_data['Title'] = df_data.Title.replace( 'Mme', 'Mrs' )
# The raw Name column is no longer needed.
df_data.drop( 'Name', axis=1, inplace=True )


# Ticket number -> Ticket_info
def _ticket_prefix( ticket ):
    """Return 'X' for an all-digit ticket, else its first token with '.' and '/' removed."""
    if ticket.isdigit():
        return 'X'
    return ticket.replace('.','').replace('/','').strip().split(' ')[0]


df_data['Ticket_info'] = df_data.Ticket.apply( _ticket_prefix )
# The raw Ticket column is no longer needed.
df_data.drop( 'Ticket', axis=1, inplace=True )
# Fill in missing values
# The results are assigned back to the column: calling
# fillna(..., inplace=True) on a selected column is chained assignment and
# does not update the frame under pandas Copy-on-Write.

# Fare: fill with the median fare.
df_data['Fare'] = df_data['Fare'].fillna( df_data.Fare.median() )
# Embarked: fill with 'S'.
# display( df_data['Embarked'].value_counts() )
df_data['Embarked'] = df_data['Embarked'].fillna( 'S' )

# Age
# Mean and median age per Title
Age_Mean = df_data[['Title','Age']].groupby( by=['Title'] ).mean()
Age_Median = df_data[['Title','Age']].groupby( by=['Title'] ).median()
Age_Mean.columns = ['Age Mean']
Age_Median.columns = ['Age Median']
Age_Mean.reset_index( inplace=True )
Age_Median.reset_index( inplace=True )

# Fill each missing Age with the mean age of the passenger's Title. The mean
# is looked up by Title value, so the fill does not depend on the row order
# of Age_Mean or on a fixed list of titles.
title_age_mean = Age_Mean.set_index( 'Title' )['Age Mean']
age_missing = df_data.Age.isnull()
# .values: assign positionally to the masked rows instead of aligning on the
# row labels.
df_data.loc[ age_missing, 'Age' ] = df_data.loc[ age_missing, 'Title' ].map( title_age_mean ).values

# Flag passengers younger than 17 (yes: 1, no: 0)
df_data[ 'is_Age_17' ] = (df_data.Age<17)*1

# Cabin: keep only the first character; missing values become 'NoCabin'.
df_data['Cabin'] = df_data['Cabin'].apply( lambda x:str(x)[0] if not pd.isnull(x) else 'NoCabin' )

# Log-transform Fare (log1p, i.e. log(1 + Fare))
df_data['LogFare'] = np.log1p( df_data.Fare )

# Percentiles 0..100 of Fare, computed in a single call
P_all = list( np.percentile( df_data.Fare, q=np.arange(0,101) ) )
Pth_Percentile = pd.DataFrame( { 'Q':list(range(101)), 'Value':P_all } )

# First, second and third quartile (25th, 50th and 75th percentile)
Q1 = Pth_Percentile.iloc[ 25, 1 ]
Q2 = Pth_Percentile.iloc[ 50, 1 ]
Q3 = Pth_Percentile.iloc[ 75, 1 ]
IQR = Q3 - Q1

# Group Fare by quartile, with an extra edge at Q3 + 1.5*IQR.
# include_lowest=True closes the first interval on the left, so a fare equal
# to the lowest edge (0) is assigned to a group instead of becoming NaN.
# NOTE(review): pd.cut needs strictly increasing edges, so this presumes
# distinct quartiles and a maximum fare above Q3 + 1.5*IQR.
Fare_bin = [ 0, Q1, Q2, Q3, Q3+1.5*IQR, df_data.Fare.max() ]
df_data[ 'Fare_Group' ] = pd.cut( df_data.Fare.values, Fare_bin, include_lowest=True )

# Number of rows in each group, ordered by group
Group_Counts = df_data[ 'Fare_Group' ].value_counts().reset_index()    
Group_Counts.columns = [ 'Fare_Group', 'Counts' ]
Group_Counts = Group_Counts.sort_values( by='Fare_Group' )

# Drop both Fare and Fare_Group; LogFare is the fare feature that remains.
df_data.drop( ['Fare','Fare_Group'], axis=1, inplace=True )

# One-hot encoding of Embarked
OneHot_Embarked = pd.get_dummies( df_data['Embarked'], prefix='Embarked' )

# Append the indicator columns and remove the original Embarked column
df_data = pd.concat( [ df_data, OneHot_Embarked ], axis=1 )
df_data = df_data.drop( columns='Embarked' )

# Label encoding of Sex
Sex_mapping = { 'male':0, 'female':1 }
df_data[ 'Sex' ] = df_data['Sex'].map( Sex_mapping )

# Replace the remaining text columns with their integer category codes
# print( f'Shape of data after feature engineering = {df_data.shape}' )
for col in ['Title','Ticket_info','Cabin']:
    as_category = df_data[col].astype('category')
    df_data[col] = as_category.cat.codes

In [None]:
# Split the combined frame back into training and test sets:
# rows with a Survived value form the training set, rows without one the
# test set.
has_label = pd.notnull( df_data.Survived )

# Training set drops PassengerId; test set drops PassengerId and Survived.
# drop() returns new frames, so nothing is modified in place on a slice of
# df_data.
Train = df_data[ has_label ].drop( ['PassengerId'], axis=1 )
Test = df_data[ ~has_label ].drop( ['PassengerId','Survived'], axis=1 )

# Dictionary of predictions, keyed by classifier name
classifier_dict = dict() 

# Separate the label column Survived from the training features
Y_Train = Train.Survived
X_Train = Train.drop( ['Survived'], axis=1 )

# Reference labels for the test rows as a float array, sized from the data
# rather than from a hard-coded row count.
ftest = np.asarray( f_data.Survived, dtype=float )

# Report accuracy, precision, recall, F1 score and the confusion matrix
def deep_classified(Test_pred, Classifier):
  """Print evaluation metrics for one classifier's test-set predictions.

  The predictions are compared against the module-level ``ftest`` labels.

  Parameters
  ----------
  Test_pred : array-like
      Predicted labels (0 or 1), in the same order as ``ftest``.
  Classifier : str
      Name shown in the report header.

  Returns
  -------
  None
      The report is printed. A ratio whose denominator is zero is reported
      as 0.0 (accuracy as NaN when there are no samples).
  """
  # labels=[0, 1] pins the matrix to 2x2 even when one class never occurs,
  # so the indexing below is always valid.
  confusion = metrics.confusion_matrix(ftest, Test_pred, labels=[0, 1])
  TP = int(confusion[1, 1])
  TN = int(confusion[0, 0])
  FP = int(confusion[0, 1])
  FN = int(confusion[1, 0])
  total = TP + TN + FP + FN

  # Accuracy: share of correct predictions. (The previous formula,
  # (FP + FN) / total, is the misclassification rate.)
  accuracy = (TP + TN) / total if total else float("nan")
  # Precision: when a positive value is predicted, how often is it correct?
  precision = TP / (TP + FP) if (TP + FP) else 0.0
  # Recall: when the actual value is positive, how often is it predicted?
  recall = TP / (TP + FN) if (TP + FN) else 0.0
  # F1 score: harmonic mean of precision and recall.
  if precision + recall:
    f1_score = 2 * (precision * recall) / (precision + recall)
  else:
    f1_score = 0.0

  print("This is the ",Classifier,"classifier`:")
  print("Accuracy: ", accuracy)
  print("Precision：", precision)
  print("Recall：", recall)
  print("F1 Score：", f1_score)
  print("Confusion matrix：","\n",confusion)
  print("\n")

# Write the submission file
def submit_file(Test_pred, Classifier):
  """Store predictions in the module-level ``submit`` frame and write it to CSV.

  Parameters
  ----------
  Test_pred : numpy.ndarray
      Predicted labels; cast to int before being stored in 'Survived'.
  Classifier : str
      Classifier name. Currently unused.
      NOTE(review): the output file name is fixed, so every call overwrites
      'Titanic_RandomForest.csv' — confirm whether it should vary by classifier.

  Returns
  -------
  pandas.DataFrame
      The updated ``submit`` frame, so a notebook cell ending with this call
      shows the result. (A bare ``submit`` expression inside the function
      displayed nothing.)
  """
  submit['Survived'] = Test_pred.astype(int)
  submit.to_csv( 'Titanic_RandomForest.csv', index=False )
  # print(Classifier,"classifier" + f'預測結果：')
  print( '預測結果：' )
  return submit

In [None]:
# Estimator imports for this cell, gathered first so every class is defined
# before it is instantiated.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Feature subset fed to every model. Defined before the first fit so the cell
# runs top to bottom in a fresh session.
SelectedFeatures = ['Age','Sex','LogFare','Title','Pclass','Sex_Pclass']

# Logistic regression
LC = LogisticRegression()
LC.fit(X_Train[SelectedFeatures],Y_Train)
Test_pred_LC = LC.predict( Test[SelectedFeatures] )
classifier_dict["Logistic"] = Test_pred_LC  # store in the dictionary

# Gaussian Naive Bayes
NBC = GaussianNB()
NBC.fit(X_Train[SelectedFeatures], Y_Train)
Test_pred_NBC = NBC.predict( Test[SelectedFeatures] )
classifier_dict["Naïve Bayes"] = Test_pred_NBC  # store in the dictionary

# Decision tree. This entry previously instantiated RandomForestClassifier,
# so the "Decision Tree" results were really a second random forest.
DTC = DecisionTreeClassifier( )
DTC.fit( X_Train[SelectedFeatures], Y_Train )
Test_pred_DTC = DTC.predict( Test[SelectedFeatures] )
classifier_dict["Decision Tree"] = Test_pred_DTC  # store in the dictionary

# Random forest parameters:
# n_estimators:      number of trees.
# min_samples_split: minimum number of samples required to split a node.
# min_samples_leaf:  minimum number of samples required at a leaf.
# oob_score:         also score the model on out-of-bag samples.
# random_state:      seed, for reproducible forests.
# n_jobs:            -1 uses all available cores.
rf_params = dict( n_estimators = 1000,
                  min_samples_split = 20,
                  min_samples_leaf = 1,
                  oob_score = True,
                  random_state = 1,
                  n_jobs = -1 )
# RFC is configured but not fitted in this cell; RFC_2 is the forest that is
# trained on the selected features.
RFC = RandomForestClassifier( **rf_params )
RFC_2 = RandomForestClassifier( **rf_params )
RFC_2.fit( X_Train[SelectedFeatures], Y_Train )

# Predict the test set
Test_pred_RFC = RFC_2.predict( Test[SelectedFeatures] )
classifier_dict["Random Forest"] = Test_pred_RFC  # store in the dictionary

# XGBoost
XGBC = XGBClassifier(n_estimators=100, learning_rate= 0.3)
XGBC.fit(X_Train[SelectedFeatures], Y_Train)  # train on the training set
Test_pred_XGBC = XGBC.predict(Test[SelectedFeatures])  # predict the test set
classifier_dict["XGBoost"] = Test_pred_XGBC

# Multi-layer perceptron
MLPC = MLPClassifier(random_state=1, max_iter=400)
MLPC.fit(X_Train[SelectedFeatures],Y_Train)
Test_pred_MLPC = MLPC.predict(Test[SelectedFeatures])
classifier_dict["MLP"] = Test_pred_MLPC

# Print the evaluation report for every classifier
for classifier_name, predictions in classifier_dict.items():
  deep_classified(predictions, classifier_name)




This is the  Logistic classifier`:
Accuracy:  0.09569377990430622
Precision： 0.8589743589743589
Recall： 0.881578947368421
F1 Score： 0.8701298701298701
Confusion matrix： 
 [[244  22]
 [ 18 134]]


This is the  Naïve Bayes classifier`:
Accuracy:  0.07894736842105263
Precision： 0.8216216216216217
Recall： 1.0
F1 Score： 0.9020771513353116
Confusion matrix： 
 [[233  33]
 [  0 152]]


This is the  Decision Tree classifier`:
Accuracy:  0.16985645933014354
Precision： 0.7515527950310559
Recall： 0.7960526315789473
F1 Score： 0.7731629392971247
Confusion matrix： 
 [[226  40]
 [ 31 121]]


This is the  Random Forest classifier`:
Accuracy:  0.11004784688995216
Precision： 0.8680555555555556
Recall： 0.8223684210526315
F1 Score： 0.8445945945945945
Confusion matrix： 
 [[247  19]
 [ 27 125]]


This is the  XGBoost classifier`:
Accuracy:  0.1339712918660287
Precision： 0.8157894736842105
Recall： 0.8157894736842105
F1 Score： 0.8157894736842104
Confusion matrix： 
 [[238  28]
 [ 28 124]]


This is the  MLP cla