In [7]:
# Import 需要的套件
import os
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# 之前做過的處理

In [8]:
# 設定 data_path
dir_data = './Data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
#label encoder是一個可以用來將標籤規範化的工具類，它可以將標籤的編碼值範圍限定在[0,n_classes-1]。
le = LabelEncoder() 
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])  #LabelEncoder=le
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col]) #將類別變數轉換成數字
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Create an anomalous flag column
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# also apply to testing dataset
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)

# absolute the value of DAYS_BIRTH
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

# 做好前處理

開始擬合模型之前，我們要確保 training & testing data 的欄位數量一致，原因是因為 One hot encoding 會製造多的欄位，有些類別出現在 training data 而沒有出現 testing data 中，我們就要把這些多餘的欄位去除

In [14]:
train_labels = app_train['TARGET']

# 將訓練&測試集資料對齊(align)，讓這兩個資料集的COLUMN數一樣
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# 確認完一致後，再把目標值放回去
app_train['TARGET'] = train_labels

In [17]:
from sklearn.preprocessing import MinMaxScaler, Imputer

# 先把目標變數從訓練集拿掉->令為train資料集
if 'TARGET' in app_train:
    train = app_train.drop(labels = ['TARGET'], axis=1)
else:
    train = app_train.copy()
    
# Feature names
features = list(train.columns)

In [20]:
#看所有變數名稱 
features = list(train.columns)
#features

In [24]:
# 複製，測試集->令為test資料集
test = app_test.copy()

# 針對NA值，以中位數取代，所以之後的imputer.fit就是把這個模式套用在該資料上
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))
#MinMaxScaler:將屬性縮放到一個範圍值，此為0~1
#目的為: 1. 對於方差非常小的屬性可以增強穩定性 2.維持稀疏矩陣(Sparse matrix)中為0的項目

# 將train資料集的NA值用中位數取代
imputer.fit(train)

# 將TRAIN & TEST做NA轉換
train = imputer.transform(train)
test = imputer.transform(app_test)

# 將TRAIN & TEST做特徵縮放
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 240)
Testing data shape:  (48744, 240)


# Fit the model

In [25]:
from sklearn.linear_model import LogisticRegression

# 把模型的規則在這邊調整好，之後直接log_reg.fit套用
log_reg = LogisticRegression(C = 0.0001)

# Train 訓練集
log_reg.fit(train, train_labels)

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

模型 fit 好以後，就可以用來預測 testing data 中的客戶違約遲繳貸款的機率! (記得要用 predict_proba 才會輸出機率)

In [26]:
# 預測~
# Make sure to select the second column only
log_reg_pred = log_reg.predict_proba(test)[:, 1]  #[:, 1]表只取出我們要的機率值

# 儲存預測結果

In [28]:
# Submission dataframe
submit = app_test[['SK_ID_CURR']] #先挑選SK_ID_CURR為submit的第一行變數
submit['TARGET'] = log_reg_pred   #再將結果放在第二行變數

submit.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.065051
1,100005,0.126401
2,100013,0.081239
3,100028,0.061509
4,100038,0.128308


In [31]:
submit.to_csv('Day_16_result.csv',index=False) #輸出成csv