In [3]:
# Import 需要的套件
import os
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### 之前做過的處理

In [4]:
# 設定 data_path
dir_data = './data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories (Erwin: 找 Classification 的欄位, 也就是 unique 小於等於 2 的)
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            # Erwin: 
            #     fit: Fit label encoder, 產生 le.classes_ (也就是 label, categories, ... etc.)
            le.fit(app_train[col])
            # Transform both training and testing data
            # Erwin
            #    transform() 就是把 column 重新編碼過, 變成 [0,1,1,2,0,..., etc..]
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Create an anomalous flag column
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# also apply to testing dataset
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)

# absolute the value of DAYS_BIRTH
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])


### 做好前處理
開始擬合模型之前，我們要確保 training & testing data 的欄位數量一致，原因是因為 One hot encoding 會製造多的欄位，有些類別出現在 training data 而沒有出現 testing data 中，我們就要把這些多餘的欄位去除

In [5]:
train_labels = app_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Add the target back in
app_train['TARGET'] = train_labels

In [6]:
from sklearn.preprocessing import MinMaxScaler, Imputer

In [7]:
app_train.describe()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.095213,0.340108,0.693673,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,...,0.03009,0.007466,0.005785,0.005284,0.214757,0.210773,0.017437,0.518446,0.00757,0.080729
std,102790.175348,0.293509,0.473746,0.460968,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,...,0.170835,0.086085,0.07584,0.072501,0.410654,0.407858,0.130892,0.49966,0.086679,0.272419
min,100002.0,0.0,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,1.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,367142.5,0.0,1.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,456255.0,1.0,1.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Drop the target from the training data
if 'TARGET' in app_train:
    train = app_train.drop(labels = ['TARGET'], axis=1)
else:
    train = app_train.copy()
    
train.describe()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.095213,0.340108,0.693673,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,...,0.003941,0.03009,0.007466,0.005785,0.005284,0.214757,0.210773,0.017437,0.518446,0.00757
std,102790.175348,0.293509,0.473746,0.460968,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,...,0.062656,0.170835,0.086085,0.07584,0.072501,0.410654,0.407858,0.130892,0.49966,0.086679
min,100002.0,0.0,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,1.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,367142.5,0.0,1.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,456255.0,1.0,1.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Feature names
features = list(train.columns)

features

['SK_ID_CURR',
 'NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NON

In [16]:
len(features)

240

In [17]:
# Copy of the testing data
test = app_test.copy()

In [18]:
# Median imputation of missing values
#    In statistics, imputation is the process of replacing missing data with substituted values.
'''
strategy：替换策略，字符串，默认用均值‘mean’替换
①若为mean时，用特征列的均值替换
②若为median时，用特征列的中位数替换
③若为most_frequent时，用特征列的众数替换
'''
imputer = Imputer(strategy = 'median')



In [19]:
# Scale each feature to 0-1
# MinMaxScaler
#    Transforms features by scaling each feature to a given range.
#    This estimator scales and translates each feature individually such that it is 
#    in the given range on the training set, e.g. between zero and one.
#
#    将属性缩放到一个指定的最大和最小值（通常是1-0）之间，这可以通过preprocessing.MinMaxScaler类实现。
#    使用这种方法的目的包括：
#    1、对于方差非常小的属性可以增强其稳定性。
#    2、维持稀疏矩阵中为0的条目。
scaler = MinMaxScaler(feature_range = (0, 1))

-------------

## 心得：

Label Encoder: 把 Categories 的欄位，轉化成數字

MinMax Scaler: 把 量化數字的欄位，轉化成 0~1 之間的數字

-----------

In [20]:
# Fit on the training data
imputer.fit(train)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [21]:
train.describe()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.095213,0.340108,0.693673,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,...,0.003941,0.03009,0.007466,0.005785,0.005284,0.214757,0.210773,0.017437,0.518446,0.00757
std,102790.175348,0.293509,0.473746,0.460968,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,...,0.062656,0.170835,0.086085,0.07584,0.072501,0.410654,0.407858,0.130892,0.49966,0.086679
min,100002.0,0.0,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,1.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,367142.5,0.0,1.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,456255.0,1.0,1.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(app_test)

In [27]:
# Now train / test have become to ndarray object which can not use train.describe()
print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 240)
Testing data shape:  (48744, 240)


In [28]:
# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 240)
Testing data shape:  (48744, 240)


In [34]:
# Now train is an array with (minmax) normalized data
train[0:2]

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 1.51186991e-03, 9.02865169e-02, 9.00315900e-02,
        7.74410774e-02, 2.56321139e-01, 1.11161218e-01, 9.64437249e-01,
        8.52140078e-01, 7.05432819e-01, 9.89010989e-02, 1.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 5.00000000e-01, 5.00000000e-01,
        4.34782609e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.22150150e-02,
        3.07542268e-01, 1.55054452e-01, 2.47000000e-02, 3.69000000e-02,
        9.72200000e-01, 6.19200000e-01, 1.43000000e-02, 0.00000000e+00,
        6.90000000e-02, 8.33000000e-02, 1.25000000e-01, 3.69000000e-02,
        2.02000000e-02, 1.90000000e-02, 0.00000000e+00, 0.00000000e+00,
        2.52000000e-02, 3.83000000e-02, 9.72200000e-01, 6.34100000e-01,
        1.44000000e-02, 0.00000000e+00, 6.90000000e-02, 8.330000

### Fit the model

In [35]:
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
#   Erwin: C=0.0001 means ?
log_reg = LogisticRegression(C = 0.0001)

# Train on the training data
#   Notes:  train_labels = app_train['TARGET'], so train_labels=[0,1,...]
log_reg.fit(train, train_labels)



LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

-------

#### penalty : str, ‘l1’ or ‘l2’, default: ‘l2’
Used to specify the norm used in the penalization. The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties.

New in version 0.19: l1 penalty with SAGA solver (allowing ‘multinomial’ + L1)

#### dual : bool, default: False
#### Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.

#### tol : float, default: 1e-4
Tolerance for stopping criteria.

#### C : float, default: 1.0
Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

#### fit_intercept : bool, default: True
Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.

#### intercept_scaling : float, default 1.
Useful only when the solver ‘liblinear’ is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight.

Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.

#### class_weight : dict or ‘balanced’, default: None
Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.

The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).

Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.

New in version 0.17: class_weight=’balanced’

#### random_state : int, RandomState instance or None, optional, default: None
The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. Used when solver == ‘sag’ or ‘liblinear’.

#### solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’.
Algorithm to use in the optimization problem.

For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.
Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing.

New in version 0.17: Stochastic Average Gradient descent solver.

New in version 0.19: SAGA solver.

Changed in version 0.20: Default will change from ‘liblinear’ to ‘lbfgs’ in 0.22.

#### max_iter : int, default: 100
Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.

#### multi_class : str, {‘ovr’, ‘multinomial’, ‘auto’}, default: ‘ovr’
If the option chosen is ‘ovr’, then a binary problem is fit for each label. For ‘multinomial’ the loss minimised is the multinomial loss fit across the entire probability distribution, even when the data is binary. ‘multinomial’ is unavailable when solver=’liblinear’. ‘auto’ selects ‘ovr’ if the data is binary, or if solver=’liblinear’, and otherwise selects ‘multinomial’.

New in version 0.18: Stochastic Average Gradient descent solver for ‘multinomial’ case.

Changed in version 0.20: Default will change from ‘ovr’ to ‘auto’ in 0.22.

#### verbose : int, default: 0
For the liblinear and lbfgs solvers set verbose to any positive number for verbosity.

#### warm_start : bool, default: False
When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See the Glossary.

New in version 0.17: warm_start to support lbfgs, newton-cg, sag, saga solvers.

#### n_jobs : int or None, optional (default=None)
Number of CPU cores used when parallelizing over classes if multi_class=’ovr’”. This parameter is ignored when the solver is set to ‘liblinear’ regardless of whether ‘multi_class’ is specified or not. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.

------

模型 fit 好以後，就可以用來預測 testing data 中的客戶違約遲繳貸款的機率咯! (記得要用 predict_proba 才會輸出機率)

In [36]:
# Make predictions
# Make sure to select the second column only
log_reg_pred = log_reg.predict_proba(test)[:, 1]

In [38]:
print(f'type(log_reg_pred)={type(log_reg_pred)}')

type(log_reg_pred)=<class 'numpy.ndarray'>


In [39]:
log_reg_pred[:5]

array([0.06505115, 0.12640086, 0.08123883, 0.06150936, 0.12830764])

### 儲存預測結果

In [40]:
# Submission dataframe
submit = app_test[['SK_ID_CURR']]

submit.describe()

Unnamed: 0,SK_ID_CURR
count,48744.0
mean,277796.67635
std,103169.547296
min,100001.0
25%,188557.75
50%,277549.0
75%,367555.5
max,456250.0


In [41]:
submit['TARGET'] = log_reg_pred

submit.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.065051
1,100005,0.126401
2,100013,0.081239
3,100028,0.061509
4,100038,0.128308


## 練習時間
將你的結果存成 csv, 上傳你的第一份 Kaggle 成績

Hints: https://stackoverflow.com/questions/16923281/pandas-writing-dataframe-to-csv-file