In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split


In [2]:
# Mengakses data menggunakan pandas
filepath = '/content/drive/MyDrive/Rakamin/application_train.csv'
df1 = pd.read_csv(filepath)
filepath = '/content/drive/MyDrive/Rakamin/application_test.csv'
df2 = pd.read_csv(filepath)

In [None]:
df1.shape

(307511, 122)

In [3]:
# Mengubah Categorical Data menjadi Numeric Data
cat_columns = df1.select_dtypes(['object']).columns
df1[cat_columns] = df1[cat_columns].apply(lambda x: pd.factorize(x)[0])

In [4]:
# Test apakah ngedrop NA lebih baik daripada mengisi dengan Zero
# df.dropna(inplace = True)
df1.fillna(0, inplace = True)

In [None]:
# Mengecek apakah ada nilai negatif di column
listcheck = []
for i in df1.columns:
    if df1[i].min() < 0:
        listcheck.append(i)
listcheck

['NAME_TYPE_SUITE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OCCUPATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE',
 'DAYS_LAST_PHONE_CHANGE']

In [None]:
# Mengeluarkan Kolom yang memiliki nilai negatif dan colomn primary key
X_train = df1.drop(['SK_ID_CURR', 
             'NAME_TYPE_SUITE',
             'DAYS_BIRTH',
             'DAYS_EMPLOYED',
             'DAYS_REGISTRATION',
             'DAYS_ID_PUBLISH',
             'OCCUPATION_TYPE',
             'FONDKAPREMONT_MODE',
             'HOUSETYPE_MODE',
             'WALLSMATERIAL_MODE',
             'DAYS_LAST_PHONE_CHANGE',
             'EMERGENCYSTATE_MODE',
             'DAYS_LAST_PHONE_CHANGE',
             'TARGET'], axis = 1)
y_train = df1['TARGET']

In [None]:
# Menggunakan feaure selection untuk memilih fitur terbaik yang akan digunakan untuk training
bestFeatures = SelectKBest(score_func = chi2, k=len(X_train.columns))
fit = bestFeatures.fit(X_train, y_train)
dfscores = pd.DataFrame([int(x) for x in fit.scores_ if not (math.isnan(x))])
dfcolumns = pd.DataFrame(X_train.columns)
featurescores = pd.concat([dfcolumns, dfscores], axis = 1)
featurescores.columns = ['Attribue', 'Score']
featurescores.sort_values('Score', ascending = False).head(20)

Unnamed: 0,Attribue,Score
8,AMT_GOODS_PRICE,122270879
6,AMT_CREDIT,76699867
5,AMT_INCOME_TOTAL,1624101
7,AMT_ANNUITY,390110
9,NAME_INCOME_TYPE,1360
35,EXT_SOURCE_3,772
30,REG_CITY_NOT_WORK_CITY,615
34,EXT_SOURCE_2,560
29,REG_CITY_NOT_LIVE_CITY,558
33,EXT_SOURCE_1,478


In [None]:
# Melakukan Preprocessing padata Test Dataset
cat_columns2 = df2.select_dtypes(['object']).columns
df2[cat_columns2] = df2[cat_columns2].apply(lambda x: pd.factorize(x)[0])
df2.fillna(0, inplace = True)

test_dataset = df2.drop(['SK_ID_CURR', 
             'NAME_TYPE_SUITE',
             'DAYS_BIRTH',
             'DAYS_EMPLOYED',
             'DAYS_REGISTRATION',
             'DAYS_ID_PUBLISH',
             'OCCUPATION_TYPE',
             'FONDKAPREMONT_MODE',
             'HOUSETYPE_MODE',
             'WALLSMATERIAL_MODE',
             'DAYS_LAST_PHONE_CHANGE',
             'EMERGENCYSTATE_MODE',
             'DAYS_LAST_PHONE_CHANGE'], axis = 1)

In [5]:
# Menyiapkan train data untuk melatih di model.
train_data = df1.filter(['AMT_GOODS_PRICE', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'TARGET'], axis = 1)
train_label = train_data['TARGET']
train_var = train_data.drop(['TARGET'], axis = 1)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(train_var, train_label, test_size = 0.30)

Random Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier


clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_valid)

In [None]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_valid, y_pred))

Accuracy: 0.9034296615864895


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

LogisticRegression()

In [None]:
predictions = logisticRegr.predict(X_valid)

In [None]:
score = logisticRegr.score(X_valid, y_valid)
print(score)

0.9176946257072864


Neural Network

In [7]:
tf.convert_to_tensor(X_train)

<tf.Tensor: shape=(215257, 4), dtype=float64, numpy=
array([[ 922500. , 1178217. ,  144000. ,   34578. ],
       [  85500. ,  103558.5,  180000. ,    6462. ],
       [ 481500. ,  519633. ,  292500. ,   41184. ],
       ...,
       [ 238500. ,  295168.5,  135000. ,   16011. ],
       [ 225000. ,  239850. ,  103500. ,   23719.5],
       [ 202500. ,  202500. ,   67500. ,   10125. ]])>

In [8]:
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(X_train)

In [19]:
tf.convert_to_tensor(X_valid)

<tf.Tensor: shape=(92254, 4), dtype=float64, numpy=
array([[ 450000. ,  521280. ,  180000. ,   19984.5],
       [1354500. , 1569051. ,  315000. ,   49396.5],
       [ 913500. , 1046142. ,  315000. ,   30717. ],
       ...,
       [ 576000. ,  667237.5,  202500. ,   37386. ],
       [ 450000. ,  450000. ,  225000. ,   53536.5],
       [ 589500. ,  822942. ,  202500. ,   26676. ]])>

In [15]:
def get_basic_model():
  model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1)
  ])

  model.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model

In [20]:
model = get_basic_model()
model.fit(X_train, y_train, validation_data = (X_valid, y_valid), epochs=15, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fa6ad6a3250>