In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
url="https://storage.googleapis.com/bdt-spark-store/internal_data.csv"
df_ext=pd.read_csv(url)

url2="https://storage.googleapis.com/bdt-spark-store/external_sources.csv"
df_data=pd.read_csv(url2)

In [3]:
df_full = df_data.merge(df_ext, on='SK_ID_CURR', how='inner')
columns_extract = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                  'DAYS_BIRTH', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
                  'DAYS_ID_PUBLISH', 'CODE_GENDER', 'AMT_ANNUITY',
                  'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
                  'ORGANIZATION_TYPE', 'DAYS_LAST_PHONE_CHANGE',
                  'NAME_INCOME_TYPE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE', 'TARGET']
df = df_full[columns_extract]

In [4]:
np.random.RandomState(101)
train, test = np.split(df.sample(frac=1), [int(.8*len(df))])
print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (246008, 18)
Testing Features shape:  (61503, 18)


In [5]:
print(train.TARGET.value_counts()/len(train.index))
print(test.TARGET.value_counts()/len(test.index))

0    0.919503
1    0.080497
Name: TARGET, dtype: float64
0    0.918345
1    0.081655
Name: TARGET, dtype: float64


### One hot Encoding

In [6]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (246008, 88)
Testing Features shape:  (61503, 87)


### Alignment

In [7]:
# Align the training and testing data, keep only columns present in both dataframes
train, test = train.align(test, join = 'inner', axis = 1)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (246008, 87)
Testing Features shape:  (61503, 87)


In [8]:
train_labels = train['TARGET']
test_labels = test['TARGET']

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer

# Drop the target from the training data
if 'TARGET' in train:
    train = train.drop(columns = ['TARGET'])
    test = test.drop(columns = ['TARGET'])
else:
    train = train.copy()
    test = test.copy()
    
# Feature names
features = list(train.columns)

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = StandardScaler()

# Fit on the training data
imputer.fit(train)

# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(test)

scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (246008, 86)
Testing data shape:  (61503, 86)


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, 
                                       random_state = 50, 
                                       verbose = 1, n_jobs = -1)
# Train on the training data
random_forest.fit(train, train_labels)

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data
predictions = random_forest.predict(test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   23.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.9s finished


In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score

print(accuracy_score(test_labels, predictions))

0.9184917808887372


In [37]:
feature_importances.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature,importance
1,EXT_SOURCE_2,0.098773
2,EXT_SOURCE_3,0.08688
3,DAYS_BIRTH,0.076956
5,DAYS_ID_PUBLISH,0.0758
7,DAYS_REGISTRATION,0.075221
6,AMT_ANNUITY,0.071313
10,DAYS_LAST_PHONE_CHANGE,0.067805
4,DAYS_EMPLOYED,0.065869
9,AMT_CREDIT,0.06463
11,AMT_INCOME_TOTAL,0.056505
