In [2]:
import numpy as np
import pandas as pd 

# sklearn utilities: label encoding, polynomial feature expansion, scaling, and missing-value imputation
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, MinMaxScaler
from sklearn.impute import SimpleImputer

# Load the training and testing application tables (read in that order)
# from CSV files located in the current working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)

In [64]:
def poly_features(train, test, degree):
    """Append polynomial/interaction features of four base columns to train and test.

    The base columns are EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3 and DAYS_BIRTH.
    Missing values are filled with the median learned from ``train`` only, the
    polynomial expansion is fitted on ``train`` only, and both are then applied
    to ``test``. Shapes and the absolute correlation of every generated feature
    with TARGET are printed along the way.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the four base columns, 'TARGET' and 'SK_ID_CURR'.
    test : pandas.DataFrame
        Must contain the four base columns and 'SK_ID_CURR'.
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    (train_poly, test_poly) : tuple of pandas.DataFrame
        The inputs left-merged with the generated features on 'SK_ID_CURR',
        then restricted to the columns the two frames have in common.

    Notes
    -----
    * The expansion re-emits the base columns (and a constant '1' column), so
      after the merge pandas disambiguates the clashing names with its default
      '_x' / '_y' suffixes. This is kept as-is so that existing downstream code
      sees the same column names.
    * 'TARGET' exists only on the training side, so the final inner column
      alignment removes it from the returned training frame; callers that
      need labels should keep ``train['TARGET']`` separately.
    """
    base_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

    print('Training shape :', train.shape)
    print('Testing shape :', test.shape)

    poly_target = train['TARGET']
    poly_feat = train[base_cols]
    poly_feat_test = test[base_cols]

    # Median imputation: statistics come from the training rows only.
    imputer = SimpleImputer(strategy = 'median')
    poly_feat = imputer.fit_transform(poly_feat)
    poly_feat_test = imputer.transform(poly_feat_test)

    # Polynomial transformations, fitted on train and re-used for test.
    poly_transformer = PolynomialFeatures(degree = degree)
    poly_feat = poly_transformer.fit_transform(poly_feat)
    poly_feat_test = poly_transformer.transform(poly_feat_test)

    print('\nPolynomial features shapes :', poly_feat.shape)
    print('Polynomial features shapes:' , poly_feat_test.shape)

    # `get_feature_names` was removed from scikit-learn in 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(poly_transformer, 'get_feature_names_out'):
        poly_cols = list(poly_transformer.get_feature_names_out(base_cols))
    else:
        poly_cols = list(poly_transformer.get_feature_names(base_cols))

    poly_feat = pd.DataFrame(poly_feat, columns = poly_cols)

    # Assign by position (.values): the new frame has a fresh RangeIndex, so a
    # label-aligned assignment would produce NaN / shuffled rows whenever
    # `train` does not carry the default 0..n-1 index.
    poly_feat['TARGET'] = poly_target.values

    poly_corr = poly_feat.corr()['TARGET'].abs().sort_values(ascending = False)

    print('\nCorrelation between TARGET and polynomial features')
    print('')
    print(poly_corr)

    # Put test features into dataframe
    poly_feat_test = pd.DataFrame(poly_feat_test, columns = poly_cols)

    # Merge polynomial features into training dataframe
    poly_feat['SK_ID_CURR'] = train['SK_ID_CURR'].values
    train_poly = train.merge(poly_feat, on = 'SK_ID_CURR', how = 'left')

    # Merge polynomial features into testing dataframe
    poly_feat_test['SK_ID_CURR'] = test['SK_ID_CURR'].values
    test_poly = test.merge(poly_feat_test, on = 'SK_ID_CURR', how = 'left')

    # Keep only the columns present in both frames.
    train_poly, test_poly = train_poly.align(test_poly, join = 'inner', axis = 1)

    print('\nTraining polynomial data shapes :', train_poly.shape)
    print('Testing polynomial data shapes :', test_poly.shape)

    return train_poly, test_poly

In [65]:
# Build degree-3 polynomial features for both splits.
train_poly, test_poly = poly_features(train, test, degree = 3)

Training shape : (307511, 122)
Testing shape : (48744, 121)

Polynomial features shapes : (307511, 35)
Polynomial features shapes: (48744, 35)

Correlation between TARGET and polynomial features

TARGET                                    1.000000
EXT_SOURCE_2 EXT_SOURCE_3                 0.193939
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3    0.189605
EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH      0.181283
EXT_SOURCE_2^2 EXT_SOURCE_3               0.176428
EXT_SOURCE_2 EXT_SOURCE_3^2               0.172282
EXT_SOURCE_1 EXT_SOURCE_2                 0.166625
EXT_SOURCE_1 EXT_SOURCE_3                 0.164065
EXT_SOURCE_2                              0.160295
EXT_SOURCE_2 DAYS_BIRTH                   0.156873
EXT_SOURCE_1 EXT_SOURCE_2^2               0.156867
EXT_SOURCE_3                              0.155892
EXT_SOURCE_1 EXT_SOURCE_2 DAYS_BIRTH      0.155891
EXT_SOURCE_1 EXT_SOURCE_3 DAYS_BIRTH      0.151816
EXT_SOURCE_1 EXT_SOURCE_3^2               0.150822
EXT_SOURCE_3 DAYS_BIRTH                

In [66]:
# Show the (rows, columns) of the train and test frames, one per line.
print(train_poly.shape, test_poly.shape, sep = '\n')

(307511, 156)
(48744, 156)
