In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import statsmodels.formula.api as sm
from sklearn.linear_model import LogisticRegression
import warnings

print(os.listdir("../input"))

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')

sns.set(font_scale=1)

pd.set_option('display.max_columns', None)

['test.csv', 'train.csv', 'sample_submission.csv']


In [2]:
random_state = 635
np.random.seed(random_state)
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

In [3]:
train_X = df_train.drop(['ID_code', 'target'], axis = 1)
test_X = df_test.drop(['ID_code'], axis = 1)

In [4]:
# Scaling
scaler = StandardScaler()
train_X_scaled = pd.DataFrame(scaler.fit_transform(train_X),columns = train_X.columns)
test_X_scaled = pd.DataFrame(scaler.fit_transform(test_X),columns = test_X.columns)

In [5]:
#df_train_target = pd.DataFrame(df_train['target'], columns = ['target'])
#train_X_scaled_y = df_train_target.merge(train_X_scaled, left_index = True, right_index = True)

In [6]:
#train_X_scaled.describe().applymap('{:,.2f}'.format)

In [7]:
# PCA
pca = PCA()  
factors_train = pca.fit_transform(train_X_scaled) 
factors_test = pca.transform(test_X_scaled)

In [8]:
# replace 200 vars with PCA features
pca_columns_name = ["pca_" + str(col) for col in range(0, 200)]
factors_train = pd.DataFrame(factors_train, columns = pca_columns_name)
factors_test = pd.DataFrame(factors_test, columns = pca_columns_name)

train_pca = df_train.merge(factors_train, left_index = True, right_index = True)
test_pca = df_test.merge(factors_test, left_index = True, right_index = True)

train_pca_only = train_pca.drop(train_X.columns, axis = 1)
test_pca_only = test_pca.drop(test_X.columns, axis = 1)

In [9]:
# STATSMODELS LOGIT VERSION
# Logit regression to check wich of the factors are insignificant for the target
formula = 'target ~ '
for col in train_pca_only.drop(['ID_code', 'target'], axis = 1).columns:
    if( col == 'pca_0'):
        formula += str(col)
    else:
        formula += "+" + str(col)
formula

'target ~ pca_0+pca_1+pca_2+pca_3+pca_4+pca_5+pca_6+pca_7+pca_8+pca_9+pca_10+pca_11+pca_12+pca_13+pca_14+pca_15+pca_16+pca_17+pca_18+pca_19+pca_20+pca_21+pca_22+pca_23+pca_24+pca_25+pca_26+pca_27+pca_28+pca_29+pca_30+pca_31+pca_32+pca_33+pca_34+pca_35+pca_36+pca_37+pca_38+pca_39+pca_40+pca_41+pca_42+pca_43+pca_44+pca_45+pca_46+pca_47+pca_48+pca_49+pca_50+pca_51+pca_52+pca_53+pca_54+pca_55+pca_56+pca_57+pca_58+pca_59+pca_60+pca_61+pca_62+pca_63+pca_64+pca_65+pca_66+pca_67+pca_68+pca_69+pca_70+pca_71+pca_72+pca_73+pca_74+pca_75+pca_76+pca_77+pca_78+pca_79+pca_80+pca_81+pca_82+pca_83+pca_84+pca_85+pca_86+pca_87+pca_88+pca_89+pca_90+pca_91+pca_92+pca_93+pca_94+pca_95+pca_96+pca_97+pca_98+pca_99+pca_100+pca_101+pca_102+pca_103+pca_104+pca_105+pca_106+pca_107+pca_108+pca_109+pca_110+pca_111+pca_112+pca_113+pca_114+pca_115+pca_116+pca_117+pca_118+pca_119+pca_120+pca_121+pca_122+pca_123+pca_124+pca_125+pca_126+pca_127+pca_128+pca_129+pca_130+pca_131+pca_132+pca_133+pca_134+pca_135+pca_136+pca_

In [10]:
# logit model 1 (based on all 200 pca factors)
logit = sm.ols(formula = formula, data = train_pca_only)
logit_trained = logit.fit()
logit_trained.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.182
Model:,OLS,Adj. R-squared:,0.181
Method:,Least Squares,F-statistic:,221.9
Date:,"Tue, 09 Apr 2019",Prob (F-statistic):,0.0
Time:,05:50:23,Log-Likelihood:,-23370.0
No. Observations:,200000,AIC:,47140.0
Df Residuals:,199799,BIC:,49190.0
Df Model:,200,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1005,0.001,165.162,0.000,0.099,0.102
pca_0,0.1144,0.001,207.952,0.000,0.113,0.116
pca_1,0.0021,0.001,3.486,0.000,0.001,0.003
pca_2,0.0008,0.001,1.372,0.170,-0.000,0.002
pca_3,-0.0007,0.001,-1.230,0.219,-0.002,0.000
pca_4,4.721e-05,0.001,0.080,0.936,-0.001,0.001
pca_5,0.0012,0.001,1.982,0.048,1.28e-05,0.002
pca_6,-0.0003,0.001,-0.541,0.589,-0.001,0.001
pca_7,-0.0031,0.001,-5.217,0.000,-0.004,-0.002

0,1,2,3
Omnibus:,74439.131,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,231262.637
Skew:,1.982,Prob(JB):,0.0
Kurtosis:,6.47,Cond. No.,1.14


In [11]:
#dir(logit)
#logit.data.formula

In [12]:
# new logit formula for significant pca factors only (P>|t| <= 0.05)
# manual exclusion
formula_sig = 'target ~ pca_193+pca_180+pca_174+pca_156+pca_146+pca_143+pca_127+pca_115+pca_114+pca_112+pca_111+pca_109+pca_104+pca_86+pca_79+pca_74+pca_55+pca_50+pca_46+pca_45+pca_43+pca_40+pca_34+pca_29+pca_16+pca_12+pca_8+pca_7+pca_1+pca_0+pca_163+pca_142+pca_78+pca_66+pca_42+pca_23+pca_182+pca_136+pca_135+pca_120+pca_108+pca_62+pca_58+pca_51+pca_48+pca_123+pca_138+pca_102+pca_97+pca_53+pca_33+pca_158+pca_141+pca_71+pca_179+pca_169+pca_67+pca_57+pca_94+pca_152+pca_126+pca_96+pca_85+pca_37+pca_84+pca_9+pca_31+pca_150+pca_22+pca_177+pca_98+pca_64+pca_92+pca_178+pca_133+pca_61+pca_95+pca_100+pca_176+pca_5+pca_15'

In [13]:
# logit model 2 (based only on significant pca factors)
logit = sm.ols(formula = formula, data = train_pca_only)
logit_trained = logit.fit()
logit_trained.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.182
Model:,OLS,Adj. R-squared:,0.181
Method:,Least Squares,F-statistic:,221.9
Date:,"Tue, 09 Apr 2019",Prob (F-statistic):,0.0
Time:,05:50:31,Log-Likelihood:,-23370.0
No. Observations:,200000,AIC:,47140.0
Df Residuals:,199799,BIC:,49190.0
Df Model:,200,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1005,0.001,165.162,0.000,0.099,0.102
pca_0,0.1144,0.001,207.952,0.000,0.113,0.116
pca_1,0.0021,0.001,3.486,0.000,0.001,0.003
pca_2,0.0008,0.001,1.372,0.170,-0.000,0.002
pca_3,-0.0007,0.001,-1.230,0.219,-0.002,0.000
pca_4,4.721e-05,0.001,0.080,0.936,-0.001,0.001
pca_5,0.0012,0.001,1.982,0.048,1.28e-05,0.002
pca_6,-0.0003,0.001,-0.541,0.589,-0.001,0.001
pca_7,-0.0031,0.001,-5.217,0.000,-0.004,-0.002

0,1,2,3
Omnibus:,74439.131,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,231262.637
Skew:,1.982,Prob(JB):,0.0
Kurtosis:,6.47,Cond. No.,1.14
