In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
df_train = pd.read_csv("train_final.csv")
df_test = pd.read_csv("test_final.csv")

In [3]:
df_train.head()

Unnamed: 0,TARGET,SK_ID_CURR,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH_NORM,CREDIT_TERM,ANNUITY_INCOME_PERCENT,DAYS_EMPLOYED_PERCENT,GROUP_EXT_SOURCES_MEDIAN,PREV_Cash_SIMPLE_INTERESTS_MAX,PREV_Cash_SIMPLE_INTERESTS_MEAN,EXT_SOURCES_PROD,EXT_SOURCES_WEIGHTED
0,1.0,100002,0.083,0.263,0.1394,1.506891,0.060749,0.121978,0.067329,0.365,,,0.003044,0.847
1,0.0,100003,0.3113,0.622,,-0.166811,0.027598,0.132217,0.070862,0.5845,0.01162,0.01162,,
2,0.0,100004,,0.556,0.7295,-0.6895,0.05,0.1,0.011814,0.5664,,,,
3,0.0,100006,,0.6504,,-0.680105,0.094941,0.2199,0.159958,0.567,0.02742,0.01924,,
4,0.0,100007,,0.3228,,-0.892525,0.042623,0.179963,0.152418,0.5693,0.03616,0.0249,,


In [4]:
y_train = df_train["TARGET"]
X_ptrain = df_train.drop(["TARGET", "SK_ID_CURR"], axis = 1)
X_ptest = df_test.drop(["SK_ID_CURR"], axis = 1)

In [5]:
# Median imputation of missing values
imputer = Imputer(strategy = 'median')

In [6]:
# Fit on the training data
imputer.fit(X_ptrain)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [7]:
# Transform both training and testing data
train = pd.DataFrame(imputer.transform(X_ptrain))
test = pd.DataFrame(imputer.transform(X_ptest))

## Logit from statsmodels

In [8]:
X_train = sm.add_constant(train)
X_test = sm.add_constant(test)

In [9]:
model = Logit(y_train, X_train)
results = model.fit()
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.254144
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:               307506
Model:                          Logit   Df Residuals:                   307493
Method:                           MLE   Df Model:                           12
Date:                Sat, 22 Jun 2019   Pseudo R-squ.:                 0.09409
Time:                        23:19:33   Log-Likelihood:                -78151.
converged:                       True   LL-Null:                       -86268.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2329      0.095     -2.458      0.014      -0.419      -0.047
0             -2.0015      0.

## Logistic Regression hyperparameters tuning

In [11]:
#from sklearn.model_selection import GridSearchCV
#from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logit=LogisticRegression()
logit_cv=GridSearchCV(logit, grid, cv=10)
logit_cv.fit(train, y_train)

print("tuned hpyerparameters :(best parameters) ",logit_cv.best_params_)
print("accuracy :",logit_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2'}
accuracy : 0.9193088915338238


In [15]:
logitTuned = LogisticRegression(penalty= 'l1', C = 1.0)

In [16]:
logitTuned.fit(train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
y_pred = logitTuned.predict_proba(test)[:, 1]

In [18]:
submit = df_test[['SK_ID_CURR']]
submit['TARGET'] = y_pred

submit.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.052161
1,100005,0.091525
2,100013,0.025844
3,100028,0.051595
4,100038,0.118015


In [14]:
submit.to_csv('LOGITfinal2.csv', index = False)

In [19]:
rocauc = 0.71976

In [20]:
gini = (2*rocauc) - 1
gini

0.4395199999999999