In [133]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import pandas_profiling
import random
import seaborn as sns
import warnings
from matplotlib import pyplot as plt
from sklearn import preprocessing
import pandas as pd
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn import metrics
warnings.filterwarnings("ignore")
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory
# and print them one per line.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [134]:
# Location of the competition files. os.path.join copes with the trailing slash
# on DATA_DIR; plain string concatenation with '/train.csv' would yield '//'.
DATA_DIR = '/kaggle/input/sf-scoring/'
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [135]:
# (rows, columns) of the sample submission frame.
sample_submission.shape

In [136]:
# (rows, columns) of the test frame.
df_test.shape

In [137]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

In [138]:
# First five training rows.
df_train.head(5)

In [139]:
# Column names, dtypes and non-null counts of the test frame.
df_test.info()

In [140]:
# First five rows of the sample submission, to see the expected file layout.
sample_submission.head(5)

In [141]:
# Column names, dtypes and non-null counts of the sample submission.
sample_submission.info()

In [142]:
# IMPORTANT: train and test are stacked into one frame so that every feature
# transformation below is applied identically to both.
df_train['sample'] = 1  # marks training rows
df_test['sample'] = 0   # marks test rows
# The test set has no `default` target (it is what we have to predict); fill it
# with a placeholder 0 so the columns line up. These zeros are NOT real labels.
df_test['default'] = 0

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. Test rows come first, then train rows, with a fresh 0..n-1 index.
data = pd.concat([df_test, df_train], sort=False, ignore_index=True)

In [143]:
#pandas_profiling.ProfileReport(data, explorative=True)

In [144]:
# Some `education` values are missing. Each missing row gets its own category
# drawn uniformly from ed_list, using a seeded generator so that a re-run
# reproduces the same fill. (A single unseeded random.choice call would assign
# one arbitrary category to every gap and change between runs.)
ed_list = ['SCH', 'GRD', 'UGR', 'PGR', 'ACD']
education_rng = np.random.default_rng(42)
education_missing = data['education'].isna()
data.loc[education_missing, 'education'] = education_rng.choice(
    ed_list, size=int(education_missing.sum())
)

In [145]:
## Replace the application date with its int64 representation so the column is numeric.
parsed_app_date = pd.to_datetime(data['app_date'])
data['app_date'] = parsed_app_date.astype(np.int64)

In [146]:
# Number of distinct values per column; dropna=False counts NaN as a value too.
data.nunique(dropna=False)

In [147]:
# Feature groups used throughout the notebook.
# Numeric features.
num_cols = [
    'age',
    'score_bki',
    'decline_app_cnt',
    'bki_request_cnt',
    'income',
]
# Categorical features.
cat_cols = [
    'education',
    'first_time',
    'sna',
    'work_address',
    'home_address',
    'region_rating',
]
# Binary features.
bin_cols = [
    'sex',
    'car',
    'car_type',
    'good_work',
    'foreign_passport',
]

In [148]:
# Class balance of the target. Only training rows (sample == 1) are counted:
# test rows carry a placeholder default = 0 that would inflate the 0 bar.
data.loc[data['sample'] == 1, 'default'].value_counts(ascending=True).plot(kind='barh')

In [149]:
## Log-transform the listed numeric columns, then plot a histogram of every numeric column.
# NOTE: the transform overwrites the columns in place, so running this cell a
# second time applies the log again.
num_cols_log = ['age', 'decline_app_cnt', 'bki_request_cnt', 'income']
for col in num_cols_log:
    data[col] = np.log1p(data[col])  # log(1 + x), vectorised over the column

for col in num_cols:
    plt.figure()
    # Only strictly positive values are plotted.
    # NOTE(review): this filter also applies to score_bki, which is not
    # log-transformed — confirm that dropping its non-positive values is intended.
    # sns.distplot is deprecated; histplot is its replacement for a plain
    # count histogram.
    sns.histplot(data[col][data[col] > 0].dropna())
    plt.title(col)
    plt.show()

In [150]:
## Compare age across the two values of `default`.
# Restricted to training rows (sample == 1): test rows only hold a placeholder
# default = 0 and would distort the 0 group.
sns.set_theme(style="whitegrid")
labelled_rows = data[data['sample'] == 1]
sns.boxplot(x=labelled_rows['default'], y=labelled_rows['age'])

In [151]:
## Compare decline_app_cnt across the two values of `default`.
# Restricted to training rows (sample == 1): test rows only hold a placeholder
# default = 0 and would distort both the plot and the group means.
sns.set_theme(style="whitegrid")
labelled_rows = data[data['sample'] == 1]
sns.boxplot(x=labelled_rows['default'], y=labelled_rows['decline_app_cnt'])
print(labelled_rows.groupby('default')['decline_app_cnt'].mean())

In [152]:
## Compare bki_request_cnt across the two values of `default`.
# Restricted to training rows (sample == 1): test rows only hold a placeholder
# default = 0 and would distort the 0 group.
sns.set_theme(style="whitegrid")
labelled_rows = data[data['sample'] == 1]
sns.boxplot(x=labelled_rows['default'], y=labelled_rows['bki_request_cnt'])

In [153]:
## Compare income across the two values of `default`.
# Restricted to training rows (sample == 1): test rows only hold a placeholder
# default = 0 and would distort both the plot and the group means.
sns.set_theme(style="whitegrid")
labelled_rows = data[data['sample'] == 1]
sns.boxplot(x=labelled_rows['default'], y=labelled_rows['income'])
print(labelled_rows.groupby('default')['income'].mean())

In [154]:
## Correlation analysis of the numeric columns.
num_corr = data[num_cols].corr()
# Heatmap of the absolute correlations on a fixed 0..1 colour scale.
sns.heatmap(num_corr.abs(), vmin=0, vmax=1)
# The signed correlation matrix is the cell's displayed output.
num_corr

All the features in num_cols are largely independent of each other (their pairwise correlations are low), which is good.

In [155]:
## Significance of the columns in num_cols_log, scored with f_classif.
# Only training rows (sample == 1) are scored: test rows hold a placeholder
# default = 0, not a real label.
# NOTE(review): num_cols_log leaves out score_bki — confirm that is intended.
labelled_rows = data[data['sample'] == 1]
imp_num = pd.Series(
    # f_classif returns a tuple; element 0 holds the F-statistics.
    f_classif(labelled_rows[num_cols_log], labelled_rows['default'])[0],
    index=num_cols_log,
).sort_values()
imp_num.plot(kind='barh')

In [156]:
## Remove the client identifier and the application date from the frame.
data = data.drop(columns=['client_id', 'app_date'])

In [157]:
## Prepare the data for the model: integer-encode the binary columns and the
## listed categorical columns.
le = preprocessing.LabelEncoder()
columns = ['first_time', 'sna', 'work_address', 'home_address', 'region_rating']

# The same encoder object is refitted on every column in turn, binary columns first.
for column in [*bin_cols, *columns]:
    data[column] = le.fit_transform(data[column])

In [158]:
## Significance of the binary and categorical columns, scored with mutual_info_classif.
# `education` is label-encoded first so that every scored column is numeric.
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])

# Only training rows (sample == 1) are scored: test rows hold a placeholder
# default = 0, not a real label.
labelled_rows = data[data['sample'] == 1]
discrete_cols = bin_cols + cat_cols
imp_cat = pd.Series(
    mutual_info_classif(
        labelled_rows[discrete_cols],
        labelled_rows['default'],
        discrete_features=True,
    ),
    index=discrete_cols,
).sort_values()
imp_cat.plot(kind='barh')

In [159]:
# One-hot encode the categorical columns; every other column is left as is.
one_hot_cols = [
    'education',
    'first_time',
    'sna',
    'work_address',
    'home_address',
    'region_rating',
]
data = pd.get_dummies(data, columns=one_hot_cols)
# Show the resulting column names.
data.columns

In [160]:
# Split the combined frame back into its training and test parts using the
# `sample` marker, which is dropped afterwards.
is_train_row = data['sample'] == 1
is_test_row = data['sample'] == 0
train_data = data[is_train_row].drop(columns=['sample'])
# The test part also loses the placeholder `default` column.
test_data = data[is_test_row].drop(columns=['sample', 'default'])

X = train_data.drop(columns=['default'])  # feature matrix
y = train_data['default'].values          # target array

In [161]:
# Hold out 20% of the training data for validation (test_size); the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [162]:
# Sanity check: shapes of the train/validation features and targets.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [173]:
## Standardise the numeric columns.
# The scaler is fitted on the training split only and then reused for the
# validation split. Fitting a second scaler on X_test would scale it with its
# own mean/variance, so the two splits would not share one feature scale.
num_scaler = StandardScaler()
# Work on explicit copies so the column assignment below does not write into a
# slice of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[num_cols] = num_scaler.fit_transform(X_train[num_cols].values)
X_test[num_cols] = num_scaler.transform(X_test[num_cols].values)

In [164]:
## Fit the logistic-regression model and score the validation split.
logreg = LogisticRegression(
    C=2,
    class_weight='balanced',
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    l1_ratio=None,
    max_iter=100000,
    multi_class='ovr',
    n_jobs=None,
    penalty='l2',
    # The 'sag' solver shuffles the data, so a fixed seed is needed for the fit
    # to be repeatable; 42 matches the seed used for the validation split.
    random_state=42,
    solver='sag',
    tol=0.001,
    verbose=0,
    warm_start=False,
)
logreg.fit(X_train, y_train)
# Second column of predict_proba: the score passed to the ROC plot below.
y_pred = logreg.predict_proba(X_test)[:, 1]
# Hard class labels: used for the confusion matrix below.
y_predd = logreg.predict(X_test)

In [175]:
## Confusion matrix of the true validation labels against the predicted class
## labels (y_predd), to judge how good the model is.
confusion_matrix(y_test, y_predd)

In [166]:
## Draw the ROC curve as a visual check of model quality.
def calc_and_plot_roc(y_test, y_pred):
    """Plot the ROC curve for the given labels and scores, with the AUC in the legend.

    Parameters
    ----------
    y_test : true labels, passed to roc_curve and roc_auc_score.
    y_pred : predicted scores, passed to roc_curve and roc_auc_score.

    Returns
    -------
    None. The curve is drawn on a new 8x8 matplotlib figure.
    """
    auc_value = roc_auc_score(y_test, y_pred)
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred)

    plt.figure(figsize=(8, 8))
    # Dashed reference line.
    plt.plot([0, 1], label='Baseline', linestyle='--')
    plt.plot(false_pos_rate, true_pos_rate, label=f'AUC = {auc_value:.2f}')
    plt.title('Receiver Operating Characteristic', fontsize=15)
    plt.xlabel('False positive rate (FPR)', fontsize=15)
    plt.ylabel('True positive rate (TPR)', fontsize=15)
    plt.legend(fontsize=15)

calc_and_plot_roc(y_test, y_pred)

In [167]:
## Hyper-parameter search with GridSearchCV. It is switched off by default: the
## earlier run gave the same result as the hand-set model, so its best
## parameters are kept below for reference instead of being recomputed.
RUN_GRID_SEARCH = False

# Best parameters reported by the earlier search run.
RECORDED_BEST_PARAMS = {
    'C': 59.94842503189409,
    'class_weight': 'balanced',
    'dual': False,
    'fit_intercept': True,
    'intercept_scaling': 1,
    'l1_ratio': None,
    'max_iter': 50,
    'multi_class': 'ovr',
    'n_jobs': None,
    'penalty': 'l2',
    'random_state': None,
    'solver': 'sag',
    'tol': 0.001,
    'verbose': 0,
    'warm_start': False,
}

if RUN_GRID_SEARCH:
    from sklearn.model_selection import GridSearchCV

    iter_ = 50
    epsilon_stop = 1e-3

    # Settings shared by every sub-grid. class_weight uses None (the object),
    # not the string 'none', which LogisticRegression does not accept.
    shared_grid = {
        'class_weight': [None, 'balanced'],
        'max_iter': [iter_],
        'tol': [epsilon_stop],
        'C': np.logspace(0, 4, 10),
    }
    # Only solver/penalty pairs that LogisticRegression supports are listed:
    # 'lbfgs' cannot fit an l1 penalty, so l1 is paired with 'liblinear' and 'saga'.
    # NOTE(review): the unpenalised sub-grid and the multi_class option of the
    # earlier attempt are left out; the largest C values already approximate
    # "no penalty" — confirm this coverage is sufficient.
    param_grid = [
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], **shared_grid},
        {'penalty': ['l2'],
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
         **shared_grid},
    ]

    gridsearch = GridSearchCV(LogisticRegression(random_state=42), param_grid,
                              scoring='f1', n_jobs=-1, cv=5)
    gridsearch.fit(X_train, y_train)
    model = gridsearch.best_estimator_
    best_parameters = model.get_params()
else:
    best_parameters = RECORDED_BEST_PARAMS

# Print the parameters, one "name: value" pair per line.
for param_name in sorted(best_parameters):
    print(f'{param_name}: {best_parameters[param_name]}')

In [176]:
#Let's teach model on all data
logreg.fit(X, y)

In [177]:
predict_submission = logreg.predict(test_data)

In [178]:
# Store the predictions in the `default` column and save the file for upload.
# NOTE(review): the assignment is by position — it assumes the rows of
# sample_submission are in the same order as the test rows; confirm.
sample_submission['default'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
# Preview the first ten rows of the saved submission.
sample_submission.head(10)

In [180]:
!kaggle competitions submit -c sf-scoring -f ssubmission.csv -m "Artjom Andruk"
# !kaggle competitions submit your-competition-name -f submission.csv -m 'My submission message'