In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split 

import pickle
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score



import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline


In [3]:
import warnings
warnings.simplefilter('ignore')

In [4]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [5]:
# input
DATASET_PATH = 'training_project_data.csv'
PREP_DATASET_PATH = 'training_project_data_prep.csv'

# output
TRAIN_FULL_PATH = 'training_project_train_full.csv'
TRAIN_PART_PATH = 'training_project_train_part_b.csv'
TEST_PART_PATH = 'training_project_test_part.csv'

SCALER_FILE_PATH = 'scaler.pkl'

In [7]:
df = pd.read_csv(PREP_DATASET_PATH)
df_base = pd.read_csv(DATASET_PATH)

TARGET_NAME = 'NEXT_MONTH_DEFAULT'
BASE_FEATURE_NAMES = df_base.columns.drop(TARGET_NAME).tolist()
NEW_FEATURE_NAMES = df.columns.drop([TARGET_NAME, 'ID'] + BASE_FEATURE_NAMES).tolist()

NUM_FEATURE_NAMES = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                     'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

CAT_FEATURE_NAMES = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

SELECTED_FEATURE_NAMES = NUM_FEATURE_NAMES + NEW_FEATURE_NAMES

for colname in CAT_FEATURE_NAMES:
    df[colname] = pd.Categorical(df[colname])
    
df[CAT_FEATURE_NAMES].dtypes

scaler = StandardScaler()

with open(SCALER_FILE_PATH, 'rb') as file:
    scaler = pickle.load( file)
    
df_norm = df.copy()
df_norm[NUM_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUM_FEATURE_NAMES])

df = df_norm.copy()

X = df[SELECTED_FEATURE_NAMES]
y = df[TARGET_NAME]

X_s=X.copy()
X_s[NUM_FEATURE_NAMES] = scaler.transform(X[NUM_FEATURE_NAMES])

data_train, data_test, labels_train, labels_test = train_test_split(X_s[SELECTED_FEATURE_NAMES],y, test_size=0.3)

In [8]:
data_train.shape

(7000, 51)

Логистическая регрессия - Максимизация правдоподобия

In [9]:
Labels_train1 = (labels_train == 1)
Labels_test1 = (labels_test==1)

LRC = LogisticRegression()
LRC.fit(data_train, Labels_train1.astype(int))

# Test 
y_train_pred = LRC.predict(data_train)
y_test_pred= LRC.predict(data_test)

In [10]:
print('ошибка LRC = ',np.mean(np.abs(y_test_pred-Labels_test1.astype(int)))*100,'%')

ошибка LRC =  17.133333333333333 %


In [11]:


get_classification_report(Labels_train1.astype(int), y_train_pred, Labels_test1.astype(int), y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5472
           1       0.67      0.35      0.46      1528

    accuracy                           0.82      7000
   macro avg       0.76      0.65      0.68      7000
weighted avg       0.80      0.82      0.80      7000

TEST

              precision    recall  f1-score   support

           0       0.84      0.96      0.90      2333
           1       0.72      0.38      0.50       667

    accuracy                           0.83      3000
   macro avg       0.78      0.67      0.70      3000
weighted avg       0.82      0.83      0.81      3000

CONFUSION MATRIX

col_0                  0    1
NEXT_MONTH_DEFAULT           
0                   2234   99
1                    415  252


SVM - максимизация зазора

In [12]:
from sklearn import svm

cSVM = svm.SVC()
cSVM.fit(data_train, Labels_train1.astype(int))

# Test 
predicteds = cSVM.predict(data_test)
# Test 
y_train_pred = cSVM.predict(data_train)
y_test_pred= cSVM.predict(data_test)

In [13]:
print('ошибка SVM = ',np.mean(np.abs(y_test_pred-Labels_test1.astype(int)))*100,'%')

ошибка SVM =  17.433333333333334 %


In [14]:

get_classification_report(Labels_train1.astype(int), y_train_pred, Labels_test1.astype(int), y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      5472
           1       0.68      0.32      0.43      1528

    accuracy                           0.82      7000
   macro avg       0.76      0.64      0.66      7000
weighted avg       0.80      0.82      0.79      7000

TEST

              precision    recall  f1-score   support

           0       0.84      0.96      0.90      2333
           1       0.72      0.35      0.47       667

    accuracy                           0.83      3000
   macro avg       0.78      0.66      0.68      3000
weighted avg       0.81      0.83      0.80      3000

CONFUSION MATRIX

col_0                  0    1
NEXT_MONTH_DEFAULT           
0                   2241   92
1                    431  236


In [35]:
from sklearn.model_selection import GridSearchCV

In [None]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, scoring ='f1_micro',)
clf.fit(X_s, y)
#GridSearchCV(estimator=SVC(),     param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})