### Курсовой проект Бриленкова Ильи по курсу "Python для Data Science 2"

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

import xgboost as xgb

import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the training set; reset_index() adds the row number as an explicit
# 'index' column, which the point corrections below use to address rows.
data = pd.read_csv('course_project_train.csv').reset_index()
data.head()

Unnamed: 0,index,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


### Точечные корректировки

In [3]:
# Manual correction for the row whose 'index' is 1984: both 'Maximum Open Credit'
# and 'Current Loan Amount' are overwritten with values taken from
# 'Current Credit Balance' (pandas aligns the right-hand Series on the row label).
data.loc[data['index'] == 1984, ['Maximum Open Credit', 'Current Loan Amount']] = data['Current Credit Balance']

In [4]:
# Manual correction for the row whose 'index' is 2081: 'Maximum Open Credit'
# is replaced by that row's 'Current Loan Amount'.
data.loc[data['index'] == 2081, 'Maximum Open Credit'] = data['Current Loan Amount']

In [5]:
# The helper 'index' column was only needed for the point corrections above.
data = data.drop(['index'], axis=1)

### Заполнение пропущенных данных

In [6]:
def Annual_Income_filling(df):
    """Fill missing 'Annual Income' values of ``df`` with 1366392, in place.

    The column is overwritten on the passed frame; nothing is returned.
    """
    column = 'Annual Income'
    filled = df[column].fillna(1366392)
    df[column] = filled

In [7]:
def Years_in_current_job_filling(df):
    """Convert 'Years in current job' labels to numbers and fill gaps, in place.

    The textual labels ('< 1 year', '1 year', ..., '10+ years') are mapped to
    0.5, 1, ..., 10. Values that are already numeric are kept, so calling the
    function twice on the same frame is harmless. Missing values, and any
    label that is neither known nor numeric, become 5. The column is stored
    explicitly as float instead of relying on pandas to downcast an object
    column during ``fillna``.

    Nothing is returned; ``df`` is modified.
    """
    column = 'Years in current job'
    label_to_years = {
        '< 1 year': 0.5,
        '1 year': 1,
        '2 years': 2,
        '3 years': 3,
        '4 years': 4,
        '5 years': 5,
        '6 years': 6,
        '7 years': 7,
        '8 years': 8,
        '9 years': 9,
        '10+ years': 10,
    }

    raw = df[column]
    # Entries that are already numbers survive this conversion; labels turn into NaN.
    already_numeric = pd.to_numeric(raw, errors='coerce')
    from_labels = raw.map(label_to_years)

    df[column] = from_labels.fillna(already_numeric).fillna(5).astype(float)

In [8]:
def Months_since_last_delinquent_filling(df):
    """Fill missing 'Months since last delinquent' values of ``df`` with 33, in place.

    The column is overwritten on the passed frame; nothing is returned.
    """
    column = 'Months since last delinquent'
    filled = df[column].fillna(33)
    df[column] = filled

In [9]:
def Bankruptcies_filling(df):
    """Fill missing 'Bankruptcies' values of ``df`` with 0, in place.

    The column is overwritten on the passed frame; nothing is returned.
    """
    column = 'Bankruptcies'
    filled = df[column].fillna(0)
    df[column] = filled

In [10]:
def Credit_Score_change(df):
    """Fill missing 'Credit Score' values of ``df`` with 600, in place.

    Operates on the frame passed as ``df`` rather than on the notebook-level
    ``data`` variable, so it works for any frame. Nothing is returned.
    """
    df['Credit Score'] = df['Credit Score'].fillna(600)
# Fill the training frame's 'Credit Score' gaps right away; start_functions()
# runs the same step again later in the notebook.
Credit_Score_change(data)

### Обработка выбросов

In [11]:
def Maximum_Open_Credit_change(df):
    """Replace zero 'Maximum Open Credit' values of ``df`` with 250000, in place.

    Nothing is returned; only rows whose value equals 0 are touched.
    """
    column = 'Maximum Open Credit'
    is_zero = df[column] == 0
    df.loc[is_zero, column] = 250000

In [12]:
def Current_Loan_Amount_change(df):
    """Cap 'Current Loan Amount' at 'Maximum Open Credit', in place.

    Every row of ``df`` whose loan amount exceeds its maximum open credit gets
    the loan amount replaced by that row's maximum open credit. Operates on
    the frame passed as ``df`` rather than on the notebook-level ``data``.
    Nothing is returned.
    """
    loan, limit = 'Current Loan Amount', 'Maximum Open Credit'
    over_limit = df[loan] > df[limit]
    df.loc[over_limit, loan] = df.loc[over_limit, limit]

In [13]:
def Current_Credit_Balance_change(df):
    """Replace small 'Current Credit Balance' values with the loan amount, in place.

    Every row of ``df`` whose balance is below 10000 gets the balance replaced
    by that row's 'Current Loan Amount'. Operates on the frame passed as
    ``df`` rather than on the notebook-level ``data``. Nothing is returned.
    """
    balance, loan = 'Current Credit Balance', 'Current Loan Amount'
    too_small = df[balance] < 10000
    df.loc[too_small, balance] = df.loc[too_small, loan]

In [14]:
def Monthly_Debt_change(df):
    """Clamp implausible 'Monthly Debt' values using 'Current Credit Balance', in place.

    Two steps are applied to ``df`` in order:

    1. rows with a monthly debt below 200 get one tenth of their balance;
    2. rows whose monthly debt (after step 1) exceeds the balance get the
       balance itself.

    Operates on the frame passed as ``df`` rather than on the notebook-level
    ``data``. Nothing is returned.
    """
    debt, balance = 'Monthly Debt', 'Current Credit Balance'

    too_small = df[debt] < 200
    df.loc[too_small, debt] = df.loc[too_small, balance] / 10

    # Evaluated after the first step on purpose: it sees the updated debts.
    above_balance = df[debt] > df[balance]
    df.loc[above_balance, debt] = df.loc[above_balance, balance]

### Запуск функций

In [15]:
def start_functions(df):
    """Run every preprocessing step on ``df``, in a fixed order.

    The missing-value fills come first, then the outlier corrections, then
    the credit-score fill. Each step is called with ``df`` and its return
    value is ignored; nothing is returned.
    """
    steps = (
        Annual_Income_filling,
        Years_in_current_job_filling,
        Months_since_last_delinquent_filling,
        Bankruptcies_filling,
        Maximum_Open_Credit_change,
        Current_Loan_Amount_change,
        Current_Credit_Balance_change,
        Monthly_Debt_change,
        Credit_Score_change,
    )
    for step in steps:
        step(df)

In [16]:
# Apply all preprocessing steps to the training frame.
start_functions(data)

### Dummies

In [17]:
# One-hot encode the remaining non-numeric columns; the preview below shows
# the resulting 'Purpose_*' and 'Term_*' indicator columns.
data = pd.get_dummies(data)

In [18]:
# Preview the encoded training frame.
data.head()

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Purpose_medical bills,Purpose_moving,Purpose_other,Purpose_renewable energy,Purpose_small business,Purpose_take a trip,Purpose_vacation,Purpose_wedding,Term_Long Term,Term_Short Term
0,482087.0,5.0,0.0,11.0,26.3,685960.0,1.0,33.0,1.0,685960.0,...,0,0,0,0,0,0,0,0,0,1
1,1025487.0,10.0,0.0,15.0,15.3,1181730.0,0.0,33.0,0.0,264968.0,...,0,0,0,0,0,0,0,0,1,0
2,751412.0,8.0,0.0,11.0,35.0,1182434.0,0.0,33.0,0.0,1182434.0,...,0,0,0,0,0,0,0,0,0,1
3,805068.0,6.0,0.0,8.0,22.5,147400.0,1.0,33.0,1.0,121396.0,...,0,0,0,0,0,0,0,0,0,1
4,776264.0,8.0,0.0,13.0,13.6,385836.0,1.0,33.0,0.0,125840.0,...,0,0,0,0,0,0,0,0,0,1


### Нормализация

In [19]:
# Feature columns used for scaling, training and prediction.
# NOTE(review): the encoded training frame also contains a
# 'Purpose_renewable energy' column that is not listed here — presumably
# because that category is missing from the test file; confirm.
feats = ['Annual Income', 'Years in current job', 'Tax Liens',
       'Number of Open Accounts', 'Years of Credit History',
       'Maximum Open Credit', 'Number of Credit Problems',
       'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount',
       'Current Credit Balance', 'Monthly Debt', 'Credit Score',
       'Home Ownership_Have Mortgage',
       'Home Ownership_Home Mortgage', 'Home Ownership_Own Home',
       'Home Ownership_Rent', 'Purpose_business loan', 'Purpose_buy a car',
       'Purpose_buy house', 'Purpose_debt consolidation',
       'Purpose_educational expenses', 'Purpose_home improvements',
       'Purpose_major purchase', 'Purpose_medical bills', 'Purpose_moving',
       'Purpose_other', 'Purpose_small business',
       'Purpose_take a trip', 'Purpose_vacation', 'Purpose_wedding',
       'Term_Long Term', 'Term_Short Term']

In [20]:
# StandardScaler is already imported in the first cell; this re-import is redundant.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Standardise every column in `feats` (the one-hot indicators included) on a copy.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split below, so the hold-out rows contribute to the scaling statistics.
df_norm = data.copy()
df_norm[feats] = scaler.fit_transform(df_norm[feats])

data = df_norm.copy()

### Разбиение на train и test<a class="anchor" id="train_and_test"></a>

In [21]:
# Feature matrix and target vector.
X = data[feats]
y = data['Credit Default']

# Shuffled 75/25 hold-out split with a fixed seed (no stratification is requested).
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25, random_state=42)

### Балансировка целевой переменной<a class="anchor" id="target_balancing"></a>

In [22]:
def balance_df_by_target(df, target_name, random_state=None):
    """Oversample the minority class of ``df`` by duplicating its rows.

    The minority rows are appended ``majority_count // minority_count - 1``
    times, so the classes end up roughly (not necessarily exactly) balanced.
    The result is then shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the target column.
    target_name : str
        Name of the target column.
    random_state : int or None, optional
        Seed forwarded to the final shuffle. The default ``None`` keeps the
        shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        A shuffled frame. When rows were appended the index is renumbered
        before shuffling; ``df`` itself is not modified.
    """
    target_counts = df[target_name].value_counts()

    # Nothing to balance in an empty frame.
    if target_counts.empty:
        return df.sample(frac=1, random_state=random_state)

    # idxmax/idxmin return the class *labels*; argmax/argmin return positions
    # in current pandas, which is wrong whenever labels and positions differ.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    # Plain int: multiplying a list by a NumPy integer would not repeat the list.
    disbalance_coeff = int(
        target_counts.loc[major_class_name] // target_counts.loc[minor_class_name]
    ) - 1

    if disbalance_coeff > 0:
        minority_rows = df[df[target_name] == minor_class_name]
        # DataFrame.append was removed in pandas 2.0; one concat does all the copies.
        df = pd.concat([df] + [minority_rows] * disbalance_coeff, ignore_index=True)

    return df.sample(frac=1, random_state=random_state)

In [23]:
TARGET_NAME = 'Credit Default'
# Only the training part is oversampled; the hold-out part keeps its original class ratio.
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
# Class counts after balancing.
df_balanced[TARGET_NAME].value_counts()

0    4074
1    3102
Name: Credit Default, dtype: int64

In [24]:
# Split the balanced frame back into features and target.
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

### Построение модели

In [25]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for train and test, then the test confusion matrix.

    Parameters are the true and predicted labels of the training part followed
    by those of the test part. Everything is printed; nothing is returned.
    """
    labelled_parts = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for title, y_true, y_pred in labelled_parts:
        print(title + '\n\n' + classification_report(y_true, y_pred))

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [26]:
# Gradient-boosted depth-1 trees, fitted on the oversampled training part.
model_xgb = xgb.XGBClassifier(max_depth=1, n_estimators=300,min_child_weight=5,scale_pos_weight=1.15, random_state=42)
model_xgb.fit(X_train, y_train)

y_train_pred = model_xgb.predict(X_train)
y_test_pred = model_xgb.predict(X_test)

# The TRAIN figures are computed on the oversampled rows; the TEST figures on the untouched hold-out.
get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

             precision    recall  f1-score   support

          0       0.71      0.82      0.76      4074
          1       0.70      0.55      0.62      3102

avg / total       0.70      0.70      0.70      7176

TEST

             precision    recall  f1-score   support

          0       0.80      0.80      0.80      1313
          1       0.53      0.53      0.53       562

avg / total       0.72      0.72      0.72      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1045  268
1                264  298


### Тестовые данные

In [27]:
# Load the unlabeled test set; from here on `data` refers to the test frame.
data = pd.read_csv('course_project_test.csv')
data.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,Rent,,4 years,0.0,9.0,12.5,220968.0,0.0,70.0,0.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,
1,Rent,231838.0,1 year,0.0,6.0,32.7,55946.0,0.0,8.0,0.0,educational expenses,Short Term,78298.0,46037.0,2318.0,699.0
2,Home Mortgage,1152540.0,3 years,0.0,10.0,13.7,204600.0,0.0,,0.0,debt consolidation,Short Term,200178.0,146490.0,18729.0,7260.0
3,Home Mortgage,1220313.0,10+ years,0.0,16.0,17.0,456302.0,0.0,70.0,0.0,debt consolidation,Short Term,217382.0,213199.0,27559.0,739.0
4,Home Mortgage,2340952.0,6 years,0.0,11.0,23.6,1207272.0,0.0,,0.0,debt consolidation,Long Term,777634.0,425391.0,42605.0,706.0


In [28]:
# Same preprocessing and one-hot encoding as for the training frame.
start_functions(data)
data = pd.get_dummies(data)

In [29]:
# Scale the test features with the statistics the scaler learned from the
# training data. Refitting here (fit_transform) would standardise the test set
# with different means and variances than the model was trained on.
df_norm = data.copy()
df_norm[feats] = scaler.transform(df_norm[feats])

data = df_norm.copy()

In [30]:
# Predict the class label for every test row.
test_pred = model_xgb.predict(data[feats])

In [31]:
# Store the predictions under the target column name.
data['Credit Default'] = test_pred

In [None]:
# Write only the predicted column, with a header row and without the row index.
data['Credit Default'].to_csv('IBrilenkov_predictions.csv', index=False, header=True)