In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import matplotlib as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

In [3]:
# Load the credit-risk CSV from the Kaggle input directory into a DataFrame.
credit_risk_df = pd.read_csv('/kaggle/input/credit-risk/original.csv')

In [4]:
# Column names, dtypes and non-null counts.
credit_risk_df.info()

In [5]:
# Summary statistics of the DataFrame's columns.
credit_risk_df.describe()

In [6]:
fig = px.histogram(credit_risk_df, 
                   x='income',
                   marginal='box', 
                   nbins=47, 
                   title='Income Distribution')
fig.update_layout(bargap=0.1)
fig.show()

In [7]:
px.scatter(credit_risk_df,x='age',y='loan',title='Age vs Loan')

In [8]:
px.histogram(credit_risk_df,x='income',y='default')

In [9]:
fig = px.histogram(credit_risk_df, 
                   x='loan', 
                   marginal='box', 
                   nbins=47, 
                   title='Distribution of Loan')
fig.update_layout(bargap=0.1)
fig.show()

In [10]:
fig = px.histogram(credit_risk_df, x='income',y='loan', color='default', title='income vs loan')
fig.update_layout(bargap=0.1)
fig.show()

In [11]:
# Identify the input columns (a list of column names)
input_cols = list(credit_risk_df.columns)[1:-1]
target_col = list(credit_risk_df.columns)[-1]
print(input_cols)
print(target_col)

In [12]:
inputs_df = credit_risk_df[input_cols].copy()
targets = credit_risk_df[target_col]

In [13]:
# Display the input feature frame.
inputs_df

In [14]:
# Display the target series.
targets

In [15]:
# Column dtypes and non-null counts of the full dataset.
credit_risk_df.info()

In [16]:
# Number of missing values in each input column.
inputs_df.isnull().sum()

In [17]:
from sklearn.impute import SimpleImputer

In [18]:
# Imputer configured to replace missing values with the column mean.
imputer = SimpleImputer(strategy='mean')

In [19]:
# Fit the imputer on the 'age' column only.
# NOTE(review): this is fitted on every row, before the train/validation split
# performed further down, so validation rows contribute to the imputed value.
# Consider fitting on the training rows only.
imputer.fit(inputs_df[['age']])

In [20]:
# Overwrite 'age' with its imputed values, then re-check the non-null counts.
inputs_df[['age']] = imputer.transform(inputs_df[['age']])
inputs_df.info()

In [21]:
# Per-column minimum and maximum before scaling.
inputs_df.describe().loc[['min', 'max']]

In [22]:
from sklearn.preprocessing import MinMaxScaler

In [23]:
# Min-max scaler with scikit-learn's default settings.
scaler =MinMaxScaler()

In [24]:
# NOTE(review): like the imputer, the scaler is fitted on all rows prior to the
# train/validation split.
scaler.fit(inputs_df[input_cols])

In [25]:
# Overwrite the input columns with their scaled values.
# NOTE: not idempotent; re-running this cell transforms already-scaled data.
inputs_df[input_cols] = scaler.transform(inputs_df[input_cols])

In [26]:
# Display the imputed and scaled inputs.
inputs_df

In [27]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
(train_inputs, val_inputs,
 train_targets, val_targets) = train_test_split(
    inputs_df, targets, test_size=0.25, random_state=42
)

In [30]:
# Display the training inputs.
train_inputs

In [41]:
# Number of training rows per target value.
train_targets.value_counts()

In [32]:
# Display the validation inputs.
val_inputs

In [33]:
# Display the validation targets.
val_targets

In [34]:
from sklearn.linear_model import LogisticRegression

In [42]:
# class_weight='balanced' selects scikit-learn's mode that weights classes
# inversely to their frequency in the targets passed to fit().
model = LogisticRegression(class_weight='balanced')

In [43]:
# Fit the classifier on the training split.
model.fit(train_inputs,train_targets)

In [44]:
from sklearn.metrics import mean_squared_error

In [46]:
train_pred = model.predict(train_inputs)

In [84]:
train_rmse = mean_squared_error(train_pred,train_targets)
print("RMSE for training data is {:.2f} %".format(train_rmse*100))

In [85]:
val_pred = model.predict(val_inputs)

In [86]:
val_rmse = mean_squared_error(val_pred,val_targets)
print("RMSE for test data is {:.2f} %".format(val_rmse*100))

In [98]:
# Coefficients learned by the fitted logistic-regression model.
weights = model.coef_
weights

In [99]:
from sklearn.metrics import accuracy_score

In [101]:
# Accuracy of the training predictions against the training targets.
accuracy_score(train_targets, train_pred)

In [102]:
from sklearn.metrics import confusion_matrix

In [103]:
# Confusion matrix for the training split; normalize='true' normalises over
# the true labels (rows).
confusion_matrix(train_targets, train_pred, normalize='true')

In [126]:
def predict_and_plot(inputs, targets, name=''):
    """Predict with the notebook-level ``model``, print accuracy, plot the confusion matrix.

    Args:
        inputs: Feature rows passed to ``model.predict``.
        targets: True labels matching ``inputs`` row for row.
        name: Label prefixed to the plot title (e.g. ``'Train'``).

    Returns:
        The predictions returned by ``model.predict(inputs)``.
    """
    # Imported locally so the function does not depend on what the notebook's
    # ``plt`` alias is bound to: the top-level ``matplotlib`` package has no
    # ``xlabel`` / ``ylabel`` / ``title`` functions, those live in ``pyplot``.
    import matplotlib.pyplot as plt

    preds = model.predict(inputs)
    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    # Confusion matrix normalised over the true labels (rows).
    cf = confusion_matrix(targets, preds, normalize='true')

    # Draw on a fresh figure/axes so repeated calls never share a plot, and
    # label through the Axes object rather than pyplot's implicit state.
    _, ax = plt.subplots()
    sns.heatmap(cf, annot=True, ax=ax)
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Target')
    ax.set_title('{} Confusion Matrix'.format(name))
    return preds

In [127]:
# Accuracy and confusion matrix for the training split.
train_preds = predict_and_plot(train_inputs, train_targets, 'Train')

In [128]:
# Accuracy and confusion matrix for the validation split.
val_preds = predict_and_plot(val_inputs, val_targets, 'Validation')

In [131]:
import joblib

# Bundle the fitted model with its preprocessing objects and the column names
# it was trained with, so the whole pipeline is stored in a single file.
credit_risk_model = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'input_cols': input_cols,
    'target_col': target_col,
}
joblib.dump(credit_risk_model, 'credit_risk_model.joblib')

# Load the bundle back from disk.
credit_risk2 = joblib.load('credit_risk_model.joblib')