In [17]:
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score

from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score



In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv'

!wget $data

--2022-10-05 19:24:04--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73250 (72K) [text/plain]
Saving to: ‘AER_credit_card_data.csv’


2022-10-05 19:24:05 (992 KB/s) - ‘AER_credit_card_data.csv’ saved [73250/73250]



In [5]:
# Load the locally saved CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='AER_credit_card_data.csv')

# Transposed preview: each dataset column becomes a row; head caps it at 100 rows.
df.T.head(n=100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1309,1310,1311,1312,1313,1314,1315,1316,1317,1318
card,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,no,no,no,yes,no,yes,yes,yes
reports,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,5,0,0,0
age,37.66667,33.25,33.66667,30.5,32.16667,23.25,27.91667,29.16667,37.0,28.41667,...,40.16667,30.58333,39.33333,22.58333,30.33333,33.58333,23.91667,40.58333,32.83333,48.25
income,4.52,2.42,4.5,2.54,9.7867,2.5,3.96,2.37,3.8,3.2,...,2.0,2.512,2.3004,3.86,2.18,4.566,3.192,4.6,3.7,3.7
share,0.03327,0.005217,0.004156,0.065214,0.067051,0.044438,0.012576,0.076434,0.245628,0.01978,...,0.254827,0.002627,0.000522,0.000311,0.00055,0.002146,0.000376,0.026513,0.008999,0.111619
expenditure,124.9833,9.854167,15.0,137.8692,546.5033,91.99667,40.83333,150.79,777.8217,52.58,...,424.6292,4.583333,0.0,0.0,0.0,7.333333,0.0,101.2983,26.99667,344.1575
owner,yes,no,yes,no,yes,no,no,yes,yes,no,...,yes,no,no,no,no,yes,no,yes,no,yes
selfemp,no,no,no,no,no,no,no,no,no,no,...,no,yes,yes,no,yes,no,no,no,yes,no
dependents,3,3,4,0,2,0,2,0,0,0,...,1,3,0,1,0,0,3,2,0,2
months,54,34,58,25,64,54,7,77,97,65,...,121,36,46,36,13,94,12,1,60,2


#### EDA - Data Prep

In [6]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

card            object
reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner           object
selfemp         object
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object

In [7]:

# Keep the full column index around; later cells derive the feature lists from it.
columns = df.columns

# Columns whose dtype compares equal to 'object' are treated as categorical.
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

# Normalise the string values: lower-case them, then turn spaces into underscores.
for c in categorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ','_')


In [8]:
# Numerical features: every column that was not flagged as categorical.
numerical = [
    name
    for name in columns
    if name not in categorical_columns
]

# Categorical features: the flagged columns, leaving out the 'card' column.
categorical = [
    name
    for name in categorical_columns
    if name != 'card'
]

In [11]:
# List each numerical column together with its dtype.
for c in numerical:
    print(f'{c} : {df[c].dtype}')

reports : int64
age : float64
income : float64
share : float64
expenditure : float64
dependents : int64
months : int64
majorcards : int64
active : int64


In [12]:
# List each categorical column together with its dtype.
for c in categorical:
    print(f'{c} : {df[c].dtype}')

owner : object
selfemp : object


In [13]:
label = 'card'

# Encode the target as 1 ('yes') / 0 (anything else).
# Guarded so the cell is safe to re-run: once the column is numeric, comparing it
# to 'yes' again would be False everywhere and silently zero out the whole target.
if not pd.api.types.is_numeric_dtype(df[label]):
    df[label] = (df[label] == 'yes').astype(int)

df[label].value_counts() #class-imbalance

1    1023
0     296
Name: card, dtype: int64

In [14]:
# Explicit list of the model input columns.
features = [
    "reports",
    "age",
    "income",
    "share",
    "expenditure",
    "dependents",
    "months",
    "majorcards",
    "active",
    "owner",
    "selfemp",
]

# Compare the size of the explicit list with the derived categorical/numerical lists.
len(features), len(categorical), len(numerical)

(11, 2, 9)

#### Training, Validation & Testing Data

In [15]:
# Hold out 20% of the rows for testing, then take 25% of the remainder for
# validation. Both splits use random_state=1.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Replace the shuffled original index of each split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop = True) for part in (df_train, df_val, df_test)
)

# Pull the target out of every split: pop() returns the 'card' column and removes
# it from the frame, so the feature frames no longer contain the label.
y_train, y_val, y_test = (
    part.pop('card').values for part in (df_train, df_val, df_test)
)

#### Question 1

ROC AUC could also be used to evaluate feature importance of numerical variables.

Let's do that

* For each numerical variable, use it as the score and compute its AUC against the `card` target.
* Use the training dataset for that.

In [19]:
# Treat every numerical feature as a raw score and compute its AUC against the
# training target.
for c in numerical:
    feature_values = df_train[c].values
    auc_score = roc_auc_score(y_train, feature_values)

    # An AUC under 0.5 is recomputed with the sign of the feature flipped.
    if auc_score < 0.5:
        auc_score = roc_auc_score(y_train, -feature_values)

    print(f'numeric feature : {c}, AUC : {auc_score}')

numeric feature : reports, AUC : 0.7166629860689376
numeric feature : age, AUC : 0.5240020979407055
numeric feature : income, AUC : 0.5908049467233478
numeric feature : share, AUC : 0.989183643423692
numeric feature : expenditure, AUC : 0.991042345276873
numeric feature : dependents, AUC : 0.5327757227773791
numeric feature : months, AUC : 0.5294217780967629
numeric feature : majorcards, AUC : 0.5343859842838476
numeric feature : active, AUC : 0.6043173411362006


#### Training the Model

In [20]:
#one-hot encoding
dv  = DictVectorizer(sparse=False)
train_dict = df_train[numerical + categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)


In [21]:
# Train the logistic regression model on the encoded training matrix.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
)
model.fit(X_train, y_train)

In [22]:
# Validate the model: encode the validation rows with the already-fitted vectorizer.
val_dict = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Keep the second column of predict_proba as the predicted score.
y_val_pred = model.predict_proba(X_val)[:,1]

# Hard decisions at a 0.5 threshold. The name `churn_pred` is kept because the
# following cell reads it; here it holds the predicted `card` decisions.
churn_pred = y_val_pred >= 0.5

# Fraction of validation rows where the decision matches the true label.
is_correct = y_val == churn_pred
accuracy = round(is_correct.mean(), 4)
accuracy

0.9697

In [23]:
# Cross-check the manually computed accuracy with scikit-learn's accuracy_score.
accuracy_score(y_val, churn_pred)

0.9696969696969697

In [24]:
# Accuracy alone is not a reliable metric here because the classes are imbalanced (1023 positives vs. 296 negatives).

#### Question - 2

* What's the AUC of this model on the validation dataset? (round to 3 digits)

In [26]:
# ROC AUC of the model's predicted scores on the validation set, rounded to 3 decimals.
round(roc_auc_score(y_val, y_val_pred),3)

0.995

#### Question - 4
