In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore', '.*do not.*', )

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Credit Card Fraud.csv')

In [3]:
# Preview the raw frame (the rendered output is truncated to the first and last rows).
data

Unnamed: 0,over_draft,credit_usage,credit_history,purpose,current_balance,Average_Credit_Balance,employment,location,personal_status,other_parties,...,property_magnitude,cc_age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12,existing paid,furniture/equipment,1736,<100,4<=X<7,3,female div/dep/mar,none,...,real estate,31,none,own,1,unskilled resident,1,none,yes,good
996,<0,30,existing paid,used car,3857,<100,1<=X<4,4,male div/sep,none,...,life insurance,40,none,own,1,high qualif/self emp/mgmt,1,yes,yes,good
997,no checking,12,existing paid,radio/tv,804,<100,>=7,4,male single,none,...,car,38,none,own,1,skilled,1,none,yes,good
998,<0,45,existing paid,radio/tv,1845,<100,1<=X<4,4,male single,none,...,no known property,23,none,for free,1,skilled,1,yes,yes,bad


### Exploratory Data Analysis

In [108]:
# Work on a copy so the raw frame `data` stays untouched for the cells that still plot from it.
ds = data.copy()

In [109]:
# Build the cleaned frame `ds` from the raw frame `data`.
# Starting from a fresh copy keeps this cell safe to re-run: it drops columns,
# so running it twice on the same `ds` would otherwise raise a KeyError.
ds = data.copy()

# Clean the over_draft feature
# NOTE(review): '<0' and '0<=X<200' both become 'no' while '>=200' becomes 'yes';
# confirm this matches the intended meaning of the raw codes.
ds['over_draft'] = ds['over_draft'].replace({
    'no checking': 'unchecked',
    '<0': 'no',
    '0<=X<200': 'no',
    '>=200': 'yes',
})

# Clean the credit_history feature (shorten labels and merge the "paid" variants)
ds['credit_history'] = ds['credit_history'].replace({
    'critical/other existing credit': 'critical',
    'delayed previously': 'delayed',
    'no credits/all paid': 'all paid',
    'existing paid': 'all paid',
})

# Clean the Average_Credit_Balance feature (range codes -> readable labels)
ds['Average_Credit_Balance'] = ds['Average_Credit_Balance'].replace({
    '<100': 'below_100',
    'no known savings': 'no_savings',
    '100<=X<500': 'below_500',
    '500<=X<1000': 'below_1000',
    '>=1000': 'above_1000',
})

# Clean the employment feature: every duration range collapses to 'employed',
# and '<1' is treated as 'unemployed'.
ds['employment'] = ds['employment'].replace({
    '1<=X<4': 'employed',
    '>=7': 'employed',
    '4<=X<7': 'employed',
    '<1': 'unemployed',
})

# personal_status looks like "<gender> <marital status>"; keep only the gender token.
# The marital part is discarded because no clear feature could be derived from it.
gender_col = ds['personal_status'].str.split(pat=' ', n=1).str[0]

# Place gender right after personal_status (position 9 in the raw column order),
# so it ends up between employment and other_parties once the columns below are dropped.
ds.insert(ds.columns.get_loc('personal_status') + 1, 'gender', gender_col)

# Shorten the job labels
ds['job'] = ds['job'].replace({
    'unskilled resident': 'unskilled',
    'high qualif/self emp/mgmt': 'self_employed',
    'unemp/unskilled non res': 'unemployed',
})

# Rename columns
ds = ds.rename(columns={'cc_age': 'age'})

# Drop the columns that are no longer needed after cleaning
ds = ds.drop(columns=['personal_status', 'location'])
ds

Unnamed: 0,over_draft,credit_usage,credit_history,purpose,current_balance,Average_Credit_Balance,employment,gender,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,no,6,critical,radio/tv,1169,no_savings,employed,male,none,4,real estate,67,none,own,2,skilled,1,yes,yes,good
1,no,48,all paid,radio/tv,5951,below_100,employed,female,none,2,real estate,22,none,own,1,skilled,1,none,yes,bad
2,unchecked,12,critical,education,2096,below_100,employed,male,none,3,real estate,49,none,own,1,unskilled,2,none,yes,good
3,no,42,all paid,furniture/equipment,7882,below_100,employed,male,guarantor,4,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,no,24,delayed,new car,4870,below_100,employed,male,none,4,no known property,53,none,for free,2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,unchecked,12,all paid,furniture/equipment,1736,below_100,employed,female,none,4,real estate,31,none,own,1,unskilled,1,none,yes,good
996,no,30,all paid,used car,3857,below_100,employed,male,none,4,life insurance,40,none,own,1,self_employed,1,yes,yes,good
997,unchecked,12,all paid,radio/tv,804,below_100,employed,male,none,4,car,38,none,own,1,skilled,1,none,yes,good
998,no,45,all paid,radio/tv,1845,below_100,employed,male,none,4,no known property,23,none,for free,1,skilled,1,yes,yes,bad


In [6]:
# Summary statistics for the numeric columns of the cleaned frame.
ds.describe()

Unnamed: 0,credit_usage,current_balance,residence_since,age,existing_credits,num_dependents
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.845,35.546,1.407,1.155
std,12.058814,2822.736876,1.103718,11.375469,0.577654,0.362086
min,4.0,250.0,1.0,19.0,1.0,1.0
25%,12.0,1365.5,2.0,27.0,1.0,1.0
50%,18.0,2319.5,3.0,33.0,1.0,1.0
75%,24.0,3972.25,4.0,42.0,2.0,1.0
max,72.0,18424.0,4.0,75.0,4.0,2.0


### Explanatory Analysis

###### UniVariate Analysis

In [54]:
# Histogram of the raw credit_history categories, most frequent first,
# styled with the notebook's dark theme.
dc = (
    px.histogram(
        data,
        x='credit_history',
        title='Customers Credit History',
        height=450,
        color_discrete_sequence=['#03DAC5'],
    )
    .update_xaxes(categoryorder='total descending')
    .update_layout(
        margin=dict(t=70, b=10, l=70, r=40),
        xaxis_title=' Credit History ',
        yaxis_title=" Counts ",
        plot_bgcolor='#2d3035',
        paper_bgcolor='#2d3035',
        title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
        font=dict(color='#8a8d93'),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
)
dc

Here we see that most of the customers have existing loans that are paid. <br>
Fewer than 100 customers have previously delayed a loan. <br>
Very few have all their loans paid. <br>
Fewer than 50 have no credits at all.

In [55]:
# Histogram of loan purposes from the raw frame, most frequent first,
# styled with the notebook's dark theme. Axis titles are intentionally blank.
pup = (
    px.histogram(
        data,
        x='purpose',
        title='Purpose for Collection of Loan',
        height=400,
        color_discrete_sequence=['#03DAC5'],
    )
    .update_xaxes(categoryorder='total descending')
    .update_layout(
        margin=dict(t=70, b=10, l=70, r=40),
        hovermode="x unified",
        xaxis_tickangle=360,
        xaxis_title=' ',
        yaxis_title=" ",
        plot_bgcolor='#2d3035',
        paper_bgcolor='#2d3035',
        title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
        font=dict(color='#8a8d93'),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
)
pup

In [86]:
# Bin current_balance into ranges and plot how many customers fall in each range.
# pd.cut uses right-inclusive intervals by default, so e.g. a balance of exactly 500
# lands in the '1 - 500' bin.
balance_bins = [1, 500, 1000, 3000, 5000, 8000, 10000, 13000, 15000, 18500]
balance_labels = ['1 - 500', '500 - 1000', '1000-3000', '3000-5000', '5000-8000', '8000-10000',
                  '10000-13000', '13000-15000', '15000-18500']

# Keep the binned column on a copy so the raw frame is not modified.
dz = data.copy()
dz['binned'] = pd.cut(x=dz['current_balance'], bins=balance_bins, labels=balance_labels)

# Plot from `dz` (the frame that actually holds the 'binned' column), most populated bin first.
pup = px.histogram(dz, x='binned',
                   title='Current Balance of Customers', height=400,
                   color_discrete_sequence=['#03DAC5'],
                   )
pup.update_xaxes(categoryorder='total descending')

pup.update_layout(margin=dict(t=70, b=10, l=70, r=40),
                  hovermode="x unified",
                  xaxis_tickangle=360,
                  xaxis_title='Balance', yaxis_title="Count ",
                  plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                  font=dict(color='#9a8d93'),
                  legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
pup

Very few customers on the list have a high balance. <br>
Most customers have a balance between 1000 and 3000. <br>
Very few customers on the list have an extremely low balance.

In [106]:
# Distribution of the cleaned employment status (employed / unemployed).
pup = px.histogram(ds, x='employment',
                   title='Employment Status', height=300, width=350)
pup

In [113]:
# Distribution of the property_magnitude categories in the cleaned frame.
pup = px.histogram(ds, x='property_magnitude',
                   title='Property Magnitude', height=400, width=350)
pup

More customers invest in cars than in any other type of property, followed by real estate.

In [117]:
# Distribution of the cleaned job categories.
pup = px.histogram(ds, x='job',
                   title='Job Category', height=400, width=450)
pup

The largest number of customers are skilled workers, followed by unskilled workers, the self-employed, and then the unemployed.

In [118]:
# List the raw column names for reference.
data.columns

Index(['over_draft', 'credit_usage', 'credit_history', 'purpose',
       'current_balance', 'Average_Credit_Balance', 'employment', 'location',
       'personal_status', 'other_parties', 'residence_since',
       'property_magnitude', 'cc_age', 'other_payment_plans', 'housing',
       'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker', 'class'],
      dtype='object')

##### BiVariate Analysis

In [120]:
# Count how many customers fall into each housing category.
ds['housing'].value_counts()

own         713
rent        179
for free    108
Name: housing, dtype: int64