# Credit Card Fraud Detection

## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# modeling 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, auc, roc_curve, \
    precision_recall_curve, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [3]:
# Load the raw dataset; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv('Credit_card.csv')
# Preview the first rows to sanity-check column names and value formats.
data.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2


In [4]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ind_ID           1548 non-null   int64  
 1   GENDER           1541 non-null   object 
 2   Car_Owner        1548 non-null   object 
 3   Propert_Owner    1548 non-null   object 
 4   CHILDREN         1548 non-null   int64  
 5   Annual_income    1525 non-null   float64
 6   Type_Income      1548 non-null   object 
 7   EDUCATION        1548 non-null   object 
 8   Marital_status   1548 non-null   object 
 9   Housing_type     1548 non-null   object 
 10  Birthday_count   1526 non-null   float64
 11  Employed_days    1548 non-null   int64  
 12  Mobile_phone     1548 non-null   int64  
 13  Work_Phone       1548 non-null   int64  
 14  Phone            1548 non-null   int64  
 15  EMAIL_ID         1548 non-null   int64  
 16  Type_Occupation  1060 non-null   object 
 17  Family_Members

In [5]:
# Summary statistics for every column (include='all' covers the object columns
# too); transposed so each row of the output describes one column of the data.
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Ind_ID,1548.0,,,,5078920.351421,41717.587742,5008827.0,5045069.75,5078841.5,5115673.0,5150412.0
GENDER,1541.0,2.0,F,973.0,,,,,,,
Car_Owner,1548.0,2.0,N,924.0,,,,,,,
Propert_Owner,1548.0,2.0,Y,1010.0,,,,,,,
CHILDREN,1548.0,,,,0.412791,0.776691,0.0,0.0,0.0,1.0,14.0
Annual_income,1525.0,,,,191399.32623,113252.997656,33750.0,121500.0,166500.0,225000.0,1575000.0
Type_Income,1548.0,4.0,Working,798.0,,,,,,,
EDUCATION,1548.0,5.0,Secondary / secondary special,1031.0,,,,,,,
Marital_status,1548.0,5.0,Married,1049.0,,,,,,,
Housing_type,1548.0,6.0,House / apartment,1380.0,,,,,,,


In [6]:
# Number of missing values in each column.
data.isnull().sum()

Ind_ID               0
GENDER               7
Car_Owner            0
Propert_Owner        0
CHILDREN             0
Annual_income       23
Type_Income          0
EDUCATION            0
Marital_status       0
Housing_type         0
Birthday_count      22
Employed_days        0
Mobile_phone         0
Work_Phone           0
Phone                0
EMAIL_ID             0
Type_Occupation    488
Family_Members       0
dtype: int64

## Exploratory Data Analysis

### Defining functions

In [12]:
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=50):
    """
    Draw a boxplot above a histogram for one column, on a shared x-axis.

    data : dataframe
    feature : name of the dataframe column to plot
    figsize : size of figure
    kde : whether to overlay a kernel density estimate on the histogram
    bins : number of histogram bins
    """
    # Two stacked panels sharing the x-axis: the upper quarter of the figure
    # holds the boxplot, the lower three quarters hold the histogram.
    panel_heights = {"height_ratios": (0.25, 0.75)}
    _, (box_axis, hist_axis) = plt.subplots(
        nrows=2, sharex=True, figsize=figsize, dpi=100, gridspec_kw=panel_heights
    )

    # showmeans=True adds a marker for the arithmetic mean to the box.
    sns.boxplot(data=data, x=feature, width=0.4, showmeans=True, color='red', ax=box_axis)

    sns.histplot(data=data, x=feature, kde=kde, bins=bins, ax=hist_axis)

    # Vertical reference lines on the histogram:
    # dashed green marks the mean, solid black marks the median.
    column = data[feature]
    reference_lines = (
        (column.mean, 'green', '--'),
        (column.median, 'black', '-'),
    )
    for statistic, line_color, line_style in reference_lines:
        hist_axis.axvline(statistic(), color=line_color, linestyle=line_style)

In [13]:
# function to create labeled barplots

def labeled_barplot(data, feature, perc=False, n=None):
    """
    Draw a count plot of one column and annotate every bar with its value.

    data : dataframe
    feature : dataframe column
    perc : True to label bars with percentages, False to label them with counts
    n : how many of the most frequent categories to show (None shows all)

    Percentages are computed against the number of rows in the column
    (``len``), so rows with a missing value still count in the denominator.
    """
    total = len(data[feature])

    # Figure width scales with the number of bars drawn, plus a fixed margin.
    n_bars = data[feature].nunique() if n is None else n
    plt.figure(figsize=(n_bars + 2, 6), dpi=100)

    # Bars are ordered from most to least frequent; [:n] keeps the top n
    # categories (all of them when n is None).
    ax = sns.countplot(data=data, x=feature,
                       palette='Paired',
                       order=data[feature].value_counts().index[:n]
                       )

    for p in ax.patches:
        height = p.get_height()

        if perc:
            label = f"{height / total * 100:.2f}%\n"
        else:
            # Bar heights may come back as floats; format without decimals so
            # a count reads "973" rather than "973.0".
            label = f"{height:.0f}"

        # Anchor the text at the top-centre of the bar, nudged up 5 points.
        x = p.get_x() + p.get_width() / 2
        y = height

        ax.annotate(label, (x, y), ha='center', va='center', fontsize=11, color='black', xytext=(0, 5),
                    textcoords='offset points')

    plt.xlabel(feature, fontsize=14, weight='bold')
    plt.ylabel('Count', fontsize=14, weight='bold')

    plt.show()

In [14]:
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data : dataframe
    predictor : independent variable
    target : target variable
    """
    # Number of distinct predictor categories; drives the figure width below.
    count = data[predictor].nunique()

    # Least frequent target class: value_counts() sorts descending by default,
    # so it is the last index entry. Both tables are ordered by this column.
    sorter = data[target].value_counts().index[-1]

    # Raw counts of predictor vs. target, including the "All" margins.
    tab_1 = pd.crosstab(data[predictor],
                        data[target],
                        margins=True).sort_values(by=sorter,
                                                  ascending=False)

    print(tab_1)
    print('-' * 60)

    # Same cross-tabulation normalised per predictor category (each row sums
    # to 1), so every bar shows the share of each target class.
    tab = pd.crosstab(data[predictor],
                      data[target],
                      normalize='index').sort_values(by=sorter,
                                                     ascending=False)

    _, ax = plt.subplots(figsize=(count + 5, 5), dpi=100)

    tab.plot(kind="bar", stacked=True, ax=ax)

    # Anchor the legend's upper-left corner at the axes' top-right corner so
    # it sits outside the plot area and does not cover the bars.
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=0, fontsize=12)

    plt.show()

In [15]:
# function to plot distribution of the data

def distribution_plot_wrt_target(data, predictor, target):
    """
    Compare the distribution of a predictor across target values on a 2x2 grid.

    data : dataframe
    predictor : column whose distribution is plotted
    target : column holding the target values

    Top row: density histograms (with KDE) of the predictor for the first and
    second values returned by ``data[target].unique()``.
    Bottom row: boxplots of the predictor per target value, with and without
    outlier points.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10), dpi=100)

    classes = data[target].unique()

    # Top row: one histogram per target value, left then right.
    for col_idx, shade in enumerate(('teal', 'orange')):
        panel = axes[0, col_idx]
        panel.set_title('Distribution of target for target =' + str(classes[col_idx]))
        subset = data[data[target] == classes[col_idx]]
        sns.histplot(data=subset, x=predictor, kde=True, ax=panel, color=shade, stat='density')

    # Bottom row: the same boxplot twice; the right-hand one hides the fliers.
    bottom_panels = (
        ('Boxplot w.r.t target', {}),
        ('Boxplot w.r.t target without outliers', {'showfliers': False}),
    )
    for col_idx, (title, extra_options) in enumerate(bottom_panels):
        panel = axes[1, col_idx]
        panel.set_title(title)
        sns.boxplot(data=data, x=target, y=predictor, ax=panel, palette='gist_rainbow', **extra_options)

    plt.tight_layout()
    plt.show()


### Univariate Analysis