In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import preprocessing
from sklearn import datasets

In [2]:
#Read the data set
data=pd.read_csv("Fraud_check.csv")
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


# EDA

In [3]:
data.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban'],
      dtype='object')

In [4]:
data.shape

(600, 6)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [6]:
data.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [7]:
data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [8]:
data.head(3)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES


In [9]:
data.tail(3)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO
599,NO,Divorced,96519,158137,16,NO


In [10]:
data.sample(1)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
309,YES,Single,31521,123761,20,YES


In [11]:
import warnings
warnings.filterwarnings("ignore")

# Data preprocessing

In [12]:
data=data.rename({'Marital.Status':'maritalstatus','Taxable.Income':'taxableincome','City.Population':'citypopulation','Work.Experience':'workexp','Undergrad':'undergrad','Urban':'urban'},axis=1)
data.columns

Index(['undergrad', 'maritalstatus', 'taxableincome', 'citypopulation',
       'workexp', 'urban'],
      dtype='object')

In [13]:
data['maritalstatus'].value_counts()

Single      217
Married     194
Divorced    189
Name: maritalstatus, dtype: int64

In [14]:
data['undergrad'].value_counts()

YES    312
NO     288
Name: undergrad, dtype: int64

In [15]:
data['urban'].value_counts()

YES    302
NO     298
Name: urban, dtype: int64

In [17]:
data=pd.get_dummies(data)
data.head(2)

Unnamed: 0,taxableincome,citypopulation,workexp,undergrad_NO,undergrad_YES,maritalstatus_Divorced,maritalstatus_Married,maritalstatus_Single,urban_NO,urban_YES
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1


In [19]:
data['taxable_category'] = pd.cut(x = data['taxableincome'], bins = [10002,30000,99620], labels = ['Risky', 'Good'])
data

Unnamed: 0,taxableincome,citypopulation,workexp,undergrad_NO,undergrad_YES,maritalstatus_Divorced,maritalstatus_Married,maritalstatus_Single,urban_NO,urban_YES,taxable_category
0,68833,50047,10,1,0,0,0,1,0,1,Good
1,33700,134075,18,0,1,1,0,0,0,1,Good
2,36925,160205,30,1,0,0,1,0,0,1,Good
3,50190,193264,15,0,1,0,0,1,0,1,Good
4,81002,27533,28,1,0,0,1,0,1,0,Good
...,...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,0,1,1,0,0,0,1,Good
596,69967,55369,2,0,1,1,0,0,0,1,Good
597,47334,154058,0,1,0,1,0,0,0,1,Good
598,98592,180083,17,0,1,0,1,0,1,0,Good


In [20]:
data['taxable_category'].value_counts()

Good     476
Risky    124
Name: taxable_category, dtype: int64

In [22]:
df = data.drop('taxableincome', axis = 1)
df

Unnamed: 0,citypopulation,workexp,undergrad_NO,undergrad_YES,maritalstatus_Divorced,maritalstatus_Married,maritalstatus_Single,urban_NO,urban_YES,taxable_category
0,50047,10,1,0,0,0,1,0,1,Good
1,134075,18,0,1,1,0,0,0,1,Good
2,160205,30,1,0,0,1,0,0,1,Good
3,193264,15,0,1,0,0,1,0,1,Good
4,27533,28,1,0,0,1,0,1,0,Good
...,...,...,...,...,...,...,...,...,...,...
595,39492,7,0,1,1,0,0,0,1,Good
596,55369,2,0,1,1,0,0,0,1,Good
597,154058,0,1,0,1,0,0,0,1,Good
598,180083,17,0,1,0,1,0,1,0,Good


In [23]:
x = df.drop('taxable_category', axis = 1)
y = df['taxable_category']

In [24]:
x

Unnamed: 0,citypopulation,workexp,undergrad_NO,undergrad_YES,maritalstatus_Divorced,maritalstatus_Married,maritalstatus_Single,urban_NO,urban_YES
0,50047,10,1,0,0,0,1,0,1
1,134075,18,0,1,1,0,0,0,1
2,160205,30,1,0,0,1,0,0,1
3,193264,15,0,1,0,0,1,0,1
4,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
595,39492,7,0,1,1,0,0,0,1
596,55369,2,0,1,1,0,0,0,1
597,154058,0,1,0,1,0,0,0,1
598,180083,17,0,1,0,1,0,1,0


In [25]:
y

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: taxable_category, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

# Test Train splitting

In [26]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [27]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(402, 9)
(198, 9)
(402,)
(198,)


# Random Forest Classification

# Bagging and Boosting Classification

In [28]:
from sklearn.ensemble import  AdaBoostClassifier,BaggingClassifier,RandomForestClassifier

In [36]:
models=[RandomForestClassifier(n_estimators=200,max_samples=0.8),
        BaggingClassifier(n_estimators=200,max_samples=0.8),
       AdaBoostClassifier(n_estimators=200,random_state=42),
       ]

In [37]:
for model in models:
    model.fit (x_train,y_train)
    pred=model.predict(x_test)
    print(model)
    print("===="*20)
    print(classification_report(y_test,pred))

RandomForestClassifier(max_samples=0.8, n_estimators=200)
              precision    recall  f1-score   support

        Good       0.78      0.93      0.85       157
       Risky       0.08      0.02      0.04        41

    accuracy                           0.74       198
   macro avg       0.43      0.48      0.44       198
weighted avg       0.64      0.74      0.68       198

BaggingClassifier(max_samples=0.8, n_estimators=200)
              precision    recall  f1-score   support

        Good       0.78      0.95      0.86       157
       Risky       0.00      0.00      0.00        41

    accuracy                           0.75       198
   macro avg       0.39      0.47      0.43       198
weighted avg       0.62      0.75      0.68       198

AdaBoostClassifier(n_estimators=200, random_state=42)
              precision    recall  f1-score   support

        Good       0.79      0.96      0.87       157
       Risky       0.22      0.05      0.08        41

    accuracy     