In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the fraud-check dataset from a CSV file (path is relative to the
# notebook's working directory) and preview the first five rows.
df = pd.read_csv('Fraud_check (1).csv')
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


## Exploratory Data Analysis (EDA)

In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [4]:
# (rows, columns) of the raw frame.
df.shape

(600, 6)

In [5]:
# Column dtypes and non-null counts, used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [6]:
# Show any rows that are exact duplicates of an earlier row.
df[df.duplicated()]

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban


In [7]:
# Number of fully duplicated rows.
df.duplicated().sum()

0

## Encoding

In [8]:
import category_encoders as ce

In [9]:
# Encoder that maps each category of the three listed string columns to an integer code.
# NOTE(review): Marital.Status looks nominal (Single/Divorced/Married); integer
# codes impose an ordering that distance- and linear-based models may
# interpret as meaningful — consider one-hot encoding. TODO confirm intent.
Ordinal = ce.OrdinalEncoder(cols = ['Undergrad','Marital.Status','Urban'])
Ordinal

In [10]:
# Fit the encoder on the whole frame and store the encoded result as df1.
df1 = Ordinal.fit_transform(df)

In [11]:
# Preview the encoded frame.
df1.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,1,1,68833,50047,10,1
1,2,2,33700,134075,18,1
2,1,3,36925,160205,30,1
3,2,1,50190,193264,15,1
4,1,3,81002,27533,28,2


In [12]:
# Derive the binary target from taxable income: incomes at or below the
# threshold are labelled "Risky", everything above it "Good".
RISKY_INCOME_THRESHOLD = 30000

# Vectorised labelling replaces the row-by-row Python loop; .tolist() keeps
# Taxable_Income a plain list of str, exactly as the loop produced.
Taxable_Income = np.where(
    df1["Taxable.Income"] <= RISKY_INCOME_THRESHOLD, "Risky", "Good"
).tolist()

df1["Taxable_Income"] = Taxable_Income

In [13]:
# Preview the frame with the new Taxable_Income label column appended.
df1.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Taxable_Income
0,1,1,68833,50047,10,1,Good
1,2,2,33700,134075,18,1,Good
2,1,3,36925,160205,30,1,Good
3,2,1,50190,193264,15,1,Good
4,1,3,81002,27533,28,2,Good


In [14]:
# Remove the raw income column: the Taxable_Income label was derived from it,
# so keeping it as a feature would leak the target to the models.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (without it a second run raises KeyError).
df1 = df1.drop(columns='Taxable.Income', errors='ignore')
df1.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Taxable_Income
0,1,1,50047,10,1,Good
1,2,2,134075,18,1,Good
2,1,3,160205,30,1,Good
3,2,1,193264,15,1,Good
4,1,3,27533,28,2,Good


In [15]:
# Distinct labels present in the target column.
df1['Taxable_Income'].unique()

array(['Good', 'Risky'], dtype=object)

In [16]:
# Number of rows per target label (shows the class balance).
df1['Taxable_Income'].value_counts()

Good     476
Risky    124
Name: Taxable_Income, dtype: int64

In [17]:
# Features are every column except the target; the target is selected by name
# so the split does not depend on Taxable_Income being the last column.
x = df1.drop(columns='Taxable_Income')
y = df1['Taxable_Income']

## Splitting the data

In [18]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [19]:
#Individual Models

# Baseline: logistic regression with default settings, trained on the encoded
# features as-is (no scaling step is applied before this point).
lg = LogisticRegression().fit(xtrain, ytrain)
ypred = lg.predict(xtest)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

        Good       0.81      1.00      0.89        97
       Risky       0.00      0.00      0.00        23

    accuracy                           0.81       120
   macro avg       0.40      0.50      0.45       120
weighted avg       0.65      0.81      0.72       120



In [20]:
# Score of the baseline on the training split, then on the test split.
for features, labels in ((xtrain, ytrain), (xtest, ytest)):
    print(lg.score(features, labels))

0.7895833333333333
0.8083333333333333


In [21]:
#prediction function
def predict(model):
    """Fit ``model`` on the training split and print its evaluation.

    Reads the notebook-level ``xtrain``, ``xtest``, ``ytrain`` and ``ytest``
    variables rather than taking the data as arguments, so the train/test
    split cell must have been run first.

    Prints, in order: the classification report for the test split, the
    training score, and the test score (both via ``model.score``).
    Returns ``None``.
    """
    fitted = model.fit(xtrain, ytrain)
    test_predictions = fitted.predict(xtest)

    print(classification_report(ytest, test_predictions))
    # Training score first, then test score, to make over-fitting visible.
    for features, labels in ((xtrain, ytrain), (xtest, ytest)):
        print(fitted.score(features, labels))

In [22]:
# Same default logistic regression as the manual cell above, now run through the helper.
predict(LogisticRegression())

              precision    recall  f1-score   support

        Good       0.81      1.00      0.89        97
       Risky       0.00      0.00      0.00        23

    accuracy                           0.81       120
   macro avg       0.40      0.50      0.45       120
weighted avg       0.65      0.81      0.72       120

0.7895833333333333
0.8083333333333333


In [23]:
# Decision Tree, KNN, Bagging (Decision Tree / KNN), and Random Forest

In [24]:
# Seeded (same seed as the train/test split) so the tree, and therefore the
# reported scores, are reproducible across kernel restarts.
predict(DecisionTreeClassifier(random_state=1))

              precision    recall  f1-score   support

        Good       0.81      0.70      0.75        97
       Risky       0.19      0.30      0.24        23

    accuracy                           0.62       120
   macro avg       0.50      0.50      0.49       120
weighted avg       0.69      0.62      0.65       120

1.0
0.625


In [25]:
# k-nearest neighbours with default settings.
predict(KNeighborsClassifier())

              precision    recall  f1-score   support

        Good       0.82      0.93      0.87        97
       Risky       0.30      0.13      0.18        23

    accuracy                           0.78       120
   macro avg       0.56      0.53      0.53       120
weighted avg       0.72      0.78      0.74       120

0.8104166666666667
0.775


In [26]:
# Bagged decision trees; the ensemble seed (same as the train/test split)
# fixes the bootstrap samples so results are reproducible across restarts.
predict(BaggingClassifier(DecisionTreeClassifier(), random_state=1))

              precision    recall  f1-score   support

        Good       0.80      0.95      0.87        97
       Risky       0.00      0.00      0.00        23

    accuracy                           0.77       120
   macro avg       0.40      0.47      0.43       120
weighted avg       0.65      0.77      0.70       120

0.9604166666666667
0.7666666666666667


In [27]:
# Bagged k-nearest neighbours; the ensemble seed (same as the train/test
# split) fixes the bootstrap samples so results are reproducible.
predict(BaggingClassifier(KNeighborsClassifier(), random_state=1))

              precision    recall  f1-score   support

        Good       0.82      0.94      0.88        97
       Risky       0.33      0.13      0.19        23

    accuracy                           0.78       120
   macro avg       0.58      0.53      0.53       120
weighted avg       0.73      0.78      0.74       120

0.8083333333333333
0.7833333333333333


In [28]:
# Seeded (same seed as the train/test split) so the forest, and therefore the
# reported scores, are reproducible across kernel restarts.
predict(RandomForestClassifier(random_state=1))

              precision    recall  f1-score   support

        Good       0.80      0.95      0.87        97
       Risky       0.00      0.00      0.00        23

    accuracy                           0.77       120
   macro avg       0.40      0.47      0.43       120
weighted avg       0.65      0.77      0.70       120

1.0
0.7666666666666667
