In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

In [3]:
rf=pd.read_csv("Fraud_check.csv")

In [4]:
rf

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


# EDA

In [5]:
rf.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [7]:
rf[rf['Taxable.Income']<= 30000]

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
10,NO,Single,29732,102602,19,YES
12,NO,Divorced,11794,148033,14,YES
16,YES,Divorced,24987,34551,29,NO
19,NO,Single,10987,126953,30,YES
21,YES,Divorced,14310,29106,7,YES
...,...,...,...,...,...,...
544,NO,Single,29916,133877,21,YES
557,YES,Married,12810,157421,10,NO
567,YES,Single,16316,129912,30,NO
575,NO,Divorced,10735,95975,13,YES


# Creating new column called Fraud_Check 

In [10]:
group_names = ['Risky', 'Good']
rf['Fraud_Check']=pd.cut(rf['Taxable.Income'], bins = [0,30000,1000000], labels = group_names)

In [12]:
for i in range(0,len(rf['Taxable.Income'])):
    if rf['Taxable.Income'][i]<=30000:
        rf['Fraud_Check'][i] = 'Risky'
    else:
        rf['Fraud_Check'][i] = 'Good'

In [13]:
rf.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Fraud_Check
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


In [15]:
rf["Taxable.Income"].describe()

count      600.000000
mean     55208.375000
std      26204.827597
min      10003.000000
25%      32871.500000
50%      55074.500000
75%      78611.750000
max      99619.000000
Name: Taxable.Income, dtype: float64

In [16]:
rf['Fraud_Check'].value_counts()

Good     476
Risky    124
Name: Fraud_Check, dtype: int64

In [18]:
rf["Fraud_Check"]=rf["Fraud_Check"].replace(['Risky','Good'],[0,1])
rf['Urban']=rf['Urban'].replace(['YES','NO'],[1,0])
rf['Undergrad']=rf['Undergrad'].replace(['YES','NO'],[1,0])
rf["Marital.Status"]=rf["Marital.Status"].replace(['Single','Divorced','Married'],[0,1,2])

In [19]:
rf

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Fraud_Check
0,0,0,68833,50047,10,1,1
1,1,1,33700,134075,18,1,1
2,0,2,36925,160205,30,1,1
3,1,0,50190,193264,15,1,1
4,0,2,81002,27533,28,0,1
...,...,...,...,...,...,...,...
595,1,1,76340,39492,7,1,1
596,1,1,69967,55369,2,1,1
597,0,1,47334,154058,0,1,1
598,1,2,98592,180083,17,0,1


In [20]:
rf['Fraud_Check'].value_counts()

1    476
0    124
Name: Fraud_Check, dtype: int64

In [21]:
rf['Urban'].value_counts()

1    302
0    298
Name: Urban, dtype: int64

In [23]:
rf['Marital.Status'].value_counts()

0    217
2    194
1    189
Name: Marital.Status, dtype: int64

# Splitting data 

In [24]:
rf.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Fraud_Check
0,0,0,68833,50047,10,1,1
1,1,1,33700,134075,18,1,1
2,0,2,36925,160205,30,1,1
3,1,0,50190,193264,15,1,1
4,0,2,81002,27533,28,0,1


In [25]:
rf.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban', 'Fraud_Check'],
      dtype='object')

In [28]:
rf1=rf[['Undergrad', 'Marital.Status', 'City.Population',
       'Work.Experience', 'Urban','Fraud_Check']]

In [29]:
rf1.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Fraud_Check
0,0,0,50047,10,1,1
1,1,1,134075,18,1,1
2,0,2,160205,30,1,1
3,1,0,193264,15,1,1
4,0,2,27533,28,0,1


In [30]:
X=rf1.iloc[:,:-1]
Y=rf1.iloc[:,-1]

In [31]:
X.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,0,0,50047,10,1
1,1,1,134075,18,1
2,0,2,160205,30,1
3,1,0,193264,15,1
4,0,2,27533,28,0


In [32]:
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: Fraud_Check, dtype: category
Categories (2, int64): [0 < 1]

In [33]:
rf1.columns

Index(['Undergrad', 'Marital.Status', 'City.Population', 'Work.Experience',
       'Urban', 'Fraud_Check'],
      dtype='object')

# Splitting data into train and test data

In [34]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=10)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((480, 5), (120, 5), (480,), (120,))

In [35]:
x_train

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
284,0,1,188383,22,1
423,0,2,80249,19,1
525,1,0,75307,23,0
519,1,0,162102,25,1
12,0,1,148033,14,1
...,...,...,...,...,...
369,0,2,89122,28,0
320,1,0,78897,11,1
527,0,2,180274,27,0
125,1,1,155669,11,0


In [36]:
y_train

284    0
423    1
525    1
519    1
12     0
      ..
369    1
320    1
527    1
125    1
265    0
Name: Fraud_Check, Length: 480, dtype: category
Categories (2, int64): [0 < 1]

# Doing Grid Search CV To find Best Hyperparameter

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
model=RandomForestClassifier()
parameters={
    
    "n_estimators":[130, 140, 150, 160],
    "max_features":[2,3,4,5]
}

In [39]:
Grid_DT=GridSearchCV(model,parameters,cv=10)

In [40]:
Grid_DT.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_features': [2, 3, 4, 5],
                         'n_estimators': [130, 140, 150, 160]})

In [41]:
Grid_DT.best_score_

0.7666666666666666

In [42]:
Grid_DT.best_params_

{'max_features': 3, 'n_estimators': 130}

In [43]:
final_model=RandomForestClassifier(max_features=2, n_estimators=130)

In [44]:
final_model.fit(x_train,y_train)

RandomForestClassifier(max_features=2, n_estimators=130)

In [45]:
print(classification_report(y_test,final_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.76      0.89      0.82        94

    accuracy                           0.70       120
   macro avg       0.38      0.45      0.41       120
weighted avg       0.60      0.70      0.65       120



In [46]:
final_model.feature_importances_

array([0.03879484, 0.06587816, 0.53218403, 0.32608951, 0.03705346])