Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [13]:
#import libraries

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize']=(10,5)
plt.rcParams['figure.dpi'] = 250
sns.set_theme(style='darkgrid', palette='rainbow')
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import r2_score,mean_squared_error,classification_report,accuracy_score

In [3]:
#Read data

df = pd.read_csv('Fraud_check (1).csv')
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [4]:
#Using get dummies

df = pd.get_dummies(df, columns=['Undergrad','Marital.Status','Urban'], drop_first=True)


In [5]:
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,68833,50047,10,0,0,1,1
1,33700,134075,18,1,0,0,1
2,36925,160205,30,0,1,0,1
3,50190,193264,15,1,0,1,1
4,81002,27533,28,0,1,0,0
...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1
596,69967,55369,2,1,0,0,1
597,47334,154058,0,0,0,0,1
598,98592,180083,17,1,1,0,0


In [6]:
#Treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

df['income'] ='>=30000'
df.loc[df['Taxable.Income'] >=30000, 'income'] = 'Good'
df.loc[df['Taxable.Income'] <=30000, 'income'] = 'Risky'

In [7]:
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,income
0,68833,50047,10,0,0,1,1,Good
1,33700,134075,18,1,0,0,1,Good
2,36925,160205,30,0,1,0,1,Good
3,50190,193264,15,1,0,1,1,Good
4,81002,27533,28,0,1,0,0,Good
...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,Good
596,69967,55369,2,1,0,0,1,Good
597,47334,154058,0,0,0,0,1,Good
598,98592,180083,17,1,1,0,0,Good


In [8]:
#Separate feature and target column

x = df.drop('income', axis=1)
y = df['income']

In [9]:
#Splitting data into training and testing data
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=42)
xtrain,ytrain

(     Taxable.Income  City.Population  Work.Experience  Undergrad_YES  \
 145           33962           197422                2              0   
 9             98152           155482                4              1   
 375           41184           130935                2              0   
 523           50286            42198               12              0   
 188           28707           137569               10              0   
 ..              ...              ...              ...            ...   
 71            94287           105680               22              0   
 106           19169            58535               20              1   
 270           30468           130680                5              0   
 435           91547           111774                4              0   
 102           32662            91488               23              1   
 
      Marital.Status_Married  Marital.Status_Single  Urban_YES  
 145                       0                      0      

In [10]:
model = RandomForestClassifier()
model.fit(xtrain,ytrain)
ypred = model.predict(xtest)

In [15]:
#calculate accuarcy 

print(f"Accuracy", accuracy)
print(classification_report(ytest,ypred))

Accuracy 1.0
              precision    recall  f1-score   support

        Good       1.00      1.00      1.00        94
       Risky       1.00      1.00      1.00        26

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

