In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
##Load data
# Read the fraud-check CSV (path is relative to the working directory) and
# display the resulting frame as the cell output.
source_csv='Fraud_check.csv'
data=pd.read_csv(source_csv)
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
##Transform taxable income into a categorical column
# Rows at or below the threshold are labelled 'Risky', rows above it 'Good'.
risky_income_threshold=30000
taxable_income=data['Taxable.Income']
data.loc[taxable_income>risky_income_threshold,'Income']='Good'
data.loc[taxable_income<=risky_income_threshold,'Income']='Risky'

In [4]:
# Remove the cap on displayed rows so frames are rendered in full.
pd.options.display.max_rows=None
# new_data is bound to the same DataFrame object as data (no copy is made here).
new_data=data
new_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Income
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
5,NO,Divorced,33329,116382,0,NO,Good
6,NO,Divorced,83357,80890,8,YES,Good
7,YES,Single,62774,131253,3,YES,Good
8,NO,Single,83519,102481,12,YES,Good
9,YES,Divorced,98152,155482,4,YES,Good


In [5]:
##Drop the Taxable.Income column
# Derive the result from `data` rather than from `new_data` itself: the earlier
# cell binds new_data to data, so the outcome is the same, but this cell can now
# be executed repeatedly without a KeyError for an already-removed column.
new_data=data.drop(columns=['Taxable.Income'])
new_data

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Income
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good
5,NO,Divorced,116382,0,NO,Good
6,NO,Divorced,80890,8,YES,Good
7,YES,Single,131253,3,YES,Good
8,NO,Single,102481,12,YES,Good
9,YES,Divorced,155482,4,YES,Good


In [6]:
##Encode the Undergrad, Marital.Status and Urban columns using LabelEncoder
columns=['Undergrad','Marital.Status','Urban']
# Keep the encoder instance under its own name. Binding it to `LabelEncoder`
# would overwrite the imported class, making this cell fail on a second run.
label_encoder=LabelEncoder()
for column in columns:
    # The encoder is re-fitted for every column, so each column gets its own
    # integer codes.
    new_data[column]=label_encoder.fit_transform(new_data[column])

new_data

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Income
0,0,2,50047,10,1,Good
1,1,0,134075,18,1,Good
2,0,1,160205,30,1,Good
3,1,2,193264,15,1,Good
4,0,1,27533,28,0,Good
5,0,0,116382,0,0,Good
6,0,0,80890,8,1,Good
7,1,2,131253,3,1,Good
8,0,2,102481,12,1,Good
9,1,0,155482,4,1,Good


In [7]:
# Frequency of every code in each encoded column, shown as a tuple of Series.
tuple(new_data.value_counts(name) for name in ('Undergrad','Marital.Status','Urban'))

(Undergrad
 1    312
 0    288
 dtype: int64,
 Marital.Status
 2    217
 1    194
 0    189
 dtype: int64,
 Urban
 1    302
 0    298
 dtype: int64)

In [8]:
##Standardise the population and work experience columns
scale=StandardScaler()
# Positional slice: columns 2 and 3 of new_data are fed to the scaler.
numeric_block=new_data.iloc[:,2:4]
# Wrap the scaled array in a DataFrame with short column names.
data_norm=pd.DataFrame(scale.fit_transform(numeric_block),columns=['Population','Experience'])
data_norm

Unnamed: 0,Population,Experience
0,-1.178521,-0.629143
1,0.5085,0.27637
2,1.033109,1.634639
3,1.696831,-0.063197
4,-1.630532,1.408261
5,0.15328,-1.761033
6,-0.559289,-0.855521
7,0.451843,-1.421466
8,-0.125809,-0.402764
9,0.938286,-1.308277


In [9]:
# Overwrite the raw numeric columns of new_data with their standardised
# counterparts from data_norm.
for target,source in (('City.Population','Population'),('Work.Experience','Experience')):
    new_data[target]=data_norm[source]
new_data

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Income
0,0,2,-1.178521,-0.629143,1,Good
1,1,0,0.5085,0.27637,1,Good
2,0,1,1.033109,1.634639,1,Good
3,1,2,1.696831,-0.063197,1,Good
4,0,1,-1.630532,1.408261,0,Good
5,0,0,0.15328,-1.761033,0,Good
6,0,0,-0.559289,-0.855521,1,Good
7,1,2,0.451843,-1.421466,1,Good
8,0,2,-0.125809,-0.402764,1,Good
9,1,0,0.938286,-1.308277,1,Good


In [10]:
##Split the frame into features and target
# x takes the first five columns; y takes the column at position 5.
x,y=new_data.iloc[:,:5],new_data.iloc[:,5]

In [11]:
##Set up 100-fold cross-validation and a random forest classifier
# NOTE(review): the value counts above total 600 rows, so 100 folds leaves
# roughly 6 rows per test fold - confirm this split size is intended.
kfold=KFold(n_splits=100,shuffle=True,random_state=42)
num_trees=100
max_features=3
# Seed the forest as well as the fold shuffling; without random_state the
# fitted trees, predictions and scores change from run to run.
Randomclf=RandomForestClassifier(n_estimators=num_trees,max_features=max_features,random_state=42)

In [12]:
# Train the forest on the full feature matrix and target.
Randomclf.fit(X=x,y=y)

In [13]:
# Predict on the same rows the model was fitted on (in-sample predictions).
Randomclf.predict(X=x)

array(['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Risky', 'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good',
       'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good'

In [None]:
cross_val_score=cross_val_score(Randomclf,x,y,cv=kfold)
cross_val_score

In [None]:
accuracy=print(cross_val_score.mean())
accuracy