In [2]:
# Simple data entry and quick data cleaning
import pandas as pd
df = pd.read_csv("fraudTrain.csv")
df_copy = df
df['name'] = df['first'] + ' ' + df['last']
df_fraud = df[df['is_fraud'] == 1]
fraud_rate = len(df_fraud)/len(df)
df = df.drop(columns=['Unnamed: 0','first', 'last','gender', 'street', 'lat', 'long', 'city_pop', 'job', 'dob', 'is_fraud', "merch_lat", 'merch_long'])
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,city,state,zip,trans_num,unix_time,name
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Moravian Falls,NC,28654,0b242abb623afc578575680df30655b9,1325376018,Jennifer Banks
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,99160,1f76529f8574734946361c461b024d99,1325376044,Stephanie Gill
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Malad City,ID,83252,a1a22d70485983eac12b5b88dad1cf95,1325376051,Edward Sanchez
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Boulder,MT,59632,6b849c168bdad6f867558c3793159a81,1325376076,Jeremy White
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Doe Hill,VA,24433,a41d7549acf90789359a9aa5346dcb46,1325376186,Tyler Garcia


In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Remove original 'category' column if it's replaced by encoded version
df = df.drop(columns=['category'])
df

Unnamed: 0,trans_date_trans_time,cc_num,merchant,amt,city,state,zip,trans_num,unix_time,name,category_encoded
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",4.97,Moravian Falls,NC,28654,0b242abb623afc578575680df30655b9,1325376018,Jennifer Banks,8
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",107.23,Orient,WA,99160,1f76529f8574734946361c461b024d99,1325376044,Stephanie Gill,4
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,220.11,Malad City,ID,83252,a1a22d70485983eac12b5b88dad1cf95,1325376051,Edward Sanchez,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",45.00,Boulder,MT,59632,6b849c168bdad6f867558c3793159a81,1325376076,Jeremy White,2
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,41.96,Doe Hill,VA,24433,a41d7549acf90789359a9aa5346dcb46,1325376186,Tyler Garcia,9
...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,15.56,Hatch,UT,84735,440b587732da4dc1a6395aba5fb41669,1371816728,Erik Patterson,0
1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,51.70,Tuscarora,MD,21790,278000d2e0d2277d1de2f890067dcc0a,1371816739,Jeffrey White,1
1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,105.93,High Rolls Mountain Park,NM,88325,483f52fe67fabef353d552c1e662974c,1371816752,Christopher Castaneda,1
1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",74.90,Manderson,SD,57756,d667cdcbadaaed3da3f4020e83591c83,1371816816,Joseph Murray,1


In [4]:
# Data pre-processing, creates a dictionary of dataframes, each dataframe corresponds to a unique user
users = df['name'].unique()

data = { }
for i in users:
    user_subset = df[df['name'] == i]
    data[i] = user_subset

In [5]:
# Isolation Forest model
from sklearn.ensemble import IsolationForest

models = { }

for user, user_dataframe in data.items():
    features = user_dataframe[['category_encoded', 'amt', 'zip']].values
    model = IsolationForest(n_estimators=100, contamination=fraud_rate)
    model.fit(features)

    models[user] = model

In [7]:
# Predictions 

predictions = [ ]
for index, row in df.iterrows():
    row_values = row[['category_encoded', 'amt', 'zip']].values
    name = row['name']
    model = models[name]
    prediction = model.predict([row_values])
    predictions.append(prediction[0])

df_copy['prediction'] = predictions
df_copy

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,name,prediction
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,Jennifer Banks,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,Stephanie Gill,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,Edward Sanchez,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,Jeremy White,1
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,Tyler Garcia,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,...,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0,Erik Patterson,1
1296671,1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.70,Jeffrey,White,M,8617 Holmes Terrace Suite 651,...,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0,Jeffrey White,1
1296672,1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,...,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0,Christopher Castaneda,1
1296673,1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Joseph,Murray,M,42933 Ryan Underpass,...,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.788940,-103.241160,0,Joseph Murray,1


In [15]:
frauds = 0
for index, row in df_copy.iterrows():
    if ((row['is_fraud'] == 1) & (row['prediction'] == -1)):
        frauds += 1
print(frauds)
print(len(df_fraud))

2192
7506


In [46]:
import seaborn as sns
import matplotlib.pyplot as plt

df = df.drop('trans_date_trans_time', axis=1)
column_list = df.columns.tolist()
correlation_matrix = df[column_list].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()


KeyError: "['trans_date_trans_time'] not found in axis"