In [34]:
import pandas as pd

In [35]:
df = pd.read_csv('./dataset_feature_engineering.csv')
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,first_time_at_merchant,dist_between_client_and_merch,trans_month,trans_day,hour,year,times_shopped_at_merchant,times_shopped_at_merchant_year,times_shopped_at_merchant_month,times_shopped_at_merchant_day
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,True,78.773821,1,1,0,2019,5,4,2,1
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,True,30.216618,1,1,0,2019,4,4,1,1
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,True,108.102912,1,1,0,2019,4,3,1,1
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,True,95.685115,1,1,0,2019,1,1,1,1
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,True,77.702395,1,1,0,2019,6,1,1,1


In [36]:
df = df[
	(df['trans_day'] >= 1) &
	(df['trans_month'] >= 1) &
	(df['trans_month'] <= 12)
].copy()

# Create datetime column
df['trans_datetime'] = pd.to_datetime({
	'year': df['year'],
	'month': df['trans_month'],
	'day': df['trans_day'],
	'hour': df['hour']
})

# Sort by datetime
df = df.sort_values(by='trans_datetime').reset_index(drop=True)

# Optional: bring datetime column to the front
cols = df.columns.tolist()
cols.insert(0, cols.pop(cols.index('trans_datetime')))
df = df[cols]
df.head()

Unnamed: 0,trans_datetime,cc_num,merchant,category,amt,first,last,gender,street,city,...,first_time_at_merchant,dist_between_client_and_merch,trans_month,trans_day,hour,year,times_shopped_at_merchant,times_shopped_at_merchant_year,times_shopped_at_merchant_month,times_shopped_at_merchant_day
0,2019-01-01,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,True,78.773821,1,1,0,2019,5,4,2,1
1,2019-01-01,36078114201167,fraud_Murray-Smitham,grocery_pos,159.54,Christopher,Horn,M,956 Sanchez Highway,Mallie,...,True,35.921066,1,1,0,2019,4,2,1,2
2,2019-01-01,3567527758368741,fraud_Beier LLC,entertainment,4.01,Amanda,Vance,F,14601 Downs Skyway Apt. 440,Sterling City,...,True,75.991336,1,1,0,2019,3,2,1,1
3,2019-01-01,4026222041577,"fraud_Huel, Hammes and Witting",grocery_pos,195.22,Debbie,Payne,F,204 Ashley Neck Apt. 169,Preston,...,True,86.277046,1,1,0,2019,1,1,1,1
4,2019-01-01,341546199006537,"fraud_Bins, Balistreri and Beatty",shopping_pos,268.16,Mark,Brown,M,8580 Moore Cove,Wales,...,True,75.943368,1,1,0,2019,2,1,1,1


In [37]:
entertainment_keywords = ['cinema', 'bar', 'concert', 'club', 'theater', 'entertainment', 'event', 'music', 'party', 'art', 'park', 'museum', 'zoo', 'shopping', 'dining', 'video']

In [38]:
df['is_entertainment'] = df['category'].str.lower().fillna('').apply(
	lambda x: any(keyword in x for keyword in entertainment_keywords)
) | df['merchant'].str.lower().fillna('').apply(
	lambda x: any(keyword in x for keyword in entertainment_keywords)
)

entertainment_df = df[df['is_entertainment']]
entertainment_df.head()

Unnamed: 0,trans_datetime,cc_num,merchant,category,amt,first,last,gender,street,city,...,dist_between_client_and_merch,trans_month,trans_day,hour,year,times_shopped_at_merchant,times_shopped_at_merchant_year,times_shopped_at_merchant_month,times_shopped_at_merchant_day,is_entertainment
2,2019-01-01,3567527758368741,fraud_Beier LLC,entertainment,4.01,Amanda,Vance,F,14601 Downs Skyway Apt. 440,Sterling City,...,75.991336,1,1,0,2019,3,2,1,1,True
4,2019-01-01,341546199006537,"fraud_Bins, Balistreri and Beatty",shopping_pos,268.16,Mark,Brown,M,8580 Moore Cove,Wales,...,75.943368,1,1,0,2019,2,1,1,1,True
10,2019-01-01,675909898057,"fraud_Armstrong, Walter and Gottlieb",food_dining,48.4,Christopher,Henry,M,1198 Robert Stravenue Apt. 479,Armonk,...,85.713334,1,1,0,2019,1,1,1,1,True
11,2019-01-01,4727244663135968,"fraud_Streich, Dietrich and Barton",shopping_net,2.21,Mary,Lewis,F,118 Justin Extension,Bay Minette,...,95.46179,1,1,0,2019,3,2,1,2,True
14,2019-01-01,30427035050508,"fraud_Effertz, Welch and Schowalter",entertainment,42.78,John,Chandler,M,88325 Brandon Greens Apt. 477,Detroit,...,111.987801,1,1,0,2019,4,3,1,1,True


In [39]:
features = [
	'is_fraud',
	'category',
	'trans_datetime',
	'amt',
	'amt_month',
	'amt_year',
	'merchant',
	'amt_month_shopping_net_spend',
	'first_time_at_merchant',
	'dist_between_client_and_merch',
	'times_shopped_at_merchant',
	'cc_num',
]

entertainment_df = entertainment_df[features]

entertainment_df['avg_spend_per_visit'] = entertainment_df['amt_month'] / (entertainment_df['times_shopped_at_merchant'] + 1)
entertainment_df['spend_ratio'] = entertainment_df['amt'] / (entertainment_df['amt_month'] + 1)

entertainment_df.head()

Unnamed: 0,is_fraud,category,trans_datetime,amt,amt_month,amt_year,merchant,amt_month_shopping_net_spend,first_time_at_merchant,dist_between_client_and_merch,times_shopped_at_merchant,cc_num,avg_spend_per_visit,spend_ratio
2,0,entertainment,2019-01-01,4.01,2481.52,2481.52,fraud_Beier LLC,6.67,True,75.991336,3,3567527758368741,620.38,0.001615
4,0,shopping_pos,2019-01-01,268.16,12384.35,12384.35,"fraud_Bins, Balistreri and Beatty",3016.67,True,75.943368,2,341546199006537,4128.116667,0.021651
10,0,food_dining,2019-01-01,48.4,1318.11,1318.11,"fraud_Armstrong, Walter and Gottlieb",7.57,True,85.713334,1,675909898057,659.055,0.036691
11,0,shopping_net,2019-01-01,2.21,3449.02,3449.02,"fraud_Streich, Dietrich and Barton",723.1,True,95.46179,3,4727244663135968,862.255,0.000641
14,0,entertainment,2019-01-01,42.78,2006.67,2006.67,"fraud_Effertz, Welch and Schowalter",4.63,True,111.987801,4,30427035050508,401.334,0.021308
