<a href="https://colab.research.google.com/github/AyushShrestha404/transaction_categorization_model/blob/main/transaction_categorizaton_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [24]:
# Load the raw transaction export; the path is relative to the notebook's working directory.
df = pd.read_csv("digital_wallet_transactions.csv")
# Preview the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,idx,transaction_id,user_id,transaction_date,product_category,product_name,merchant_name,product_amount,transaction_fee,cashback,loyalty_points,payment_method,transaction_status,merchant_id,device_type,location
0,1,4dac3ea3-6492-46ec-80b8-dc45c3ad0b14,USER_05159,2023-08-19 03:32,Rent Payment,2BHK Flat Deposit,Airbnb,1525.39,36.69,19.19,186,Debit Card,Successful,MERCH_0083,iOS,Urban
1,2,a903ed9f-eb84-47e7-b8aa-fd1786c919cf,USER_07204,2023-08-19 04:37,Gas Bill,Commercial Gas Connection,Adani Gas,1495.4,28.19,89.99,182,UPI,Successful,MERCH_0163,iOS,Urban
2,3,2a393013-733c-4add-9f09-bed1eeb33676,USER_00903,2023-08-19 05:52,Bus Ticket,Semi-Sleeper,MakeMyTrip Bus,1267.71,11.36,95.7,994,UPI,Successful,MERCH_0320,iOS,Urban
3,4,9a07ad19-4673-4794-9cd2-9b139f39c715,USER_01769,2023-08-19 06:35,Internet Bill,4G Unlimited Plan,Airtel Broadband,9202.63,6.41,82.24,409,Debit Card,Successful,MERCH_0194,Android,Urban
4,5,76418260-c985-4011-979d-0914604d0d68,USER_03544,2023-08-19 06:36,Loan Repayment,Home Loan EMI,Axis Bank,3100.58,41.15,40.47,837,Debit Card,Successful,MERCH_0504,Android,Urban


In [25]:
# Identifier-style columns are removed before modelling; everything else is kept.
id_columns = ['idx', 'transaction_id', 'user_id', 'merchant_id']
df_model = df.drop(columns=id_columns)
df_model

Unnamed: 0,transaction_date,product_category,product_name,merchant_name,product_amount,transaction_fee,cashback,loyalty_points,payment_method,transaction_status,device_type,location
0,2023-08-19 03:32,Rent Payment,2BHK Flat Deposit,Airbnb,1525.39,36.69,19.19,186,Debit Card,Successful,iOS,Urban
1,2023-08-19 04:37,Gas Bill,Commercial Gas Connection,Adani Gas,1495.40,28.19,89.99,182,UPI,Successful,iOS,Urban
2,2023-08-19 05:52,Bus Ticket,Semi-Sleeper,MakeMyTrip Bus,1267.71,11.36,95.70,994,UPI,Successful,iOS,Urban
3,2023-08-19 06:35,Internet Bill,4G Unlimited Plan,Airtel Broadband,9202.63,6.41,82.24,409,Debit Card,Successful,Android,Urban
4,2023-08-19 06:36,Loan Repayment,Home Loan EMI,Axis Bank,3100.58,41.15,40.47,837,Debit Card,Successful,Android,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2024-08-17 19:39,Internet Bill,4G Unlimited Plan,BSNL Broadband,9496.45,37.25,87.47,284,UPI,Successful,Android,Rural
4996,2024-08-17 19:42,Education Fee,Python Programming Course,Unacademy,710.40,14.01,55.24,538,Credit Card,Successful,iOS,Urban
4997,2024-08-17 23:24,Movie Ticket,Avengers: Endgame,PVR Cinemas,2750.06,48.69,72.10,652,Bank Transfer,Successful,Android,Suburban
4998,2024-08-18 00:28,Mobile Recharge,Data Pack 50GB,BSNL,9976.19,21.96,40.44,614,UPI,Successful,iOS,Suburban


In [26]:
# Parse the timestamp once, derive day-of-month and hour-of-day from it,
# then discard the raw timestamp column.
parsed_ts = pd.to_datetime(df_model['transaction_date'])
df_model = (
  df_model
  .assign(day=parsed_ts.dt.day, hour=parsed_ts.dt.hour)
  .drop(columns=['transaction_date'])
)

In [30]:
def reduce_categories(series, threshold=0.01, other_label='Other'):
  """Collapse infrequent values of a Series into a single placeholder label.

  Parameters
  ----------
  series : pandas.Series
    Column whose rare values should be merged. It is not modified.
  threshold : float, default 0.01
    Values whose relative frequency (among non-missing entries) is strictly
    below this fraction are replaced.
  other_label : default 'Other'
    Replacement written in place of every rare value.

  Returns
  -------
  pandas.Series
    New Series with the same index and name. Frequent values and missing
    values are left as they were.
  """
  # A categorical column cannot hold a label outside its categories, so work
  # on plain objects (the element-wise version also produced object values).
  if isinstance(series.dtype, pd.CategoricalDtype):
    series = series.astype(object)
  # value_counts(normalize=True) ignores missing values, so NaN is never "rare".
  freq = series.value_counts(normalize=True)
  rare = freq[freq < threshold].index
  # Vectorized membership test instead of a Python-level lambda per element.
  return series.where(~series.isin(rare), other_label)

In [31]:
# Merge rare merchants / products into 'Other' to cap the number of distinct values.
df_model = df_model.assign(
  **{name: reduce_categories(df_model[name]) for name in ('merchant_name', 'product_name')}
)

In [32]:
# Every remaining text column except the target gets its own integer encoder;
# the fitted encoders are kept by column name so codes can be mapped back later.
categorical_cols = df_model.select_dtypes(include=['object']).columns.drop('product_category')
label_encoders = {name: LabelEncoder().fit(df_model[name]) for name in categorical_cols}
for name, encoder in label_encoders.items():
  df_model[name] = encoder.transform(df_model[name])

In [33]:
# Encode the target separately so its class names stay available for reporting.
target_encoder = LabelEncoder().fit(df_model['product_category'])
df_model['product_category'] = target_encoder.transform(df_model['product_category'])

In [35]:
# Target vector first, then the feature matrix is everything except that column.
y = df_model['product_category']
x = df_model.drop(columns=[y.name])

In [36]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.2, random_state = 42)

In [40]:
# Random forest with 100 trees; random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the training split only; the test split is held back for evaluation.
rf.fit(x_train, y_train)

In [41]:
y_pred = rf.predict(x_test)
# LabelEncoder assigns codes 0..n_classes-1 in the order of `classes_`. Passing
# them explicitly keeps codes and names aligned even when a class is absent from
# both the test split and the predictions; without `labels`, that case raises a
# ValueError because the number of names no longer matches the labels found.
report_labels = list(range(len(target_encoder.classes_)))
print(classification_report(y_test, y_pred, labels=report_labels, target_names=target_encoder.classes_))

                   precision    recall  f1-score   support

       Bus Ticket       0.47      0.60      0.53        53
    Education Fee       0.81      0.81      0.81        53
 Electricity Bill       0.65      0.56      0.60        50
   Flight Booking       0.51      0.55      0.53        33
    Food Delivery       0.86      0.73      0.79        44
   Gaming Credits       0.43      0.46      0.44        61
         Gas Bill       0.68      0.68      0.68        50
        Gift Card       0.31      0.37      0.33        41
 Grocery Shopping       0.54      0.60      0.57        45
    Hotel Booking       0.96      0.94      0.95        54
Insurance Premium       0.33      0.46      0.38        46
    Internet Bill       0.42      0.55      0.48        44
   Loan Repayment       0.63      0.66      0.64        44
  Mobile Recharge       0.71      0.53      0.61        47
     Movie Ticket       1.00      0.79      0.88        56
  Online Shopping       0.53      0.56      0.55       