In [4]:
import numpy as np
import pandas as pd
import seaborn as sns


# Model Building Part

### Removing unnnecessary columns from entire dataset

In [5]:
df = pd.read_csv("cleaned_upi_data.csv")


In [6]:
columns_to_drop=['Days_Since_Last_Transaction','Transaction_Frequency','Transaction_Amount_Deviation','Transaction_Status','Time',
                 'Date','Device_OS','Transaction_Channel','Transaction_City']
df.drop(columns=columns_to_drop,inplace=True,errors='ignore')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int64  
 9   Month              647 non-null    object 
dtypes: float64(1), int64(2), object(7)
memory usage: 50.7+ KB


⚠️ Observation:
In the first notebook, column Year is:

yaml
Copy
Edit
Year: int32
memory usage: 48.1+ KB
In the new notebook, column Year is:

yaml
Copy
Edit
Year: int64
memory usage: 50.7+ KB
✅ So, What Changed?
The data type of the Year column changed from int32 to int64 when you reloaded the CSV file in the second notebook.


CSV files do not store data type metadata, so when you reload them, pandas defaults to int64 for integers.

📌 Why This Affects Memory:
int32 = 4 bytes per value

int64 = 8 bytes per value
So 647 entries × 4 bytes = 2.6 KB more used — that’s exactly the difference you noticed.



✅ How to Fix It (If You Want Consistency)
When reading the CSV, force the Year column to be int32:

python
Copy
Edit
df = pd.read_csv("your_cleaned_data.csv", dtype={'Year': 'int32'})
This will reduce memory and keep data types consistent with the original notebook.



🧠 TL;DR:
Memory usage increased because Year became int64 instead of int32.

This happens because CSVs don’t preserve column types.

Use dtype= in read_csv() to explicitly control column types.

In [8]:
df = pd.read_csv("your_cleaned_data.csv", dtype={'Year': 'int32'})


FileNotFoundError: [Errno 2] No such file or directory: 'your_cleaned_data.csv'

In [9]:
import os
os.listdir()

['.ipynb_checkpoints',
 'cleaned_upi_data.csv',
 'Model_Training.ipynb',
 'upi1.ipynb']

In [10]:
import pandas as pd

df = pd.read_csv("cleaned_upi_data.csv", dtype={'Year': 'int32'})
#this is to bring consistency in the data as per the previous cleane data


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int32  
 9   Month              647 non-null    object 
dtypes: float64(1), int32(1), int64(1), object(7)
memory usage: 48.1+ KB


In [12]:
df.shape

(647, 10)

In [13]:
df.columns

Index(['Merchant_ID', 'Transaction_Type', 'Payment_Gateway',
       'Transaction_State', 'IP_Address', 'Merchant_Category', 'amount',
       'fraud', 'Year', 'Month'],
      dtype='object')

### encoding and scaling(first learn it)

In [14]:
data=df.copy()

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int32  
 9   Month              647 non-null    object 
dtypes: float64(1), int32(1), int64(1), object(7)
memory usage: 48.1+ KB


In [16]:
#converting year column to object
data['Year']=data['Year'].astype('object')

In [17]:
df.to_csv("cleaned_upi_data.csv", index=False)

In [18]:
#extracting categorical columns from data
categorical_cols=data.select_dtypes(include=['object']).columns
categorical_cols

Index(['Merchant_ID', 'Transaction_Type', 'Payment_Gateway',
       'Transaction_State', 'IP_Address', 'Merchant_Category', 'Year',
       'Month'],
      dtype='object')

In [19]:
from sklearn.preprocessing import LabelEncoder

cols = ['Year','Month']
label_encoder= LabelEncoder()
for col in cols:
    data[cols]=label_encoder.fit_transform(data[col])

ValueError: Columns must be same length as key

In [20]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
cols = ['Year', 'Month']

for col in cols:
    data[col] = label_encoder.fit_transform(data[col])


In [24]:
#non ordinal variables
non_ordinal=['Transaction_Type','Payment_Gateway','Transaction_State','Merchant_Category']

#performing one hot encoding for non_ordinal variables
data=pd.get_dummies(data,columns=non_ordinal,drop_first=True)

#convert boolean columns to integers
data=data.astype(int)

KeyError: "None of [Index(['Transaction_Type', 'Payment_Gateway', 'Transaction_State',\n       'Merchant_Category'],\n      dtype='object')] are in the [columns]"

In [22]:
#non ordinal variables
non_ordinal=['Transaction_Type','Payment_Gateway','Transaction_State','Merchant_Category']

#performing one hot encoding for non_ordinal variables
data=pd.get_dummies(data,columns=non_ordinal,drop_first=True)

# Only convert boolean columns to int
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)



KeyError: "None of [Index(['Transaction_Type', 'Payment_Gateway', 'Transaction_State',\n       'Merchant_Category'],\n      dtype='object')] are in the [columns]"

In [23]:
print("Available columns in data:")
print(data.columns)

print("Missing columns:")
print([col for col in non_ordinal if col not in data.columns])


Available columns in data:
Index(['Merchant_ID', 'IP_Address', 'amount', 'fraud', 'Year', 'Month',
       'Transaction_Type_Bill Payment', 'Transaction_Type_Investment',
       'Transaction_Type_Other', 'Transaction_Type_Purchase',
       'Transaction_Type_Refund', 'Transaction_Type_Subscription',
       'Payment_Gateway_Bank of Data', 'Payment_Gateway_CReditPAY',
       'Payment_Gateway_Dummy Bank', 'Payment_Gateway_Gamma Bank',
       'Payment_Gateway_Other', 'Payment_Gateway_SamplePay',
       'Payment_Gateway_Sigma Bank', 'Payment_Gateway_UPI Pay',
       'Transaction_State_Arunachal Pradesh', 'Transaction_State_Assam',
       'Transaction_State_Bihar', 'Transaction_State_Chhattisgarh',
       'Transaction_State_Goa', 'Transaction_State_Gujarat',
       'Transaction_State_Haryana', 'Transaction_State_Himachal Pradesh',
       'Transaction_State_Jharkhand', 'Transaction_State_Karnataka',
       'Transaction_State_Kerala', 'Transaction_State_Madhya Pradesh',
       'Transaction_State

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int32  
 9   Month              647 non-null    object 
dtypes: float64(1), int32(1), int64(1), object(7)
memory usage: 48.1+ KB


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 56 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Merchant_ID                                     647 non-null    object 
 1   IP_Address                                      647 non-null    object 
 2   amount                                          647 non-null    float64
 3   fraud                                           647 non-null    int64  
 4   Year                                            647 non-null    int64  
 5   Month                                           647 non-null    int64  
 6   Transaction_Type_Bill Payment                   647 non-null    bool   
 7   Transaction_Type_Investment                     647 non-null    bool   
 8   Transaction_Type_Other                          647 non-null    bool   
 9   Transaction_Type_Purchase                  

In [27]:
# Convert boolean columns to int, if any
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

# Proceed with model training / analysis


In [28]:
data.head()

Unnamed: 0,Merchant_ID,IP_Address,amount,fraud,Year,Month,Transaction_Type_Bill Payment,Transaction_Type_Investment,Transaction_Type_Other,Transaction_Type_Purchase,...,Transaction_State_West Bengal,Merchant_Category_Donations and Devotion,Merchant_Category_Financial services and Taxes,Merchant_Category_Home delivery,Merchant_Category_Investment,Merchant_Category_More Services,Merchant_Category_Other,Merchant_Category_Purchases,Merchant_Category_Travel bookings,Merchant_Category_Utilities
0,f65a902b-2396-40cc-9593-97e103f1bc15,140.213.7.48,396.62,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,d4a5efcb-4eb6-4d3a-8132-07bb3e6e13a4,184.108.177.45,121.94,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,759ad138-9473-4729-8699-3d72c7ffb983,16.106.248.163,106.69,1,0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,d8f561e4-bded-4ef0-bcd8-5494b2e31a94,65.245.160.212,3611.11,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,26fdd7a1-8537-4dfe-bcf7-f5a127b36682,33.172.152.38,374.89,1,0,6,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [29]:
data.dtypes

Merchant_ID                                        object
IP_Address                                         object
amount                                            float64
fraud                                               int64
Year                                                int64
Month                                               int64
Transaction_Type_Bill Payment                       int64
Transaction_Type_Investment                         int64
Transaction_Type_Other                              int64
Transaction_Type_Purchase                           int64
Transaction_Type_Refund                             int64
Transaction_Type_Subscription                       int64
Payment_Gateway_Bank of Data                        int64
Payment_Gateway_CReditPAY                           int64
Payment_Gateway_Dummy Bank                          int64
Payment_Gateway_Gamma Bank                          int64
Payment_Gateway_Other                               int64
Payment_Gatewa

In [30]:
# Drop Merchant_ID and IP_Address
data.drop(columns=['Merchant_ID', 'IP_Address'], inplace=True)

# Convert 'amount' to int64 (removes decimal places)
data['amount'] = data['amount'].astype(int)





In [31]:
print(data.dtypes)

amount                                            int64
fraud                                             int64
Year                                              int64
Month                                             int64
Transaction_Type_Bill Payment                     int64
Transaction_Type_Investment                       int64
Transaction_Type_Other                            int64
Transaction_Type_Purchase                         int64
Transaction_Type_Refund                           int64
Transaction_Type_Subscription                     int64
Payment_Gateway_Bank of Data                      int64
Payment_Gateway_CReditPAY                         int64
Payment_Gateway_Dummy Bank                        int64
Payment_Gateway_Gamma Bank                        int64
Payment_Gateway_Other                             int64
Payment_Gateway_SamplePay                         int64
Payment_Gateway_Sigma Bank                        int64
Payment_Gateway_UPI Pay                         

### To improve the computational efficiency,taking 50% of entire data for model building and testing keeping the ratio intact

In [32]:
from sklearn.model_selection import train_test_split

#split the data into 'fraud' and 'non-fraud' classes
non_fraud_data=data[data['fraud']==1]
non_fraud_data=data[data['fraud']==0]

#sample each class separately while maintaining proportions 
sampled_fraud_data=fraud_data.sample(frac=0.5, random_state=42)
sampled_non_fraud_data=non_fraud_data.sample(frac=0.5,random_state=42)

#combine the sample data back together
data=pd.concat([sampled_fraud_data,sampled_non_fraud_data])

NameError: name 'fraud_data' is not defined

In [33]:
from sklearn.model_selection import train_test_split

# Split the data into fraud and non-fraud classes
fraud_data = data[data['fraud'] == 1]
non_fraud_data = data[data['fraud'] == 0]

# Sample each class separately (e.g., 50% of each class)
sampled_fraud_data = fraud_data.sample(frac=0.5, random_state=42)
sampled_non_fraud_data = non_fraud_data.sample(frac=0.5, random_state=42)

# Combine the sampled data back together
data = pd.concat([sampled_fraud_data, sampled_non_fraud_data]).sample(frac=1, random_state=42).reset_index(drop=True)


In [34]:
print(data['fraud'].value_counts()) #cross checking class balance


fraud
0    246
1     78
Name: count, dtype: int64


# MODEL BUILDING