In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


# Model Building Part

### Removing unnnecessary columns from entire dataset

In [2]:
df = pd.read_csv("cleaned_upi_data.csv")


In [3]:
columns_to_drop=['Days_Since_Last_Transaction','Transaction_Frequency','Transaction_Amount_Deviation','Transaction_Status','Time',
                 'Date','Device_OS','Transaction_Channel','Transaction_City']
df.drop(columns=columns_to_drop,inplace=True,errors='ignore')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int64  
 9   Month              647 non-null    object 
dtypes: float64(1), int64(2), object(7)
memory usage: 50.7+ KB


⚠️ Observation:
In the first notebook, column Year is:

yaml
Copy
Edit
Year: int32
memory usage: 48.1+ KB
In the new notebook, column Year is:

yaml
Copy
Edit
Year: int64
memory usage: 50.7+ KB
✅ So, What Changed?
The data type of the Year column changed from int32 to int64 when you reloaded the CSV file in the second notebook.


CSV files do not store data type metadata, so when you reload them, pandas defaults to int64 for integers.

📌 Why This Affects Memory:
int32 = 4 bytes per value

int64 = 8 bytes per value
So 647 entries × 4 bytes = 2.6 KB more used — that’s exactly the difference you noticed.



✅ How to Fix It (If You Want Consistency)
When reading the CSV, force the Year column to be int32:

python
Copy
Edit
df = pd.read_csv("your_cleaned_data.csv", dtype={'Year': 'int32'})
This will reduce memory and keep data types consistent with the original notebook.



🧠 TL;DR:
Memory usage increased because Year became int64 instead of int32.

This happens because CSVs don’t preserve column types.

Use dtype= in read_csv() to explicitly control column types.

In [5]:
df = pd.read_csv("your_cleaned_data.csv", dtype={'Year': 'int32'})


FileNotFoundError: [Errno 2] No such file or directory: 'your_cleaned_data.csv'

In [6]:
import os
os.listdir()

['.ipynb_checkpoints',
 'cleaned_upi_data.csv',
 'Model_Training.ipynb',
 'upi1.ipynb']

In [7]:
import pandas as pd

df = pd.read_csv("cleaned_upi_data.csv", dtype={'Year': 'int32'})
#this is to bring consistency in the data as per the previous cleane data


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int32  
 9   Month              647 non-null    object 
dtypes: float64(1), int32(1), int64(1), object(7)
memory usage: 48.1+ KB


In [9]:
df.shape

(647, 10)

In [10]:
df.columns

Index(['Merchant_ID', 'Transaction_Type', 'Payment_Gateway',
       'Transaction_State', 'IP_Address', 'Merchant_Category', 'amount',
       'fraud', 'Year', 'Month'],
      dtype='object')

### encoding and scaling(first learn it)

In [11]:
data=df.copy()

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int32  
 9   Month              647 non-null    object 
dtypes: float64(1), int32(1), int64(1), object(7)
memory usage: 48.1+ KB


In [13]:
#converting year column to object
data['Year']=data['Year'].astype('object')

In [14]:
df.to_csv("cleaned_upi_data.csv", index=False)

PermissionError: [Errno 13] Permission denied: 'cleaned_upi_data.csv'

In [15]:
#extracting categorical columns from data
categorical_cols=data.select_dtypes(include=['object']).columns
categorical_cols

Index(['Merchant_ID', 'Transaction_Type', 'Payment_Gateway',
       'Transaction_State', 'IP_Address', 'Merchant_Category', 'Year',
       'Month'],
      dtype='object')

In [16]:
from sklearn.preprocessing import LabelEncoder

cols = ['Year', 'Month']
label_encoder = LabelEncoder()

for col in cols:
    data[col] = label_encoder.fit_transform(data[col])


In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
cols = ['Year', 'Month']

for col in cols:
    data[col] = label_encoder.fit_transform(data[col])


In [18]:
#non ordinal variables
non_ordinal=['Transaction_Type','Payment_Gateway','Transaction_State','Merchant_Category']

#performing one hot encoding for non_ordinal variables
data=pd.get_dummies(data,columns=non_ordinal,drop_first=True)

#convert boolean columns to integers
data=data.astype(int)

ValueError: invalid literal for int() with base 10: 'f65a902b-2396-40cc-9593-97e103f1bc15'

In [19]:
#non ordinal variables
non_ordinal=['Transaction_Type','Payment_Gateway','Transaction_State','Merchant_Category']

#performing one hot encoding for non_ordinal variables
data=pd.get_dummies(data,columns=non_ordinal,drop_first=True)

# Only convert boolean columns to int
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)



KeyError: "None of [Index(['Transaction_Type', 'Payment_Gateway', 'Transaction_State',\n       'Merchant_Category'],\n      dtype='object')] are in the [columns]"

In [20]:
print("Available columns in data:")
print(data.columns)

print("Missing columns:")
print([col for col in non_ordinal if col not in data.columns])


Available columns in data:
Index(['Merchant_ID', 'IP_Address', 'amount', 'fraud', 'Year', 'Month',
       'Transaction_Type_Bill Payment', 'Transaction_Type_Investment',
       'Transaction_Type_Other', 'Transaction_Type_Purchase',
       'Transaction_Type_Refund', 'Transaction_Type_Subscription',
       'Payment_Gateway_Bank of Data', 'Payment_Gateway_CReditPAY',
       'Payment_Gateway_Dummy Bank', 'Payment_Gateway_Gamma Bank',
       'Payment_Gateway_Other', 'Payment_Gateway_SamplePay',
       'Payment_Gateway_Sigma Bank', 'Payment_Gateway_UPI Pay',
       'Transaction_State_Arunachal Pradesh', 'Transaction_State_Assam',
       'Transaction_State_Bihar', 'Transaction_State_Chhattisgarh',
       'Transaction_State_Goa', 'Transaction_State_Gujarat',
       'Transaction_State_Haryana', 'Transaction_State_Himachal Pradesh',
       'Transaction_State_Jharkhand', 'Transaction_State_Karnataka',
       'Transaction_State_Kerala', 'Transaction_State_Madhya Pradesh',
       'Transaction_State

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Merchant_ID        647 non-null    object 
 1   Transaction_Type   647 non-null    object 
 2   Payment_Gateway    647 non-null    object 
 3   Transaction_State  647 non-null    object 
 4   IP_Address         647 non-null    object 
 5   Merchant_Category  647 non-null    object 
 6   amount             647 non-null    float64
 7   fraud              647 non-null    int64  
 8   Year               647 non-null    int32  
 9   Month              647 non-null    object 
dtypes: float64(1), int32(1), int64(1), object(7)
memory usage: 48.1+ KB


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 56 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Merchant_ID                                     647 non-null    object 
 1   IP_Address                                      647 non-null    object 
 2   amount                                          647 non-null    float64
 3   fraud                                           647 non-null    int64  
 4   Year                                            647 non-null    int64  
 5   Month                                           647 non-null    int64  
 6   Transaction_Type_Bill Payment                   647 non-null    bool   
 7   Transaction_Type_Investment                     647 non-null    bool   
 8   Transaction_Type_Other                          647 non-null    bool   
 9   Transaction_Type_Purchase                  

In [23]:
# Convert boolean columns to int, if any
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

# Proceed with model training / analysis


In [24]:
data.head()

Unnamed: 0,Merchant_ID,IP_Address,amount,fraud,Year,Month,Transaction_Type_Bill Payment,Transaction_Type_Investment,Transaction_Type_Other,Transaction_Type_Purchase,...,Transaction_State_West Bengal,Merchant_Category_Donations and Devotion,Merchant_Category_Financial services and Taxes,Merchant_Category_Home delivery,Merchant_Category_Investment,Merchant_Category_More Services,Merchant_Category_Other,Merchant_Category_Purchases,Merchant_Category_Travel bookings,Merchant_Category_Utilities
0,f65a902b-2396-40cc-9593-97e103f1bc15,140.213.7.48,396.62,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,d4a5efcb-4eb6-4d3a-8132-07bb3e6e13a4,184.108.177.45,121.94,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,759ad138-9473-4729-8699-3d72c7ffb983,16.106.248.163,106.69,1,0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,d8f561e4-bded-4ef0-bcd8-5494b2e31a94,65.245.160.212,3611.11,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,26fdd7a1-8537-4dfe-bcf7-f5a127b36682,33.172.152.38,374.89,1,0,6,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [25]:
data.dtypes

Merchant_ID                                        object
IP_Address                                         object
amount                                            float64
fraud                                               int64
Year                                                int64
Month                                               int64
Transaction_Type_Bill Payment                       int32
Transaction_Type_Investment                         int32
Transaction_Type_Other                              int32
Transaction_Type_Purchase                           int32
Transaction_Type_Refund                             int32
Transaction_Type_Subscription                       int32
Payment_Gateway_Bank of Data                        int32
Payment_Gateway_CReditPAY                           int32
Payment_Gateway_Dummy Bank                          int32
Payment_Gateway_Gamma Bank                          int32
Payment_Gateway_Other                               int32
Payment_Gatewa

In [26]:
# Drop Merchant_ID and IP_Address
data.drop(columns=['Merchant_ID', 'IP_Address'], inplace=True)

# Convert 'amount' to int64 (removes decimal places)
data['amount'] = data['amount'].astype(int)





In [27]:
print(data.dtypes)

amount                                            int32
fraud                                             int64
Year                                              int64
Month                                             int64
Transaction_Type_Bill Payment                     int32
Transaction_Type_Investment                       int32
Transaction_Type_Other                            int32
Transaction_Type_Purchase                         int32
Transaction_Type_Refund                           int32
Transaction_Type_Subscription                     int32
Payment_Gateway_Bank of Data                      int32
Payment_Gateway_CReditPAY                         int32
Payment_Gateway_Dummy Bank                        int32
Payment_Gateway_Gamma Bank                        int32
Payment_Gateway_Other                             int32
Payment_Gateway_SamplePay                         int32
Payment_Gateway_Sigma Bank                        int32
Payment_Gateway_UPI Pay                         

### To improve the computational efficiency,taking 50% of entire data for model building and testing keeping the ratio intact

In [28]:
from sklearn.model_selection import train_test_split

#split the data into 'fraud' and 'non-fraud' classes
non_fraud_data=data[data['fraud']==1]
non_fraud_data=data[data['fraud']==0]

#sample each class separately while maintaining proportions 
sampled_fraud_data=fraud_data.sample(frac=0.5, random_state=42)
sampled_non_fraud_data=non_fraud_data.sample(frac=0.5,random_state=42)

#combine the sample data back together
data=pd.concat([sampled_fraud_data,sampled_non_fraud_data])

NameError: name 'fraud_data' is not defined

In [29]:
from sklearn.model_selection import train_test_split

# Split the data into fraud and non-fraud classes
fraud_data = data[data['fraud'] == 1]
non_fraud_data = data[data['fraud'] == 0]

# Sample each class separately (e.g., 50% of each class)
sampled_fraud_data = fraud_data.sample(frac=0.5, random_state=42)
sampled_non_fraud_data = non_fraud_data.sample(frac=0.5, random_state=42)

# Combine the sampled data back together
data = pd.concat([sampled_fraud_data, sampled_non_fraud_data]).sample(frac=1, random_state=42).reset_index(drop=True)


In [30]:
print(data['fraud'].value_counts()) #cross checking class balance


fraud
0    246
1     78
Name: count, dtype: int64


# MODEL BUILDING

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier  #import XHBoostclassifier



In [32]:
#Now split the sample data into training and testing sets with a test size of 30%
X_train,X_test,y_test=train_test_split(X,y,test_size=0.2, random_state=42,stratify=y)

#define a function to evaluate a model and return a dictionary
def evaluate_model(model,X_test, y_test):
    #calculate preictions
    y_pred=model.predict(X_test)

    #computer metrics
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred)
    recall=recall_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    roc_auc = roc_auc_score(y_test,y_pred)

    #return metrics as a dictionary
    return{
    'Model': type(model)._name_,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score':f1,
    'ROC AUC Score': roc_auc
    }

    #define models without support vector machine(SVM)
    models={
        "Decision Tree":DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting":GradientBoostingClassifier(),
        "XGBoost":XGBClassifier(), #added XGBoost to the dictionary
    }

    #List to store evaluation resulsts
    results=[]

    #Model Training and evaluation
    for model_name, model in models.items():
        print(model_name + ":")
        model.fit(X_train, y_train)
        metrics=evaluate_model(model,X_test,y_test)
        results.append(metrics)

   #Create a dataframe from the results 
df_results=pd.DataFrame(results)

   #Display the dataframe
df_results
    

NameError: name 'X' is not defined

In [33]:
print(df.columns)


Index(['Merchant_ID', 'Transaction_Type', 'Payment_Gateway',
       'Transaction_State', 'IP_Address', 'Merchant_Category', 'amount',
       'fraud', 'Year', 'Month'],
      dtype='object')


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import pandas as pd

# Define features and target
X = df.drop(columns=['fraud'])
y = df['fraud']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define a function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    return {
        'Model': type(model).__name__,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC Score': roc_auc_score(y_test, y_pred)
    }

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier()
}

# Evaluate each model
results = []

for name, model in models.items():
    print(f"Training: {name}")
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results.append(metrics)

# Display results
df_results = pd.DataFrame(results)
print(df_results)


Training: Decision Tree


ValueError: could not convert string to float: '3a253ac5-d9a1-430b-9157-ba64de47b1fb'

In [35]:
import time

for name, model in models.items():
    print(f"Training: {name}")
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    print(f"Finished training: {name} in {end - start:.2f} seconds")


Training: Decision Tree


ValueError: could not convert string to float: '3a253ac5-d9a1-430b-9157-ba64de47b1fb'

In [36]:
print(X_train.dtypes)


Merchant_ID           object
Transaction_Type      object
Payment_Gateway       object
Transaction_State     object
IP_Address            object
Merchant_Category     object
amount               float64
Year                   int32
Month                 object
dtype: object


In [37]:
X = pd.get_dummies(X, drop_first=True)


In [38]:
y = df['fraud']


In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import pandas as pd

# Convert object columns to numeric
X = pd.get_dummies(df.drop(columns=['fraud']), drop_first=True)
y = df['fraud']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Function to evaluate a model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    return {
        'Model': type(model).__name__,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC Score': roc_auc_score(y_test, y_pred)
    }

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Train and evaluate models
results = []

for name, model in models.items():
    print(f"Training: {name}")
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results.append(metrics)

# Show results
df_results = pd.DataFrame(results)
print(df_results)


Training: Decision Tree
Training: Random Forest
Training: Gradient Boosting
Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                        Model  Accuracy  Precision    Recall  F1 Score  \
0      DecisionTreeClassifier  0.938462   0.870968  0.870968  0.870968   
1      RandomForestClassifier  0.946154   0.900000  0.870968  0.885246   
2  GradientBoostingClassifier  0.946154   0.852941  0.935484  0.892308   
3               XGBClassifier  0.938462   0.848485  0.903226  0.875000   

   ROC AUC Score  
0       0.915282  
1       0.920332  
2       0.942489  
3       0.926360  


In [41]:
# OLD: triggers warning
XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# NEW: no warning
XGBClassifier(eval_metric='logloss')


# DATA BALANCING

### oversampling(SMOTE)

In [42]:
X=data.drop('fraud',axis=1)
y=data['fraud']

In [43]:
X.shape

(324, 53)

In [44]:
y.shape

(324,)

In [45]:
from imblearn.over_sampling import SMOTE

#using smote to oversample the minority class
X_res,y_res=SMOTE().fit_resample(X,y)

#print the count of each class after oversampling
print(y_res.value_counts())
                                

fraud
0    246
1    246
Name: count, dtype: int64


In [46]:
from imblearn.over_sampling import SMOTE

X_res, y_res = SMOTE().fit_resample(X, y)

print(y_res.value_counts())


fraud
0    246
1    246
Name: count, dtype: int64


In [47]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
import pandas as pd

# demo data
X, y = make_classification(n_samples=1000, n_features=5, weights=[0.9, 0.1])
X_res, y_res = SMOTE().fit_resample(X, y)

print(pd.Series(y_res).value_counts())


0    895
1    895
Name: count, dtype: int64


In [48]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
import pandas as pd

X, y = make_classification(n_samples=1000, n_features=5, weights=[0.9, 0.1])
X_res, y_res = SMOTE().fit_resample(X, y)

print(pd.Series(y_res).value_counts())


0    896
1    896
Name: count, dtype: int64


In [49]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
import pandas as pd

X, y = make_classification(n_samples=1000, n_features=5, weights=[0.9, 0.1])
X_res, y_res = SMOTE().fit_resample(X, y)

print(pd.Series(y_res).value_counts())


1    896
0    896
Name: count, dtype: int64


In [50]:
from imblearn.over_sampling import SMOTE

#using smote to oversample the minority class
X_res,y_res=SMOTE().fit_resample(X,y)

#print the count of each class after oversampling
print(y_res.value_counts())
                                

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [51]:

!where python 


c:\Program Files\Python311\python.exe
C:\Program Files\Python313\python.exe
C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\python.exe


In [52]:
import sys
print(sys.executable)


c:\Program Files\Python311\python.exe


In [53]:
import sys
!{sys.executable} -m pip install --pre --upgrade scikit-learn


'c:\Program' is not recognized as an internal or external command,
operable program or batch file.


In [54]:
import sys
!"{sys.executable}" -m pip install --pre --upgrade scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Using cached scikit_learn-1.7.0-cp311-cp311-win_amd64.whl (10.7 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.7.0


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.


In [55]:
import sys
!"{sys.executable}" -m pip install imbalanced-learn


Collecting scikit-learn<2,>=1.3.2 (from imbalanced-learn)
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.7.0
    Uninstalling scikit-learn-1.7.0:
      Successfully uninstalled scikit-learn-1.7.0
Successfully installed scikit-learn-1.6.1


In [56]:
from imblearn.over_sampling import SMOTE

X_res, y_res = SMOTE().fit_resample(X, y)
print(y_res.value_counts())


AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Sample setup
data['Year'] = LabelEncoder().fit_transform(data['Year'])

# SMOTE
X_res, y_res = SMOTE().fit_resample(X, y)
print(y_res.value_counts())


NameError: name 'data' is not defined

In [60]:
import pandas as pd

# Example if loading from CSV:
data = pd.read_csv("cleaned_upi_data.csv")

# Now transform the Year column
data['Year'] = LabelEncoder().fit_transform(data['Year'])


In [61]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Sample setup
data['Year'] = LabelEncoder().fit_transform(data['Year'])

# SMOTE
X_res, y_res = SMOTE().fit_resample(X, y)
print(y_res.value_counts())


AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [62]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# Create dummy classification dataset
X, y = make_classification(n_samples=100, n_features=4, weights=[0.9, 0.1], random_state=42)

# Apply SMOTE
X_res, y_res = SMOTE().fit_resample(X, y)

# Print class distribution after SMOTE
print(pd.Series(y_res).value_counts())


0    90
1    90
Name: count, dtype: int64


In [63]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import pandas as pd

# Convert object columns to numeric
X = pd.get_dummies(df.drop(columns=['fraud']), drop_first=True)
y = df['fraud']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Function to evaluate a model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    return {
        'Model': type(model).__name__,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC Score': roc_auc_score(y_test, y_pred)
    }

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Train and evaluate models
results = []

for name, model in models.items():
    print(f"Training: {name}")
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results.append(metrics)

# Show results
df_results = pd.DataFrame(results)
print(df_results)


Training: Decision Tree
Training: Random Forest
Training: Gradient Boosting
Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                        Model  Accuracy  Precision    Recall  F1 Score  \
0      DecisionTreeClassifier  0.930769   0.866667  0.838710  0.852459   
1      RandomForestClassifier  0.961538   0.906250  0.935484  0.920635   
2  GradientBoostingClassifier  0.938462   0.828571  0.935484  0.878788   
3               XGBClassifier  0.938462   0.848485  0.903226  0.875000   

   ROC AUC Score  
0       0.899153  
1       0.952590  
2       0.937439  
3       0.926360  


In [66]:
import plotly.express as px

#sort the dataframe by f1 scorein descending order
df_results_sorted=df_results.sort_values(by='F1 Score',ascending=False)

#update layouts to display subplots
fig.update_layout(
    title='F1 Score of Models',
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder':'total descending'},
    margin=dict(l=20 ,r=20, t=50, b=20),
)

#show the plot
fig.show()

NameError: name 'fig' is not defined

In [67]:
import plotly.express as px

# Create bar chart using sorted dataframe
fig = px.bar(
    df_results_sorted,
    x='Model',
    y='F1 Score',
    text='F1 Score',
    title='F1 Score of Models'
)

# Update layout (now fig is defined)
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder': 'total descending'},
    margin=dict(l=20, r=20, t=50, b=20),
)

# Show the plot
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [69]:
import plotly.express as px

# Sort the DataFrame
df_results_sorted = df_results.sort_values(by='F1 Score', ascending=False)

# Create bar plot
fig = px.bar(
    df_results_sorted,
    x='Model',
    y='F1 Score',
    text='F1 Score',
    title='F1 Score of Models',
    color='Model'
)

# Update layout
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder': 'total descending'},
    margin=dict(l=20, r=20, t=50, b=20),
)

# Show the plot
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [70]:
import plotly.express as px

# Create bar chart using sorted dataframe
fig = px.bar(
    df_results_sorted,
    x='Model',
    y='F1 Score',
    text='F1 Score',
    title='F1 Score of Models'
)

# Update layout (now fig is defined)
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder': 'total descending'},
    margin=dict(l=20, r=20, t=50, b=20),
)

# Show the plot
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [71]:
import plotly.express as px

# Sort results
df_results_sorted = df_results.sort_values(by='F1 Score', ascending=False)

# Create the bar plot
fig = px.bar(
    df_results_sorted,
    x='Model',
    y='F1 Score',
    text='F1 Score',
    title='F1 Score of Models',
    color='Model'
)

# Customize layout
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder': 'total descending'},
    margin=dict(l=20, r=20, t=50, b=20),
)

# Show the plot
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [72]:
import plotly.express as px
import plotly.io as pio

# Use browser renderer to avoid nbformat issues
pio.renderers.default = 'browser'

# Assuming df_results_sorted is already created
df_results_sorted = df_results.sort_values(by='F1 Score', ascending=False)

# Create plot
fig = px.bar(
    df_results_sorted,
    x='Model',
    y='F1 Score',
    text='F1 Score',
    title='F1 Score of Models',
    color='Model'
)

# Update layout
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder': 'total descending'},
    margin=dict(l=20, r=20, t=50, b=20),
)

# Show the plot (this will open in browser)
fig.show()


In [73]:
import sys
!{sys.executable} -m pip install nbformat --upgrade


'c:\Program' is not recognized as an internal or external command,
operable program or batch file.


In [74]:
import sys
import os

os.system(f'"{sys.executable}" -m pip install nbformat --upgrade')


0

In [57]:
import plotly.express as px

# Sample sorted results
df_results_sorted = df_results.sort_values(by='F1 Score', ascending=False)

fig = px.bar(
    df_results_sorted,
    x='Model',
    y='F1 Score',
    text='F1 Score',
    title='F1 Score of Models',
    color='Model'
)

fig.update_layout(
    xaxis_title='Model',
    yaxis_title='F1 Score',
    xaxis={'categoryorder': 'total descending'},
    margin=dict(l=20, r=20, t=50, b=20),
)

fig.show()
