In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Correct import here
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
import numpy as np
from scipy.sparse import csr_matrix

# Load the training and testing datasets
train_data = pd.read_csv(r"C:\Users\aksha\OneDrive\Desktop\fraudTrain.csv")
test_data = pd.read_csv(r"C:\Users\aksha\OneDrive\Desktop\fraudTest.csv")

# Convert 'dob' column to datetime object
train_data['dob'] = pd.to_datetime(train_data['dob'], format='%d-%m-%Y')
test_data['dob'] = pd.to_datetime(test_data['dob'], format='%Y-%m-%d')

# Convert the Date-Time string to datetime object
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'],format="%d-%m-%Y %H:%M", dayfirst=True)
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])

# Calculate age at the time of transaction
train_data['age'] = (train_data['trans_date_trans_time'] - train_data['dob']).dt.days // 365
test_data['age'] = (test_data['trans_date_trans_time'] - test_data['dob']).dt.days // 365

# Drop the columns as it's no longer needed
train_data = train_data.drop(columns=['dob', 'trans_num', 'cc_num'])
test_data = test_data.drop(columns=['dob', 'trans_num', 'cc_num'])

# Extract useful features from the datetime column
train_data['Year'] = train_data['trans_date_trans_time'].dt.year
train_data['Month'] = train_data['trans_date_trans_time'].dt.month
train_data['Day'] = train_data['trans_date_trans_time'].dt.day
train_data['Hour'] = train_data['trans_date_trans_time'].dt.hour
train_data['Minute'] = train_data['trans_date_trans_time'].dt.minute

test_data['Year'] = test_data['trans_date_trans_time'].dt.year
test_data['Month'] = test_data['trans_date_trans_time'].dt.month
test_data['Day'] = test_data['trans_date_trans_time'].dt.day
test_data['Hour'] = test_data['trans_date_trans_time'].dt.hour
test_data['Minute'] = test_data['trans_date_trans_time'].dt.minute

# Drop the original datetime column if it's no longer needed
train_data = train_data.drop(columns=['trans_date_trans_time'])
test_data = test_data.drop(columns=['trans_date_trans_time'])

# Identify categorical columns 
categorical_columns = ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']  # replace with actual names

# Apply frequency encoding
def frequency_encoding(df, col):
    freq = df[col].value_counts()
    df[col] = df[col].map(freq)
    return df

for col in categorical_columns:
    train_data = frequency_encoding(train_data, col)
    test_data = frequency_encoding(test_data, col)
    
# Feature selection (separate features and target variable)
X_train = csr_matrix(train_data.drop(columns=['is_fraud'])) 
y_train = train_data['is_fraud']  

X_test = csr_matrix(test_data.drop(columns=['is_fraud']))  
y_test = test_data['is_fraud'] 

# Sample a subset of data if too large
# You can adjust the size according to your system's memory
sample_size = 100000 
if X_train.shape[0] > sample_size:
    X_train, y_train = X_train[:sample_size], y_train[:sample_size]

# Convert to sparse matrix early
X_train = csr_matrix(X_train)
X_test = csr_matrix(X_test)

# Handle missing values in y_train
y_train = y_train.dropna()  # Remove rows with missing target values
X_train = X_train[~y_train.index]  # Ensure alignment with X_train

# Address is_fraud imbalance in the training data
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler(with_mean=False)
X_train_res = scaler.fit_transform(X_train_res)
X_test = scaler.transform(X_test)

# Train a model (Random Forest in this example)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_res, y_train_res)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9956164896287513
[[553283    291]
 [  2145      0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [5]:
#Importing Libraries

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
import numpy as np
from scipy.sparse import csr_matrix


In [13]:
# Load the training and testing datasets

train_data = pd.read_csv(r"C:\Users\aksha\OneDrive\Desktop\fraudTrain.csv")
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0.0,01-01-2019 00:00,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",09-03-1988,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1.0,01-01-2019 00:00,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149.0,Special educational needs teacher,21-06-1978,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2.0,01-01-2019 00:00,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154.0,Nature conservation officer,19-01-1962,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3.0,01-01-2019 00:01,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939.0,Patent attorney,12-01-1967,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4.0,01-01-2019 00:03,375534000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,28-03-1986,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [17]:
# Load the training and testing datasets

test_data = pd.read_csv(r"C:\Users\aksha\OneDrive\Desktop\fraudTest.csv")
test_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [19]:
#Data Preprocessing 

# Convert 'dob' column to datetime object
train_data['dob'] = pd.to_datetime(train_data['dob'], format='%d-%m-%Y')
test_data['dob'] = pd.to_datetime(test_data['dob'], format='%Y-%m-%d')

# Convert the Date-Time string to datetime object
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'], format="%d-%m-%Y %H:%M", dayfirst=True)
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])

# Calculate age at the time of transaction
train_data['age'] = (train_data['trans_date_trans_time'] - train_data['dob']).dt.days // 365
test_data['age'] = (test_data['trans_date_trans_time'] - test_data['dob']).dt.days // 365

# Drop the columns as they are no longer needed
train_data = train_data.drop(columns=['dob', 'trans_num', 'cc_num'])
test_data = test_data.drop(columns=['dob', 'trans_num', 'cc_num'])


In [21]:
#Feature Engineering

# Extract useful features from the datetime column
train_data['Year'] = train_data['trans_date_trans_time'].dt.year
train_data['Month'] = train_data['trans_date_trans_time'].dt.month
train_data['Day'] = train_data['trans_date_trans_time'].dt.day
train_data['Hour'] = train_data['trans_date_trans_time'].dt.hour
train_data['Minute'] = train_data['trans_date_trans_time'].dt.minute

test_data['Year'] = test_data['trans_date_trans_time'].dt.year
test_data['Month'] = test_data['trans_date_trans_time'].dt.month
test_data['Day'] = test_data['trans_date_trans_time'].dt.day
test_data['Hour'] = test_data['trans_date_trans_time'].dt.hour
test_data['Minute'] = test_data['trans_date_trans_time'].dt.minute

# Drop the original datetime column if it's no longer needed
train_data = train_data.drop(columns=['trans_date_trans_time'])
test_data = test_data.drop(columns=['trans_date_trans_time'])


In [23]:
#Categorical Encoding

# Identify categorical columns 
categorical_columns = ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']

# Apply frequency encoding
def frequency_encoding(df, col):
    freq = df[col].value_counts()
    df[col] = df[col].map(freq)
    return df

for col in categorical_columns:
    train_data = frequency_encoding(train_data, col)
    test_data = frequency_encoding(test_data, col)


In [25]:
#Feature Selection and Sampling

# Feature selection 
X_train = csr_matrix(train_data.drop(columns=['is_fraud'])) 
y_train = train_data['is_fraud']  

X_test = csr_matrix(test_data.drop(columns=['is_fraud']))  
y_test = test_data['is_fraud'] 

# Sample a subset of data if too large
sample_size = 100000 
if X_train.shape[0] > sample_size:
    X_train, y_train = X_train[:sample_size], y_train[:sample_size]

# Convert to sparse matrix early
X_train = csr_matrix(X_train)
X_test = csr_matrix(X_test)


In [27]:
# Handle missing values in y_train
y_train = y_train.dropna()  # Remove rows with missing target values
X_train = X_train[~y_train.index]  # Ensure alignment with X_train

# Address is_fraud imbalance in the training data
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)


In [29]:
#Scaling, Model Training, and Evaluation

# Scale the features
scaler = StandardScaler(with_mean=False)
X_train_res = scaler.fit_transform(X_train_res)
X_test = scaler.transform(X_test)

# Train a model (Random Forest in this example)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_res, y_train_res)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9956164896287513
[[553283    291]
 [  2145      0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

