In [1]:
from google.colab import files

files.upload()  # Upload kaggle.json here

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"gurrammoneshreddy","key":"07c1fc0cfb350a06adaba20bc6321c0d"}'}

In [2]:
!pip install kaggle
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [3]:
!kaggle datasets download -d kartik2112/fraud-detection
!unzip fraud-detection.zip

Dataset URL: https://www.kaggle.com/datasets/kartik2112/fraud-detection
License(s): CC0-1.0
Downloading fraud-detection.zip to /content
 93% 187M/202M [00:01<00:00, 158MB/s]
100% 202M/202M [00:01<00:00, 140MB/s]
Archive:  fraud-detection.zip
  inflating: fraudTest.csv           
  inflating: fraudTrain.csv          


In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 📌 *1. Load Data*
train_df = pd.read_csv("/content/fraudTrain.csv")
test_df = pd.read_csv("/content/fraudTest.csv")

# 📌 *2. Remove Null Values*
train_df = train_df.dropna()
test_df = test_df.dropna()

# 📌 *3. Drop Unnecessary Columns*
drop_cols = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'city',
             'state', 'zip', 'job', 'dob', 'trans_num']
train_df = train_df.drop(columns=drop_cols)
test_df = test_df.drop(columns=drop_cols)

# 📌 *4. Convert Categorical Features*
encoder = LabelEncoder()
categorical_cols = ['merchant', 'category', 'gender']
for col in categorical_cols:
    train_df[col] = encoder.fit_transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

print("\n✅ Categorical Features Encoded!")

# 📌 *5. Select Important Features*
correlation_matrix = train_df.corr()
top_features = correlation_matrix["is_fraud"].abs().sort_values(ascending=False).index[1:7].tolist()

print("\n✅ Selected Features:", top_features)

# 📌 *6. Define Features & Target*
X = train_df[top_features]
y = train_df['is_fraud']

# 📌 *7. Handle Class Imbalance Using SMOTE*
smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\n✅ SMOTE Applied: Balanced Data")

# 📌 *8. Train-Test Split*
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 📌 *9. Scale Numerical Features*
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# ✅ Save the scaler for future predictions
joblib.dump(scaler, "scaler.pkl")

print("✅ Data Preprocessing Done!")

# 📌 *10. Train Logistic Regression Model*
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# 📌 *11. Evaluate Model*
y_pred = logistic_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"\n🔹 Logistic Regression Accuracy: {accuracy:.4f}")
print(classification_report(y_val, y_pred))

# 📌 *12. Save the Model*
joblib.dump(logistic_model, "best_fraud_model.pkl")
print("\n✅ Logistic Regression Model Saved as 'best_fraud_model.pkl'")


✅ Categorical Features Encoded!

✅ Selected Features: ['amt', 'category', 'gender', 'unix_time', 'Unnamed: 0', 'city_pop']

✅ SMOTE Applied: Balanced Data
✅ Data Preprocessing Done!

🔹 Logistic Regression Accuracy: 0.9186
              precision    recall  f1-score   support

           0       0.92      0.98      0.95    257613
           1       0.93      0.70      0.80     77571

    accuracy                           0.92    335184
   macro avg       0.92      0.84      0.87    335184
weighted avg       0.92      0.92      0.91    335184


✅ Logistic Regression Model Saved as 'best_fraud_model.pkl'


In [6]:
import joblib
import numpy as np
import time
import datetime

# ✅ Load the trained model
model = joblib.load("best_fraud_model.pkl")

# ✅ Load the scaler
scaler = joblib.load("scaler.pkl")

# ✅ Define Features Used During Training
top_features = ['amt', 'category', 'gender', 'unix_time', 'Unnamed: 0', 'city_pop']

# 📌 Take manual inputs
print("\n🔹 Enter Transaction Details 🔹")
manual_input = []

# Get transaction amount
amt = float(input("Enter Transaction Amount: "))
manual_input.append(amt)

# Get category (Already Encoded)
category = int(input("Enter Category (Encoded Value): "))
manual_input.append(category)

# Get gender (Encoded: 0 = Female, 1 = Male)
gender = int(input("Enter Gender (0 = Female, 1 = Male): "))
manual_input.append(gender)

# Get normal time and convert to Unix time
date_str = input("Enter Transaction Date & Time (YYYY-MM-DD HH:MM:SS): ")
unix_time = int(time.mktime(datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S").timetuple()))
manual_input.append(unix_time)

# Get index (Unnamed: 0)
index = int(input("Enter Index (Unnamed: 0): "))
manual_input.append(index)

# Get city population
city_pop = int(input("Enter City Population: "))
manual_input.append(city_pop)

# 📌 Convert input to numpy array & reshape
manual_input = np.array(manual_input).reshape(1, -1)

# 📌 Scale the features
manual_input_scaled = scaler.transform(manual_input)

# 📌 Make a prediction
prediction = model.predict(manual_input_scaled)

# 📌 Convert result to human-readable format
result = "Fraudulent Transaction 🚨" if prediction[0] == 1 else "Legitimate Transaction ✅"

# 📌 Display the prediction
print("\n🟢 Prediction:", result)


🔹 Enter Transaction Details 🔹
Enter Transaction Amount: 100000
Enter Category (Encoded Value): 3
Enter Gender (0 = Female, 1 = Male): 1
Enter Transaction Date & Time (YYYY-MM-DD HH:MM:SS): 2025-02-22  22:22:22
Enter Index (Unnamed: 0): 2
Enter City Population: 123654656

🟢 Prediction: Fraudulent Transaction 🚨




In [7]:
import joblib

# Save the trained model
joblib.dump(model, "fraud_detection_model.pkl")

print("✅ Model saved as 'fraud_detection_model.pkl'")

✅ Model saved as 'fraud_detection_model.pkl'
