In [None]:
#Data Exploration and Preprocessing:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('c:\\creditcard.csv')

# Explore the dataset
print(data.head())
print(data.info())
print(data.describe())

# Handle missing values
data.dropna(inplace=True)

# Analyze target variable distribution
target_counts = data['Amount'].value_counts()  # Replace 'fraud_target_column' with the actual target column name
print(target_counts)
sns.countplot(x='Amount', data=data)

# Remove outliers (if needed)
# Define outlier removal logic
# For example, you can use z-score based outlier removal
from scipy.stats import zscore
z_scores = zscore(data['transaction_amount'])
data = data[(z_scores < 3)]  # Keep only rows with z-score < 3

# Scale numerical variables
# You might want to scale the numerical variables if the algorithm requires it
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data[['transaction_amount', 'time', 'numerical_feature1', 'numerical_feature2']] = scaler.fit_transform(data[['transaction_amount', 'time', 'numerical_feature1', 'numerical_feature2']])

# Now the data is explored, missing values are handled, outliers are removed, and numerical variables are scaled.
# You can proceed to the next tasks.


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [1]:
#Feature Engineering:
import pandas as pd

# Load the dataset
data = pd.read_csv('c:\creditcard.csv')

# Create new features
data['half_amount'] = data['Amount'].apply(lambda x: int(x / 2))

# Creating statistical features
data['total_amount_std'] = data.groupby('half_amount')['Amount'].transform('std')
print(data.info())

# You can add more feature engineering steps based on your analysis


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 33 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Time              284807 non-null  float64
 1   V1                284807 non-null  float64
 2   V2                284807 non-null  float64
 3   V3                284807 non-null  float64
 4   V4                284807 non-null  float64
 5   V5                284807 non-null  float64
 6   V6                284807 non-null  float64
 7   V7                284807 non-null  float64
 8   V8                284807 non-null  float64
 9   V9                284807 non-null  float64
 10  V10               284807 non-null  float64
 11  V11               284807 non-null  float64
 12  V12               284807 non-null  float64
 13  V13               284807 non-null  float64
 14  V14               284807 non-null  float64
 15  V15               284807 non-null  float64
 16  V16               28

In [3]:
#Model Selection and Training:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix, precision_score, recall_score
import plotly.express as px

# Load the dataset
data = pd.read_csv('c:\creditcard.csv')

# Assuming you have already performed data preprocessing, feature engineering, and defined X and y

# Define X and y based on your dataset
X = data.drop('Class', axis=1)  # Drop the target variable
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
svc = SVC()
nb = GaussianNB()

models = [svc, nb]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=1)  # Set zero_division to 1
    recall = recall_score(y_test, y_pred)
    
    print(type(model).__name__, "Model Test Accuracy Score is:", accuracy)
    print(type(model).__name__, "Model Test F1 Score is:", f1)
    print(type(model).__name__, "Model Test Precision Score is:", precision)
    print(type(model).__name__, "Model Test Recall Score is:", recall)

SVC Model Test Accuracy Score is: 0.9982795547909132
SVC Model Test F1 Score is: 0.0
SVC Model Test Precision Score is: 1.0
SVC Model Test Recall Score is: 0.0
GaussianNB Model Test Accuracy Score is: 0.9930128857835048
GaussianNB Model Test F1 Score is: 0.2375478927203065
GaussianNB Model Test Precision Score is: 0.14622641509433962
GaussianNB Model Test Recall Score is: 0.6326530612244898


In [None]:
#Model Evaluation and Fine-tuning:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = pd.read_csv('c:\creditcard.csv')

# Define X and y based on your dataset
X = data.drop('Class', axis=1)  # Drop the target variable
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier model
model = RandomForestClassifier(random_state=42)

# Define hyperparameter grid for RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model with the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score from GridSearchCV
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Get predictions on the test data
y_pred = grid_search.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print("Best Parameters:", best_params)
print("Best Score:", best_score)
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)


In [None]:
#Model Deployment and Prediction:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the trained model
model = joblib.load('models/trained_model.pkl')

# Function to predict fraud
def predict_fraud(transaction_data):
    prediction = model.predict([transaction_data])
    if prediction == 1:
        return "Fraudulent"
    else:
        return "Non-Fraudulent"

# Command-line interface
def main():
    print("Credit Card Fraud Detection")
    print("--------------------------")

    while True:
        try:
            amount = float(input("Enter transaction amount: "))
            time = float(input("Enter transaction time: "))
            feature1 = float(input("Enter numerical feature 1: "))
            feature2 = float(input("Enter numerical feature 2: "))
            
            transaction_data = [amount, time, feature1, feature2]

            prediction = predict_fraud(transaction_data)
            print(f"The transaction is predicted to be: {prediction}")
            
            another = input("Do you want to predict another transaction? (yes/no): ")
            if another.lower() != 'yes':
                break
        except ValueError:
            print("Invalid input. Please enter numeric values.")

if __name__ == "__main__":
    main()
