In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# **Loading the Dataset**

In [None]:
# Read the raw transaction table from disk, then print its column names,
# dtypes and non-null counts as a first sanity check.
df = pd.read_csv(filepath_or_buffer='/content/creditcard_dataset.csv')
df.info()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Per-column count of missing values (expected to be all zeros after dropna).
df.isna().sum()

In [None]:
# Count transactions per class. `value_counts()` orders by frequency, whereas
# seaborn draws numeric categories in ascending label order, so the counts are
# sorted by label to guarantee that the i-th annotation sits on the i-th bar.
fraud_counts = df['is_fraud'].value_counts().sort_index()

ax = sns.barplot(x=fraud_counts.index, y=fraud_counts.values)
ax.set_title('Distribution of Fraud')
ax.set_xlabel('Fraud')
ax.set_ylabel('Count')

# Write the exact count just above each bar.
for bar_position, class_total in enumerate(fraud_counts.values):
    ax.text(bar_position, class_total + 50, str(class_total), ha='center', va='bottom')
plt.show()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

# **Data Preprocessing**

In [None]:
# Partition the rows by class label and report the size of each class.
fraud = df.loc[df["is_fraud"] == 1]
not_fraud = df.loc[df["is_fraud"] == 0]
print(len(fraud))
print(len(not_fraud))

The code below balances the imbalanced dataset by undersampling the majority class (non-fraudulent transactions) to match the number of samples in the minority class (fraudulent transactions). First, it randomly selects a subset of non-fraudulent samples with the same number of rows as the fraudulent samples, so that the `not_fraud` DataFrame has as many rows as `fraud`. Then, it combines this subset of non-fraudulent samples with the fraudulent samples to create a new balanced dataset, `data`. This approach, known as undersampling, helps ensure that the model doesn't become biased toward predicting the majority class (non-fraud), thereby improving its ability to detect fraud accurately.

In [None]:
# Undersample the majority class down to the size of the minority class.
# A fixed seed is required here: without it every run draws a different subset,
# so the trained models and all reported metrics change from run to run.
not_fraud = not_fraud.sample(n=fraud.shape[0], random_state=42)

# Balanced dataset: all fraud rows followed by the sampled non-fraud rows.
data = pd.concat([fraud, not_fraud])

Graph Representation of the New Distribution

In [None]:
# Recount the classes on the undersampled frame and redraw the bar chart.
fraud_counts = data['is_fraud'].value_counts()
ax = sns.barplot(x=fraud_counts.index, y=fraud_counts.values)
ax.set(title='New Distribution of Fraud', xlabel='Fraud', ylabel='Count')

# Annotate each bar with its row count.
for bar_position, class_total in enumerate(fraud_counts.values):
    ax.text(bar_position, class_total + 50, str(class_total), ha='center', va='bottom')
plt.show()

# **Feature Engineering**

In [None]:
# Columns that are not used as model inputs.
unused_cols = [
    'Unnamed: 0', 'first', 'last', 'unix_time', 'street', 'gender',
    'job', 'dob', 'city', 'state', 'trans_num', 'merchant',
]
# Rebind `data` to the reduced frame instead of mutating it in place.
data = data.drop(columns=unused_cols)

In [None]:
# Print the remaining columns, their dtypes and non-null counts after the drop.
data.info()

In [None]:
# Parse the transaction timestamp once, expand it into calendar/clock
# components, and then drop the raw timestamp column.
trans_ts = pd.to_datetime(data['trans_date_trans_time'])
data = data.assign(
    trans_day=trans_ts.dt.day,
    trans_month=trans_ts.dt.month,
    trans_year=trans_ts.dt.year,
    trans_hour=trans_ts.dt.hour,
    trans_minute=trans_ts.dt.minute,
).drop(columns=['trans_date_trans_time'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace `category` and `cc_num` with integer codes. The same encoder object
# is re-fitted for each column, so afterwards it only holds the `cc_num` classes.
encoder = LabelEncoder()
for column_name in ('category', 'cc_num'):
    data[column_name] = encoder.fit_transform(data[column_name])
data.head()

In [None]:
# Standardise the wide-range numeric columns to zero mean / unit variance.
# All three columns are fitted in one call so `scaler` keeps the statistics of
# every scaled column (re-fitting per column discarded the earlier ones).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features; consider fitting on
# X_train only.
scaled_cols = ['amt', 'zip', 'city_pop']
scaler = StandardScaler()
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

# `cc_num` was already label-encoded in the previous cell; encoding the integer
# codes a second time is a no-op, so that redundant step has been removed.

In [None]:
# Separate the target column from the predictor columns.
y = data['is_fraud']
X = data.drop(columns='is_fraud')

# **Splitting the Data Set**

This code splits the dataset into training (30%) and testing (70%) sets. Setting `random_state=0` makes the split itself deterministic: given the same input rows, the same rows always end up in the training and testing sets.

In [None]:
# Hold out 70% of the rows for testing; the remaining 30% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=0,
)

# **Building the Model**

**DecisionTree**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Fit a decision tree on the training split and predict the held-out rows.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Report the test-set scores, one per line, followed by the confusion matrix.
print("Decision Tree")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\n")


**Random Forest**

In [None]:
# Fit a 100-tree random forest on the training split and predict the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Report the test-set scores, one per line, followed by the confusion matrix.
print("Random Forest")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\n")


**Gradient Boosting**

In [None]:
# Fit a 100-stage gradient boosting classifier and predict the held-out rows.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Report the test-set scores, one per line, followed by the confusion matrix.
print("Gradient Boosting")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\n")


**Logistic Regression**

In [None]:
# Train Logistic Regression model.
# Only `amt`, `zip` and `city_pop` were standardised, so most features are still
# on their raw scales; the default budget of 100 solver iterations is unlikely
# to converge on such inputs, hence the larger `max_iter`.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Print metrics for Logistic Regression
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\n")


# **Visualizing the Results & Comparing the Models**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to plot confusion matrix
def plot_confusion_matrix(y_test, y_pred, title):
    """Draw the confusion matrix of one model as an annotated heatmap.

    Args:
        y_test: True class labels.
        y_pred: Labels predicted by the model for the same rows.
        title: Model name appended to the figure title.
    """
    matrix = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 6))
    # Integer annotations ('d') show raw counts in every cell.
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {title}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Plot one confusion matrix per model, in the same order as before.
for model_predictions, model_title in (
    (y_pred_lr, 'Logistic Regression'),
    (y_pred_dt, 'Decision Tree'),
    (y_pred_rf, 'Random Forest'),
    (y_pred_gb, 'Gradient Boosting'),
):
    plot_confusion_matrix(y_test, model_predictions, model_title)


In [None]:
# Function to compute and return metrics (nothing is printed here)
def get_metrics(y_test, y_pred):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels for the same rows.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
    return tuple(scorer(y_test, y_pred) for scorer in scorers)

# Collect the metric tuple of every model, keyed by model name.
predictions_by_model = {
    'Logistic Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Gradient Boosting': y_pred_gb,
}
metrics_dict = {
    model_name: get_metrics(y_test, model_predictions)
    for model_name, model_predictions in predictions_by_model.items()
}

# One row per model, one column per metric; the confusion matrix column is
# dropped because it cannot be shown in the bar charts.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Confusion Matrix']
metrics_df = (
    pd.DataFrame(metrics_dict, index=metric_names)
    .transpose()
    .drop(columns=['Confusion Matrix'])
)
print(metrics_df)


# Plot the metrics: one bar chart per metric, one bar per model.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, metric in zip(axes.flatten(), metrics):
    # The table was built from mixed tuples, so its columns are object dtype;
    # cast to float explicitly before plotting.
    scores = metrics_df[metric].astype(float)
    scores.plot(kind='bar', ax=ax)
    # Iterate by position: the Series is indexed by model name, and integer
    # keys on a label-indexed Series (`series[i]`) are deprecated in pandas 2.x.
    for bar_position, score in enumerate(scores):
        ax.text(bar_position, score + 0.01, round(score, 4), ha='center')
    ax.set_title(f'{metric} Scores by Model')
    ax.set_ylabel(metric)
    ax.set_ylim(0, 1)
    ax.set_xticks(range(len(metrics_df)))
    ax.set_xticklabels(metrics_df.index, rotation=45)

plt.tight_layout()
plt.show()


Based on the results above, Gradient Boosting is the best model for this credit card fraud detection project. In the recorded run it achieved the highest accuracy at 0.946720, and its precision of 0.965330 means that legitimate transactions are rarely flagged as fraudulent. With a recall of 0.928712, it identifies most fraudulent transactions, and the balanced F1 score of 0.946667 further confirms its overall robustness and reliability. These metrics indicate that Gradient Boosting is highly effective in detecting fraudulent transactions while minimizing false positives and negatives. Its ability to handle complex data patterns and reduce overfitting makes it the optimal choice for this project.

In [None]:

# Initial evaluation
def evaluate_model(y_test, y_pred, model_name):
    """Print accuracy, precision, recall, F1 and the confusion matrix for a model.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels for the same rows.
        model_name: Name shown in the report header.
    """
    print(f"Metrics for {model_name}")
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for metric_label, metric_fn in scalar_metrics:
        print(metric_label, metric_fn(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")

# Baseline report for the untuned Gradient Boosting model.
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_gb,
    model_name="Default Gradient Boosting",
)


# **Features That Helped in Model Prediction**

**Interpretation**

**High Importance Features:** These are the features that have the highest bars in the plot. They are the most influential in the model's decision-making process.

**Low Importance Features:** These features have the smallest bars and contributed the least to the model's predictions. Sometimes, you may decide to remove these features to simplify the model without losing much predictive power.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Read the importances from the fitted Gradient Boosting model and order the
# features from least to most important (barh draws the first entry at the bottom).
feature_importances = gb_model.feature_importances_
features = X.columns
indices = np.argsort(feature_importances)
bar_positions = range(len(indices))

# Horizontal bar chart: one bar per feature, labelled with the column name.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Feature Importances: Gradient Boosting')
ax.barh(bar_positions, feature_importances[indices], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel('Relative Importance')
plt.show()


# **Fine-Tuning and Improvements**

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid.
# NOTE: this is an exhaustive search over 3**7 = 2187 combinations with 3-fold
# CV (6561 fits); expect a long runtime.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0],
    # 'auto' is no longer accepted by GradientBoostingClassifier (removed in
    # scikit-learn 1.3) and made a third of the fits fail; None (use all
    # features) takes its place.
    'max_features': [None, 'sqrt', 'log2']
}

# Use a separate name for the search's base estimator so the fitted `gb_model`
# from the baseline cell stays available (the feature-importance cell reads it).
gb_base_estimator = GradientBoostingClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gb_base_estimator, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and the best cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Score the tuned model on the held-out test set so it can be compared with the
# default Gradient Boosting model and with the optimized Random Forest.
best_gb_model = grid_search.best_estimator_
y_pred_best_gb = best_gb_model.predict(X_test)
evaluate_model(y_test, y_pred_best_gb, "Optimized Gradient Boosting")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the Random Forest search.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 30, 40, 50, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base estimator handed to the grid search.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive 3-fold cross-validated search using all available cores.
grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Keep the best estimator found by the search and predict the held-out rows with it.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

# Print metrics for the optimized Random Forest
def print_metrics(y_test, y_pred, model_name):
    """Print accuracy, precision, recall, F1 and the confusion matrix for a model.

    This is kept as a thin wrapper around `evaluate_model`, defined earlier in
    the notebook, which produces exactly the same report; delegating avoids
    maintaining two copies of the same reporting code.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels for the same rows.
        model_name: Name shown in the report header.
    """
    evaluate_model(y_test, y_pred, model_name)

# Test-set report for the tuned Random Forest.
print_metrics(
    y_test=y_test,
    y_pred=y_pred_best_rf,
    model_name="Optimized Random Forest",
)


# **Conclusion**

After thorough evaluation and comparison of the models, Gradient Boosting emerges as the best model for this credit card fraud detection project. Initially, Gradient Boosting showcased superior performance with high accuracy (0.944389), precision (0.962636), recall (0.92675), and F1 score (0.944352); these figures come from a different run than the ones quoted in the model comparison section, which is why the two sets of numbers differ slightly. Despite further hyperparameter optimization using Grid Search, this model maintained its strong performance, indicating robustness and effectiveness in handling complex data patterns and non-linear relationships crucial for fraud detection.

Although Random Forest also demonstrated excellent performance, especially after optimization, Gradient Boosting consistently outperformed it across key metrics, making it the optimal choice for minimizing both false positives and false negatives in fraud detection. Therefore, deploying Gradient Boosting should provide the most reliable and accurate results for detecting fraudulent transactions in this dataset.