<a href="https://colab.research.google.com/github/Alikepallimahalakshmi/CODESOFT-TASKS/blob/main/CREDIT_CARD_FRAUD_DETECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE  # For handling class imbalance

# Read the credit-card transactions CSV into `data`.
# When the file is missing, `data` is set to None and a hint is printed so the
# rest of the cell can skip itself instead of raising.
try:
    data = pd.read_csv('/content/creditcard.csv')
except FileNotFoundError:
    data = None
    print("Error: 'creditcard.csv' not found. Please provide the correct file path.")
else:
    # Only reached when the read above did not raise.
    print("Data loaded successfully!")

def _scale_time_amount(features, fitted_scaler, columns=('Time', 'Amount')):
    """Standardize ``columns`` of ``features`` with an already-fitted scaler.

    Parameters
    ----------
    features : pd.DataFrame
        Feature matrix containing ``columns`` plus any other columns.
    fitted_scaler : object
        Scaler exposing ``transform``; it must already be fitted on ``columns``.
    columns : sequence of str
        Names of the columns to standardize.

    Returns
    -------
    pd.DataFrame
        New frame with the scaled columns first, followed by the remaining
        columns unchanged. The row index is the same as that of ``features``.
    """
    columns = list(columns)
    # Reuse the caller's index: a default RangeIndex would make the axis=1
    # concat below align on mismatched labels and introduce NaN-filled rows.
    scaled = pd.DataFrame(
        fitted_scaler.transform(features[columns]),
        columns=columns,
        index=features.index,
    )
    return pd.concat([scaled, features.drop(columns=columns)], axis=1)


if data is not None:
    # Check for NaN values in the 'Class' column
    print("\nNumber of NaN values in 'Class' column before handling:", data['Class'].isnull().sum())

    # Remove rows where 'Class' is NaN (the original row labels are kept)
    data_cleaned = data.dropna(subset=['Class'])

    # Check the number of NaN values again to confirm removal
    print("Number of NaN values in 'Class' column after handling:", data_cleaned['Class'].isnull().sum())

    # Separate features (X) and target (y) from the cleaned data.
    # The recorded output shows float labels (0.0 / 1.0); with the NaN rows
    # removed they are cast to int so reports show 0 / 1.
    X = data_cleaned.drop('Class', axis=1)
    y = data_cleaned['Class'].astype(int)

    # Explore class distribution
    print("\nClass Distribution after handling NaN:")
    print(y.value_counts(normalize=True))

    # Split BEFORE scaling so the scaler never sees test rows (no leakage).
    # Same test_size / random_state / stratification as before, so the same
    # rows end up in each partition.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler for 'Time' and 'Amount' on the training rows only,
    # then apply those training statistics to both partitions.
    scaler = StandardScaler()
    scaler.fit(X_train_raw[['Time', 'Amount']])
    X_train = _scale_time_amount(X_train_raw, scaler)
    X_test = _scale_time_amount(X_test_raw, scaler)

    # Full scaled matrix (using the train-fitted statistics); not used below,
    # kept so that code reading `X_scaled` continues to work.
    X_scaled = _scale_time_amount(X, scaler)

    print("\nTraining set shape:", X_train.shape)
    print("Testing set shape:", X_test.shape)

    # Handle class imbalance using SMOTE (oversampling the minority class).
    # Only the training partition is resampled; the test set is left as is.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    print("\nResampled training set shape:", X_train_resampled.shape)
    print("Resampled target distribution:")
    print(y_train_resampled.value_counts(normalize=True))

    # Train a Logistic Regression model
    model = LogisticRegression(solver='liblinear', random_state=42)
    model.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the test set
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class (1)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Evaluate the model
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nF1-score:", f1_score(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_prob))

Data loaded successfully!

Number of NaN values in 'Class' column before handling: 1
Number of NaN values in 'Class' column after handling: 0

Class Distribution after handling NaN:
Class
0.0    0.997982
1.0    0.002018
Name: proportion, dtype: float64

Training set shape: (126245, 30)
Testing set shape: (54105, 30)

Resampled training set shape: (251980, 30)
Resampled target distribution:
Class
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     53996
         1.0       0.08      0.93      0.15       109

    accuracy                           0.98     54105
   macro avg       0.54      0.95      0.57     54105
weighted avg       1.00      0.98      0.99     54105


F1-score: 0.1473377097009482
AUC: 0.9868194450013628
