<a href="https://colab.research.google.com/github/121013jesi/magazine/blob/main/NM_phase_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only cell: upload the dataset file from the local machine into the
# runtime. Later cells read it back from '/content/CreditCardData - Copy.csv'.
from google.colab import files
# The value returned by the upload call is kept in `uploaded`.
uploaded = files.upload()

Saving CreditCardData - Copy.csv to CreditCardData - Copy.csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('/content/CreditCardData - Copy.csv')

# Data Preprocessing
# Report the number of missing values per column before any cleaning
print(df.isnull().sum())

# Drop every row that has a missing value. The explicit copy makes `df` an
# independent frame, so the column assignments below never target a view of
# the frame returned by read_csv.
df = df.dropna().copy()

# Convert 'Amount' to numeric after removing the currency symbol
df['Amount'] = df['Amount'].replace('[£]', '', regex=True).astype(float)

# Convert 'Date' to datetime. errors='coerce' turns unparseable dates into NaT,
# which would otherwise leak NaN Day/Month/Year values into the features after
# the dropna above; report and remove those rows explicitly instead.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y', errors='coerce')
unparsed_dates = df['Date'].isna()
if unparsed_dates.any():
    print(f"Dropping {int(unparsed_dates.sum())} rows with unparseable 'Date' values")
    df = df.loc[~unparsed_dates].copy()

# Extract day, month, and year as separate numeric features
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

# Drop the original 'Date' column
df = df.drop('Date', axis=1)

# Verify data types
print(df.dtypes)

# One-hot encode categorical columns
categorical_columns = ['Day of Week', 'Type of Card', 'Entry Mode', 'Type of Transaction', 'Merchant Group',
                       'Country of Transaction', 'Shipping Address', 'Country of Residence', 'Gender', 'Bank']

# drop_first=True drops one indicator column per categorical feature
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Verify there are no missing values after encoding
print(df.isnull().sum())

# Define the feature set and target variable
X = df.drop(['Transaction ID', 'Fraud'], axis=1)
y = df['Fraud']

# Split the data into training and testing sets. Stratifying on the label keeps
# the fraud / non-fraud ratio the same in both splits, which matters because
# fraud is the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Initialize and train the Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict the target variable on the test set
y_pred = clf.predict(X_test)

# Evaluate the model using accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Transaction ID             0
Date                       0
Day of Week                0
Time                       0
Type of Card               0
Entry Mode                 0
Amount                     6
Type of Transaction        0
Merchant Group            10
Country of Transaction     0
Shipping Address           5
Country of Residence       0
Gender                     4
Age                        0
Bank                       0
Fraud                      0
dtype: int64
Transaction ID             object
Day of Week                object
Time                        int64
Type of Card               object
Entry Mode                 object
Amount                    float64
Type of Transaction        object
Merchant Group             object
Country of Transaction     object
Shipping Address           object
Country of Residence       object
Gender                     object
Age                       float64
Bank                       object
Fraud                       int64
Day          

In [None]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Step 2: Load and Preprocess Data
file_path = '/content/CreditCardData - Copy.csv'
credit_card_data = pd.read_csv(file_path)

# Fill missing values with the most frequent value (mode) of each column.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
for column in credit_card_data.columns:
    if credit_card_data[column].isnull().any():
        most_frequent = credit_card_data[column].mode()[0]
        credit_card_data[column] = credit_card_data[column].fillna(most_frequent)

# Convert 'Amount' to a numerical value; regex=False strips the literal
# currency symbol regardless of the pandas version's default regex mode.
credit_card_data['Amount'] = (
    credit_card_data['Amount'].str.replace('£', '', regex=False).astype(float)
)

# Convert categorical variables to numerical values using Label Encoding.
# Each fitted encoder is kept so the integer codes can be mapped back to the
# original category labels later.
label_encoders = {}
categorical_columns = ['Type of Card', 'Entry Mode', 'Type of Transaction', 'Merchant Group',
                       'Country of Transaction', 'Shipping Address', 'Country of Residence',
                       'Gender', 'Bank']

for column in categorical_columns:
    le = LabelEncoder()
    credit_card_data[column] = le.fit_transform(credit_card_data[column])
    label_encoders[column] = le

# Define features and target variable ('Date' and 'Day of Week' are left out
# of the feature set here, together with the identifier and the label)
X = credit_card_data.drop(columns=['Transaction ID', 'Date', 'Day of Week', 'Fraud'])
y = credit_card_data['Fraud']

# Step 3: Split the dataset into training and test sets. Stratifying on the
# label keeps the fraud / non-fraud ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Step 4: Define Models and Train
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features the fitted model predicts on.
        y_test: True labels the predictions are compared against.

    Returns:
        tuple: ``(accuracy, precision, recall, f1, roc_auc, conf_matrix,
        class_report)`` in that order, as produced by the scikit-learn
        metric functions of the same names.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the ranking score for ROC AUC
    # (presumably the probability of the positive class; verify for
    # non-binary targets).
    positive_scores = model.predict_proba(X_test)[:, 1]

    return (
        accuracy_score(y_test, predicted_labels),
        precision_score(y_test, predicted_labels),
        recall_score(y_test, predicted_labels),
        f1_score(y_test, predicted_labels),
        roc_auc_score(y_test, positive_scores),
        confusion_matrix(y_test, predicted_labels),
        classification_report(y_test, predicted_labels),
    )

# Initialize the models
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Train and evaluate each model on the same train/test split
logistic_metrics = train_evaluate_model(logistic_model, X_train, y_train, X_test, y_test)
rf_metrics = train_evaluate_model(rf_model, X_train, y_train, X_test, y_test)
gb_metrics = train_evaluate_model(gb_model, X_train, y_train, X_test, y_test)

# Step 5: Display Results
# Each result is the 7-tuple returned by train_evaluate_model. Printing the
# tuple directly shows the classification report as an escaped string with
# literal "\n", so unpack it and print every metric on its own labelled line.
# The unpacked names are deliberately distinct from `accuracy`, `precision`,
# `recall` and `f1` so earlier notebook variables are not overwritten.
for heading, result in (
    ("Logistic Regression Metrics:", logistic_metrics),
    ("Random Forest Metrics:", rf_metrics),
    ("Gradient Boosting Metrics:", gb_metrics),
):
    acc, prec, rec, f1_value, auc, cm, report = result
    print(heading)
    print(f"  Accuracy : {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall   : {rec:.4f}")
    print(f"  F1-Score : {f1_value:.4f}")
    print(f"  ROC AUC  : {auc:.4f}")
    print("  Confusion matrix:")
    print(cm)
    print("  Classification report:")
    print(report)


Logistic Regression Metrics:
 (0.9569, 0.7881438289601554, 0.5573883161512028, 0.6529790660225443, 0.9490397556236849, array([[18327,   218],
       [  644,   811]]), '              precision    recall  f1-score   support\n\n           0       0.97      0.99      0.98     18545\n           1       0.79      0.56      0.65      1455\n\n    accuracy                           0.96     20000\n   macro avg       0.88      0.77      0.82     20000\nweighted avg       0.95      0.96      0.95     20000\n')
Random Forest Metrics:
 (0.98775, 0.9786392405063291, 0.8501718213058419, 0.9098933431408607, 0.9941420840363229, array([[18518,    27],
       [  218,  1237]]), '              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99     18545\n           1       0.98      0.85      0.91      1455\n\n    accuracy                           0.99     20000\n   macro avg       0.98      0.92      0.95     20000\nweighted avg       0.99      0.99      0.99     20000