In [None]:
import pandas as pd

# Read the credit-card transactions CSV into a DataFrame
file_path = 'creditcard.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the frame
print(data.head(n=5))

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Report how many missing values each column contains
print(data.isnull().sum())

# Rescale 'Amount' and then 'Time' to zero mean and unit variance.
# The same scaler object is refit for each column, so after this cell it only
# holds the statistics of 'Time'.
scaler = StandardScaler()
amount_values = data['Amount'].values.reshape(-1, 1)
data['Amount'] = scaler.fit_transform(amount_values)
time_values = data['Time'].values.reshape(-1, 1)
data['Time'] = scaler.fit_transform(time_values)

# Balance the classes: draw fraud rows with replacement until there are as
# many of them as there are legitimate rows.
# NOTE(review): the resulting frame contains many exact copies of each fraud
# row. Any split made from it must keep those copies on one side (or the
# split must happen before upsampling); otherwise test rows also appear in
# the training set.
fraud = data[data['Class'] == 1]
legit = data[data['Class'] == 0]

fraud_upsampled = resample(
    fraud,
    replace=True,               # sample with replacement
    n_samples=legit.shape[0],   # match number in majority class
    random_state=42,            # reproducible results
)

# Stack both classes, shuffle the rows reproducibly and renumber the index
data_upsampled = (
    pd.concat([legit, fraud_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Features are every column except 'Class'; 'Class' itself is the target
X = data_upsampled.drop(columns='Class')
y = data_upsampled['Class']

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Split the data into training and testing sets.
#
# X was built by upsampling the fraud class with replacement, so it holds many
# exact copies of the same transactions. A purely row-wise random split puts
# copies of one transaction on both sides, which means the models get scored
# on rows they were trained on. Grouping by a hash of each feature row keeps
# every copy of a transaction on the same side of the split.
#
# test_size is now the fraction of *distinct* rows sent to the test set, which
# is only approximately 20% of all rows.
row_groups = pd.util.hash_pandas_object(X, index=False)
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=row_groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Train Logistic Regression.
# max_iter is raised above the library default because this notebook's
# recorded output shows the solver stopping at its iteration limit
# ("TOTAL NO. of ITERATIONS REACHED LIMIT") before converging.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Train Decision Tree (seeded so a rerun produces the same tree)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Train Random Forest (seeded for reproducibility; n_jobs=-1 builds the trees
# on all available cores without changing the fitted model)
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Print the standard classification metrics for a fitted model
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and print how the predictions compare to ``y_test``.

    Prints, in order, one line each for accuracy, precision, recall and F1,
    followed by the full classification report. Nothing is returned.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True labels the predictions are scored against.
    """
    predictions = model.predict(X_test)

    # (label, scoring function) pairs, in the order they are printed
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_test, predictions)}")

    print(classification_report(y_test, predictions))

# Report every fitted model against the same held-out split, in a fixed order.
# Headings after the first start with a newline so the reports are separated
# by a blank line.
for heading, fitted_model in (
    ("Logistic Regression:", lr_model),
    ("\nDecision Tree:", dt_model),
    ("\nRandom Forest:", rf_model),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)

Logistic Regression:
Accuracy: 0.9510753917310026
Precision: 0.9763966558811977
Recall: 0.9238678393827313
F1 Score: 0.949406223289141
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     57219
           1       0.98      0.92      0.95     56507

    accuracy                           0.95    113726
   macro avg       0.95      0.95      0.95    113726
weighted avg       0.95      0.95      0.95    113726


Decision Tree:
Accuracy: 0.9997537942071294
Precision: 0.9995047315822058
Recall: 1.0
F1 Score: 0.9997523044532121
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57219
           1       1.00      1.00      1.00     56507

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726


Random Forest:
Accuracy: 0.9999736208079067
Precision: 0.9999469120509644
Recall: 1.0
F1 Score

In [None]:
# -*- coding: utf-8 -*-
"""Task2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1VDcEQoCJH4iGoRwRUXMmYdwec1J4bhU3
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# Load the dataset
file_path = 'creditcard.csv'
data = pd.read_csv(file_path)

# Explore the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Keep only rows labelled 0 or 1. The fraud/legit filters below only ever
# selected those two values, so any other label never reached the models.
data_labelled = data[data['Class'].isin([0, 1])]

# Separate features and target
X = data_labelled.drop('Class', axis=1)
y = data_labelled['Class']

# Split the data into training and testing sets BEFORE scaling or resampling.
# Upsampling first would put exact copies of the same fraud rows on both sides
# of the split, so the models would be scored on rows they were trained on.
# stratify=y keeps the rare fraud class represented in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the 'Amount' and 'Time' columns.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics influence the training features.
scaled_columns = ['Amount', 'Time']
scaler = StandardScaler()
X_train_scaled = X_train_raw.copy()
X_test = X_test_raw.copy()
X_train_scaled[scaled_columns] = scaler.fit_transform(X_train_raw[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test_raw[scaled_columns])

# Address class imbalance by upsampling the minority class, in the training
# portion only. The test set keeps its original rows and class proportions.
train_data = X_train_scaled.assign(Class=y_train_raw)
fraud = train_data[train_data['Class'] == 1]
legit = train_data[train_data['Class'] == 0]

fraud_upsampled = resample(fraud,
                           replace=True,  # sample with replacement
                           n_samples=len(legit),  # match number in majority class
                           random_state=42)  # reproducible results

data_upsampled = pd.concat([legit, fraud_upsampled])

# Shuffle the upsampled training set
data_upsampled = data_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

X_train = data_upsampled.drop('Class', axis=1)
y_train = data_upsampled['Class']

# Train Logistic Regression.
# max_iter is raised above the library default because the recorded run of
# this script shows the solver stopping at its iteration limit.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Train Decision Tree (seeded so a rerun produces the same tree)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Train Random Forest (seeded for reproducibility; n_jobs=-1 builds the trees
# on all available cores without changing the fitted model)
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)


# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and print how the predictions compare to ``y_test``.

    Prints accuracy, precision, recall and F1 (one line each) followed by the
    full classification report. Nothing is returned.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True labels the predictions are scored against.
    """
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F1 Score: {f1_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))


# The test set is no longer balanced, so read precision/recall/F1 for the
# fraud class rather than accuracy alone.
print("Logistic Regression:")
evaluate_model(lr_model, X_test, y_test)

print("\nDecision Tree:")
evaluate_model(dt_model, X_test, y_test)

print("\nRandom Forest:")
evaluate_model(rf_model, X_test, y_test)

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
Accuracy: 0.996220869200084
Precision: 0.9924433249370277
Recall: 1.0
F1 Score: 0.9962073324905183
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      2399
         1.0       0.99      1.00      1.00      2364

    accuracy                           1.00      4763
   macro avg       1.00      1.00      1.00      4763
weighted avg       1.00      1.00      1.00      4763


Decision Tree:
Accuracy: 0.9995800965777871
Precision: 0.9991546914623838
Recall: 1.0
F1 Score: 0.9995771670190275
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2399
         1.0       1.00      1.00      1.00      2364

    accuracy                           1.00      4763
   macro avg       1.00      1.00      1.00      4763
weighted avg       1.00      1.00      1.00      4763


Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
              precision    recall  f1-s