In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

file_path = '/content/drive/MyDrive/creditcard.csv'
df = pd.read_csv(file_path)

In [3]:
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Calculate basic summary statistics for numerical columns

In [4]:
df[['Time', 'Amount', 'Class']].describe()

Unnamed: 0,Time,Amount,Class
count,284807.0,284807.0,284807.0
mean,94813.859575,88.349619,0.001727
std,47488.145955,250.120109,0.041527
min,0.0,0.0,0.0
25%,54201.5,5.6,0.0
50%,84692.0,22.0,0.0
75%,139320.5,77.165,0.0
max,172792.0,25691.16,1.0


#Check for missing values in the dataframe

In [5]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

# Check the dimensions of the dataframe

In [6]:
print("Number of rows:", len(df))
print("Number of columns:", len(df.columns))

Number of rows: 284807
Number of columns: 31


# Separate features (X) and target variable (y)

In [10]:
X = df.drop(columns=['Class', 'Time'])
y = df['Class']

# Normalize the features using StandardScaler

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the dataset into training and testing sets (70% train, 30% test)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using oversampling

In [13]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train a Logistic Regression model

In [14]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set

In [15]:
y_predict =  model.predict(X_test)

# Evaluate the model's performance

In [18]:
from sklearn.metrics import confusion_matrix, classification_report

confusion = confusion_matrix(y_test, y_predict)
print(f"Confusion Matrix:\n", confusion)

Confusion Matrix:
 [[55380  1484]
 [    8    90]]


In [19]:
report = classification_report(y_test, y_predict)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962



# Calculate precision, recall, and F1-score

In [20]:
precision = confusion[1, 1] / (confusion[0, 1] + confusion[1, 1])
recall = confusion[1, 1] / (confusion[1, 0] + confusion[1, 1])
f1_score = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Precision: 0.05717916137229987
Recall: 0.9183673469387755
F1-Score: 0.10765550239234449


In [25]:
new_data = {
    'V1': [0.1, 2.0, -1.5],
    'V2': [1.2, 0.5, -0.8],
    'V3': [-0.5, 1.0, -2.3],
    'V4': [1.5, -0.7, 0.9],
    'V5': [0.8, -1.2, 1.7],
    'V6': [-0.3, 0.6, -1.0],
    'V7': [0.7, -1.5, 1.2],
    'V8': [-0.9, 0.4, -1.1],
    'V9': [0.2, -1.3, 0.6],
    'V10': [-0.3, 1.5, -0.7],
    'V11': [-0.8, 0.9, -1.2],
    'V12': [1.0, -1.1, 0.8],
    'V13': [0.5, -0.4, 1.3],
    'V14': [-1.2, 0.7, -0.9],
    'V15': [0.6, -1.0, 1.1],
    'V16': [-1.0, 0.8, -0.6],
    'V17': [0.9, -0.5, 1.4],
    'V18': [-0.7, 1.2, -0.3],
    'V19': [1.1, -1.4, 0.4],
    'V20': [-0.4, 0.3, -0.5],
    'V21': [0.3, -0.6, 0.7],
    'V22': [-0.2, 0.1, -0.2],
    'V23': [0.8, -1.1, 0.9],
    'V24': [-0.9, 0.2, -0.8],
    'V25': [0.4, -0.3, 0.5],
    'V26': [-0.6, 0.7, -0.4],
    'V27': [0.1, -0.2, 0.3],
    'V28': [0.5, -0.4, 0.6],
    'Amount': [10.0, 20.0, 5.0],
}

new_df = pd.DataFrame(new_data)

In [26]:
new_X = scaler.transform(new_df)

In [27]:
new_predictions = model.predict(new_X)

In [28]:
print("Prediction on new Data: ", new_predictions)

Prediction on new Data:  [1 0 0]
