In [1]:
# Report the version of the interpreter that is actually running this kernel.
# (`!python --version` asks whichever `python` is first on the shell PATH,
# which is not necessarily the kernel's interpreter.)
import platform
print("Python", platform.python_version())

Python 3.11.5


# Overview

The purpose of this notebook is to build a card transaction fraud classification model.

## Importing the required libraries

In [2]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import pickle

## Loading the data

In [3]:
# Read the transactions CSV (path is relative to the notebook's working directory).
data_path = 'card_transdata.csv'
df = pd.read_csv(data_path)

## Data Preparation

In [4]:
# Keep only the first 1000 rows so the rest of the notebook runs quickly.
df = df.iloc[:1000, :]

# Features are every column except the target; 'fraud' is the label.
X = df.drop(columns=['fraud'])
y = df['fraud']

# Split BEFORE scaling so that test/validation rows never influence the
# scaler's mean/std (fitting the scaler on all rows leaks information).
# 70% train, then the remaining 30% is halved into test and validation.
# Both splits are stratified on the label and seeded for reproducibility.
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=SPLIT_SEED
)

# Fit the scaler on the training split only, then apply the same
# transformation to every split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Keep `X` as the scaled full feature matrix, as before, for any later use.
X = scaler.transform(X)

X_train.shape, X_test.shape, X_val.shape, y_test.shape, y_val.shape

((700, 7), (150, 7), (150, 7), (150,), (150,))

In [5]:
# Train a random forest with default hyperparameters. The seed is fixed so
# the fitted model (and the metrics computed from it) are the same on every
# Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred = rf.predict(X_test)

In [6]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.98

In [7]:
# Confusion matrix on the test split (scikit-learn convention: rows are true
# labels, columns are predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[137,   1],
       [  2,  10]], dtype=int64)

## Save and load the scaler and model

In [8]:
# Save Scaler
# `with` guarantees the file is flushed and closed even if pickling fails.
scaler_name = 'scaler.sav'
with open(scaler_name, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Save model
filename = 'prediction_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [9]:
# NOTE: unpickling can execute arbitrary code. Only load files that this
# notebook (or another trusted source) produced.

# Load Scaler
with open(scaler_name, 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

# Load the model from disk
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [10]:
# Smoke-test the reloaded scaler and model on a single row (row 13).
# The features are selected by dropping the 'fraud' label by name, matching
# how the training features were built, rather than assuming it is the last
# column. This row comes from the same 1000 rows that were split for
# training, so this checks the save/load round trip, not model quality.
sample = df.drop(columns=['fraud']).iloc[13:14]

scale_sample = loaded_scaler.transform(sample)
loaded_model.predict(scale_sample)

array([1.])

In [13]:
# Show the first row to see the column layout.
df.head(1)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0


In [14]:
# Display the pickle format version string exposed by this interpreter's
# pickle module (the recorded output below is '4.0').
# NOTE(review): `format_version` does not appear in the documented pickle API;
# `pickle.DEFAULT_PROTOCOL` / `pickle.HIGHEST_PROTOCOL` are the documented way
# to inspect protocol versions — confirm which one is wanted here.
pickle.format_version

'4.0'