In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score


In [4]:
# Load the dataset into a DataFrame
data = pd.read_csv("cleaned_dataset 14k.csv")

# Drop rows with missing values
data = data.dropna()
data['Gender of the patient'] = np.where(data['Gender of the patient'] == 'Male', '1','0')

# Separate the features (X) and target variable (y)
X = data.drop("Result", axis=1)  # Adjust the column name if necessary
y = data["Result"]  # Adjust the column name if necessary

In [5]:
X.head(2)

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio
0,65.0,0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9
1,62.0,1,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74


In [43]:
data['A/G Ratio Albumin and Globulin Ratio'].max()

2.8

In [6]:
# Identify categorical and numerical columns
categorical_cols = ['Gender of the patient']  # Add other categorical columns if any
numerical_cols = ['Age of the patient', 'Total Bilirubin', 'Direct Bilirubin', 'Alkphos Alkaline Phosphotase', 'Sgpt Alamine Aminotransferase', 'Sgot Aspartate Aminotransferase', 'Total Protiens', 'ALB Albumin', 'A/G Ratio Albumin and Globulin Ratio']  # Add other numerical columns if any

# Create the column transformer for encoding and scaling
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(), categorical_cols),
])



In [7]:
# Replace values in a specific column using np.where()
data['Gender of the patient'] = np.where(data['Gender of the patient'] == 'Male', '1','0')

In [13]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the X dataset
X_transformed = scaler.fit_transform(X)

In [14]:
# Apply preprocessing on the feature columns

# Split the preprocessed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)


In [15]:
X.head(2)

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio
0,65.0,0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9
1,62.0,1,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74


In [16]:
X_test

array([[ 1.34478243,  0.6602966 ,  1.84249448, ..., -1.08821365,
        -1.17991315, -0.7631838 ],
       [ 0.98171472,  0.6602966 ,  0.52156113, ..., -2.27989552,
        -1.80971253, -0.7631838 ],
       [-1.3782254 , -1.51447091, -0.39664864, ...,  0.19513605,
        -0.17223414, -0.45374295],
       ...,
       [-0.10748841,  0.6602966 ,  0.1027286 , ...,  0.56180739,
         0.20564549, -0.45374295],
       [-0.83362383, -1.51447091, -0.41275758, ..., -0.17153529,
        -0.80203352, -1.07262466],
       [-1.19669154, -1.51447091,  0.55377901, ..., -0.07986746,
        -0.04627426, -0.14430209]])

In [20]:
# Initialize the Random Forest classifier
random_forest = RandomForestClassifier()

# Fit the classifier to the training data
random_forest.fit(X_train, y_train)

# Make predictions on the testing data
#predictions = random_forest.predict([[62.0,	1,	10.9,	5.5,	699.0,	64.0,	100.0	,7.5	,3.2	,0.74]])
predictions = random_forest.predict(X_test)
predictions[0]

1

In [21]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Accuracy: 0.9993911719939117
Precision: 0.9991445680068435
Recall: 1.0
F1-score: 0.9995721009841677


In [26]:
import joblib
# Save the model to a file
joblib_file_path = "render.joblib"
joblib.dump(random_forest, joblib_file_path)

['render.joblib']

In [27]:
joblib_file_path = "render.joblib"

# Load the model from the joblib file
loaded_model = joblib.load(joblib_file_path)

# Use the loaded model for predictions or further processing
# For example:
predictions = loaded_model.predict(X_test)

In [28]:
predictions[0]

1