In [20]:
# model training

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('dataset/pd.csv')

# Split the data into features and target
X = data.drop('Result', axis=1)
y = data['Result']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict the test set results
y_pred = rf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.9990922294843864


In [2]:
# evaluate trained RandomForestClassifier model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the performance of the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[2785    3]
 [   2 2718]]

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2788
         1.0       1.00      1.00      1.00      2720

    accuracy                           1.00      5508
   macro avg       1.00      1.00      1.00      5508
weighted avg       1.00      1.00      1.00      5508


Accuracy: 0.9990922294843864


In [3]:
# Hyperparameter Tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd


# Split the data into input features and target variable
X = data.drop('Result', axis=1)
y = data['Result']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a random forest classifier object
rf = RandomForestClassifier()

# Create a grid search object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.9975944369442376


In [4]:
# save trained model
from sklearn.ensemble import RandomForestClassifier
import joblib

# Create and train a RandomForestClassifier model
model = RandomForestClassifier()
X_train = data[['Age of the patient', 'Gender of the patient', 'Total Bilirubin',
              'Direct Bilirubin', 'Alkphos Alkaline Phosphotase', 
              'Sgpt Alamine Aminotransferase', 'Sgot Aspartate Aminotransferase',
              'Total Protiens', 'ALB Albumin', 'A/G Ratio Albumin and Globulin Ratio']].values  # Training data
y_train = data['Result'].values  # Target labels
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
filename = 'your_model.pkl'
joblib.dump(model, filename)


['your_model.pkl']

In [5]:
# serialized version of a Scikit-Learn scaler object that you have trained on your data.
from sklearn.preprocessing import StandardScaler
import joblib

# Load the data into a Pandas DataFrame
df = pd.read_csv('dataset/pd.csv')

# Extract the feature columns
X_train = df[['Age of the patient', 'Gender of the patient', 'Total Bilirubin',
              'Direct Bilirubin', 'Alkphos Alkaline Phosphotase', 
              'Sgpt Alamine Aminotransferase', 'Sgot Aspartate Aminotransferase',
              'Total Protiens', 'ALB Albumin', 'A/G Ratio Albumin and Globulin Ratio']].values

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
scaler.fit(X_train)

# Save the scaler object to a file
joblib.dump(scaler, 'your_scaler.pkl')


['your_scaler.pkl']

In [21]:
import joblib
import pandas as pd

def predict(data_df):
    # Load the trained model and scaler
    model = joblib.load('your_model.pkl')
    scaler = joblib.load('your_scaler.pkl')
    # Scale the input data using the scaler
    scaled_data = scaler.transform(data_df)

    # Make predictions using the trained model
    predictions = model.predict(scaled_data)
    return predictions

In [22]:
print(X_test.columns.to_list)

<bound method IndexOpsMixin.tolist of Index(['Age of the patient', 'Gender of the patient', 'Total Bilirubin',
       'Direct Bilirubin', 'Alkphos Alkaline Phosphotase',
       'Sgpt Alamine Aminotransferase', 'Sgot Aspartate Aminotransferase',
       'Total Protiens', 'ALB Albumin',
       'A/G Ratio Albumin and Globulin Ratio'],
      dtype='object')>


In [23]:
def create_df(age, gender, tb, db, aap, sgpt, sgot, tp, alb, ratio):
    df = pd.DataFrame({
        'Age of the patient': [age],
        'Gender of the patient': [gender],
        'Total Bilirubin': [tb],
        'Direct Bilirubin': [db],
        'Alkphos Alkaline Phosphotase': [aap],
        'Sgpt Alamine Aminotransferase': [sgpt],
        'Sgot Aspartate Aminotransferase': [sgot],  
        'Total Protiens': [tp],
        'ALB Albumin': [alb],
        'A/G Ratio Albumin and Globulin Ratio': [ratio]
    })
    return df



In [28]:
y


0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
27535    1.0
27536    1.0
27537    1.0
27538    1.0
27539    1.0
Name: Result, Length: 27540, dtype: float64

In [40]:
print(model.predict(X.loc[27537].values.reshape(1, -1)))


[1.]


In [38]:
X.loc[0]

Age of the patient                      0.709302
Gender of the patient                   0.000000
Total Bilirubin                         0.004021
Direct Bilirubin                        0.000000
Alkphos Alkaline Phosphotase            0.060576
Sgpt Alamine Aminotransferase           0.003015
Sgot Aspartate Aminotransferase         0.001626
Total Protiens                          0.594203
ALB Albumin                             0.521739
A/G Ratio Albumin and Globulin Ratio    0.240000
Name: 0, dtype: float64

In [39]:
X.loc[27537]

Age of the patient                      0.488372
Gender of the patient                   0.500000
Total Bilirubin                         0.016086
Direct Bilirubin                        0.035714
Alkphos Alkaline Phosphotase            0.081583
Sgpt Alamine Aminotransferase           0.039196
Sgot Aspartate Aminotransferase         0.013011
Total Protiens                          0.768116
ALB Albumin                             0.673913
A/G Ratio Albumin and Globulin Ratio    0.280000
Name: 27537, dtype: float64

In [13]:
X.head()

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio
0,0.709302,0.0,0.004021,0.0,0.060576,0.003015,0.001626,0.594203,0.521739,0.24
1,0.674419,0.5,0.140751,0.27551,0.310699,0.027136,0.018296,0.695652,0.5,0.176
2,0.674419,0.5,0.092493,0.204082,0.208598,0.025126,0.011791,0.623188,0.521739,0.236
3,0.627907,0.5,0.008043,0.015306,0.058134,0.00201,0.002033,0.594203,0.543478,0.28
4,0.790698,0.5,0.046917,0.096939,0.064485,0.008543,0.009961,0.666667,0.326087,0.04


In [14]:
X

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio
0,0.709302,0.0,0.004021,0.000000,0.060576,0.003015,0.001626,0.594203,0.521739,0.240
1,0.674419,0.5,0.140751,0.275510,0.310699,0.027136,0.018296,0.695652,0.500000,0.176
2,0.674419,0.5,0.092493,0.204082,0.208598,0.025126,0.011791,0.623188,0.521739,0.236
3,0.627907,0.5,0.008043,0.015306,0.058134,0.002010,0.002033,0.594203,0.543478,0.280
4,0.790698,0.5,0.046917,0.096939,0.064485,0.008543,0.009961,0.666667,0.326087,0.040
...,...,...,...,...,...,...,...,...,...,...
27535,0.418605,0.5,0.006702,0.005102,0.025892,0.013065,0.001220,0.507246,0.500000,0.280
27536,0.476744,0.5,0.012064,0.015306,0.069858,0.010050,0.003049,0.478261,0.478261,0.304
27537,0.488372,0.5,0.016086,0.035714,0.081583,0.039196,0.013011,0.768116,0.673913,0.280
27538,0.313953,0.0,0.021448,0.051020,0.079629,0.011558,0.025412,0.478261,0.391304,0.200


In [16]:
y.unique()

array([0., 1.])

In [17]:
y.value_counts()

0.0    13770
1.0    13770
Name: Result, dtype: int64

In [18]:
X

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio
0,0.709302,0.0,0.004021,0.000000,0.060576,0.003015,0.001626,0.594203,0.521739,0.240
1,0.674419,0.5,0.140751,0.275510,0.310699,0.027136,0.018296,0.695652,0.500000,0.176
2,0.674419,0.5,0.092493,0.204082,0.208598,0.025126,0.011791,0.623188,0.521739,0.236
3,0.627907,0.5,0.008043,0.015306,0.058134,0.002010,0.002033,0.594203,0.543478,0.280
4,0.790698,0.5,0.046917,0.096939,0.064485,0.008543,0.009961,0.666667,0.326087,0.040
...,...,...,...,...,...,...,...,...,...,...
27535,0.418605,0.5,0.006702,0.005102,0.025892,0.013065,0.001220,0.507246,0.500000,0.280
27536,0.476744,0.5,0.012064,0.015306,0.069858,0.010050,0.003049,0.478261,0.478261,0.304
27537,0.488372,0.5,0.016086,0.035714,0.081583,0.039196,0.013011,0.768116,0.673913,0.280
27538,0.313953,0.0,0.021448,0.051020,0.079629,0.011558,0.025412,0.478261,0.391304,0.200
