In [4]:
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [5]:
models = {
    "Logistic Regression": LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    "Random Forest": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

In [11]:
def fahrenheit_to_celsius(fahrenheit):
    return round(((fahrenheit - 32) * 5 / 9),2)

In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("Maternal Health Risk Data Set.csv")
df['BodyTemp'] = df['BodyTemp'].apply(fahrenheit_to_celsius)
df.to_csv('updated_file.csv', index=False)
le_RiskLevel = LabelEncoder()

In [13]:
X = df[["Age","SystolicBP","DiastolicBP","BS","BodyTemp","HeartRate"]]
y = df["RiskLevel"]
df

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,25,130,80,15.0,36.67,86
1,35,140,90,13.0,36.67,70
2,29,90,70,8.0,37.78,80
3,30,140,85,7.0,36.67,70
4,35,120,60,6.1,36.67,76
...,...,...,...,...,...,...
1009,22,120,60,15.0,36.67,80
1010,55,120,90,18.0,36.67,60
1011,35,85,60,19.0,36.67,86
1012,43,120,90,18.0,36.67,70


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
Best_Model = None
best_accuracy = 0
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass
    print(f"{name} - Accuracy: {(accuracy * 100):.2f}% | Precision: {(precision * 100):.2f}%")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        Best_Model = model
print(f"\nBest Model: {Best_Model.__class__.__name__} with Accuracy: {best_accuracy:.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Accuracy: 61.58% | Precision: 65.57%
Random Forest - Accuracy: 81.28% | Precision: 81.85%
GradientBoostingClassifier - Accuracy: 73.89% | Precision: 73.82%
Support Vector Classifier - Accuracy: 58.62% | Precision: 65.07%
K-Nearest Neighbors - Accuracy: 68.47% | Precision: 68.75%
Naive Bayes - Accuracy: 57.64% | Precision: 59.79%

Best Model: RandomForestClassifier with Accuracy: 0.8128


In [16]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for the best-performing model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize GridSearchCV with the best model and the parameter grid
grid_search = GridSearchCV(estimator=Best_Model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model using grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best accuracy from grid search
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print(f"Best Parameters from Grid Search: {best_params}")
print(f"Best Cross-Validated Accuracy: {best_accuracy:.4f}")

# Use the best estimator (model with the best parameters) to make predictions
best_model_grid_search = grid_search.best_estimator_
y_pred = best_model_grid_search.predict(X_test)

# Evaluate the accuracy and precision of the best model after Grid Search
final_accuracy = accuracy_score(y_test, y_pred)
final_precision = precision_score(y_test, y_pred, average='weighted')

print(f"\nFinal Accuracy after Grid Search: {final_accuracy:.4f}")
print(f"Final Precision after Grid Search: {final_precision:.4f}")


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters from Grid Search: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validated Accuracy: 0.8336

Final Accuracy after Grid Search: 0.8177
Final Precision after Grid Search: 0.8213


In [108]:
import pickle
data = {"model":best_model_grid_search}
with open("Grid_Model.pkl","wb") as file:
    pickle.dump(data,file) 

In [109]:
with open("Classification.pkl","rb") as file:
    data = pickle.load(file)
regressor_loaded = data["model"]

In [110]:
X = np.array([[12,95,60,6.1,102,60]])
y_pred = regressor_loaded.predict(X)
y_pred , best_model_grid_search.predict(X)



(array(['high risk'], dtype=object), array(['low risk'], dtype=object))

In [17]:
import joblib
# Save the model and label encoder
with open('model.pkl', 'wb') as file:
    joblib.dump({
        'model': best_model_grid_search,
        'le_RiskLevel': le_RiskLevel
    }, file)

In [18]:
with open('1.pkl', 'rb') as file:
    data = joblib.load(file)
best_model_grid_search = data['model']
le_RiskLevel = data['le_RiskLevel']
X = np.array([[12,95,60,6.1,102,60]])
best_model_grid_search.predict(X)



array(['low risk'], dtype=object)