# Modeling

In [1]:
# importing relevant libraries
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [35]:
# importing machine learning modules
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

Loading the data


In [36]:
# importing the data(train and test)
training_data = pd.read_csv('training_data.csv')

test_data = pd.read_csv('test_data.csv')

In [37]:
training_data = training_data.drop('Unnamed: 0', axis=1)

test_data = test_data.drop('Unnamed: 0', axis=1)

In [38]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59257 entries, 0 to 59256
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   amount_tsh           59257 non-null  float64
 1   days_since_recorded  59257 non-null  int64  
 2   funder               59257 non-null  object 
 3   gps_height           59257 non-null  int64  
 4   installer            59257 non-null  object 
 5   basin                59257 non-null  object 
 6   subvillage           59257 non-null  object 
 7   population           59257 non-null  int64  
 8   public_meeting       59257 non-null  bool   
 9   scheme_management    59257 non-null  object 
 10  permit               59257 non-null  bool   
 11  construction_year    59257 non-null  int64  
 12  extraction_type      59257 non-null  object 
 13  management_group     59257 non-null  object 
 14  payment_type         59257 non-null  object 
 15  water_quality        59257 non-null 

In [39]:
# checking the first five 
training_data.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,...,construction_year,extraction_type,management_group,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,status_group
0,6000.0,995,Roman,1390,Roman,Lake Nyasa,Mnyusi B,109,True,VWC,...,1999,gravity,user-group,annually,soft,enough,spring,groundwater,communal standpipe,functional
1,0.0,272,Grumeti,1399,GRUMETI,Lake Victoria,Nyamara,280,True,Other,...,2010,gravity,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,functional
2,25.0,281,Lottery Club,686,World vision,Pangani,Majengo,250,True,VWC,...,2009,gravity,user-group,per bucket,soft,enough,dam,surface,communal standpipe multiple,functional
3,0.0,309,Unicef,263,UNICEF,Ruvuma / Southern Coast,Mahakamani,58,True,VWC,...,1986,submersible,user-group,never pay,soft,dry,borehole,groundwater,communal standpipe multiple,non functional
4,0.0,874,Action In A,0,Artisan,Lake Victoria,Kyanyamisa,0,True,VWC,...,0,gravity,other,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe,functional


In [40]:
test_data.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,...,construction_year,extraction_type,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group,month
0,0.0,302,Dmdd,1996,DMDD,Internal,Magoma,321,True,Parastatal,...,2012,other,never pay,soft,seasonal,rainwater harvesting,surface,other,other,2
1,0.0,302,Government Of Tanzania,1569,DWE,Pangani,Kimnyak,300,True,VWC,...,2000,gravity,never pay,soft,insufficient,spring,groundwater,communal standpipe,communal standpipe,2
2,0.0,305,Government Of Tanzania,1567,DWE,Internal,Msatu,500,True,VWC,...,2010,other,never pay,soft,insufficient,rainwater harvesting,surface,other,other,2
3,0.0,315,Finn Water,267,FINN WATER,Ruvuma / Southern Coast,Kipindimbi,250,True,VWC,...,1987,other,unknown,soft,dry,shallow well,groundwater,other,other,1
4,500.0,251,Bruder,1260,BRUDER,Ruvuma / Southern Coast,Losonga,60,True,Water Board,...,2000,gravity,monthly,soft,enough,spring,groundwater,communal standpipe,communal standpipe,3


### I'll be following this metrc for modeling



1. Pre-processing:
* Transform the categorical data 
* One-hot encode the categorical variables, such as funder, installer, basin, etc. to handle non-numeric data.
* Split the data into training and testing sets.

In [9]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59257 entries, 0 to 59256
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   amount_tsh           59257 non-null  float64
 1   days_since_recorded  59257 non-null  int64  
 2   funder               59257 non-null  object 
 3   gps_height           59257 non-null  int64  
 4   installer            59257 non-null  object 
 5   basin                59257 non-null  object 
 6   subvillage           59257 non-null  object 
 7   population           59257 non-null  int64  
 8   public_meeting       59257 non-null  bool   
 9   scheme_management    59257 non-null  object 
 10  permit               59257 non-null  bool   
 11  construction_year    59257 non-null  int64  
 12  extraction_type      59257 non-null  object 
 13  management_group     59257 non-null  object 
 14  payment_type         59257 non-null  object 
 15  water_quality        59257 non-null 

In [10]:
# Transform the categorical variables into numerical values using one-hot encoding
# encoder = OneHotEncoder()
# encoded_categories = encoder.fit_transform(training_data[['funder', 'installer', 'basin', 'subvillage',,'scheme_management''extraction_type', 'management_group ','payment_type', 'water_quality', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type']])
# encoded_df = pd.DataFrame(encoded_categories.toarray(), columns=encoder.get_feature_names())
# training_data = pd.concat([training_data.drop(['funder', 'installer', 'basin', 'subvillage','scheme_management','extraction_type','management_group ', 'payment_type', 'water_quality', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type'], axis=1), encoded_df], axis=1)


In [11]:
# # Convert the True/False values to binary (1/0)
# training_data['public_meeting'] = training_data['public_meeting'].astype(int)
# training_data['permit'] = training_data['permit'].astype(int)

# # Normalize the numerical features using StandardScaler
# scaler = StandardScaler()
# numerical_features = ['amount_tsh', 'days_since_recorded', 'gps_height', 'population']
# training_data[numerical_features] = scaler.fit_transform(training_data[numerical_features])

In [12]:
# training_data['amount_tsh'] = training_data['amount_tsh'].astype(int)
# # one-hot encoding

# training_data = pd.get_dummies(training_data, columns=['funder', 'installer', 'basin', 'subvillage', 'public_meeting',
#                                      'scheme_management', 'permit', 'extraction_type', 'payment_type',
#                                      'water_quality','management_group', 'quantity_group', 'source_type', 'source_class',
#                                      'waterpoint_type', ])

# # normalize numerical columns
# scaler = StandardScaler()
# training_data[['amount_tsh', 'days_since_recorded', 'gps_height', 'population', 'construction_year']] = scaler.fit_transform(training_data[['amount_tsh', 'days_since_recorded', 'gps_height', 'population', 'construction_year']])


In [68]:
#encode labels
# changing our data from funtional ....
le=LabelEncoder()
training_data.status_group = le.fit_transform(training_data.status_group)
training_data

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,...,construction_year,extraction_type,management_group,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,status_group
0,6000.0,995,Roman,1390,Roman,Lake Nyasa,Mnyusi B,109,True,VWC,...,1999,gravity,user-group,annually,soft,enough,spring,groundwater,communal standpipe,0
1,0.0,272,Grumeti,1399,GRUMETI,Lake Victoria,Nyamara,280,True,Other,...,2010,gravity,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,0
2,25.0,281,Lottery Club,686,World vision,Pangani,Majengo,250,True,VWC,...,2009,gravity,user-group,per bucket,soft,enough,dam,surface,communal standpipe multiple,0
3,0.0,309,Unicef,263,UNICEF,Ruvuma / Southern Coast,Mahakamani,58,True,VWC,...,1986,submersible,user-group,never pay,soft,dry,borehole,groundwater,communal standpipe multiple,2
4,0.0,874,Action In A,0,Artisan,Lake Victoria,Kyanyamisa,0,True,VWC,...,0,gravity,other,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59252,10.0,214,Germany Republi,1210,CES,Pangani,Kiduruni,125,True,Water Board,...,1999,gravity,user-group,per bucket,soft,enough,spring,groundwater,communal standpipe,0
59253,4700.0,941,Cefa-njombe,1212,Cefa,Rufiji,Igumbilo,56,True,VWC,...,1996,gravity,user-group,annually,soft,enough,river/lake,surface,communal standpipe,0
59254,0.0,967,Government Of Tanzania,0,DWE,Rufiji,Madungulu,0,True,VWC,...,0,swn 80,user-group,monthly,fluoride,enough,borehole,groundwater,hand pump,0
59255,0.0,1001,Malec,0,Musa,Rufiji,Mwinyi,0,True,VWC,...,0,nira/tanira,user-group,never pay,soft,insufficient,shallow well,groundwater,hand pump,0


In [46]:
# Split data into features and target
X = training_data.drop(["status_group"], axis=1)
y = training_data["status_group"]

X_train, X_test, y_train, y_test = train_test_split(X, y ,random_state=42)


In [47]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((44442, 20), (14815, 20), (44442,), (14815,))

### categorical data

In [48]:
X_train_categorical = X_train.select_dtypes(exclude=["int64", "float64"]).copy()
X_train_categorical.head()

Unnamed: 0,funder,installer,basin,subvillage,public_meeting,scheme_management,permit,extraction_type,management_group,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type
39364,Private Individual,Da,Wami / Ruvu,Shuleni,True,Company,True,mono,commercial,never pay,soft,enough,river/lake,surface,communal standpipe
23878,Wua,WU,Wami / Ruvu,Mkwajuni,True,WUA,True,ksb,user-group,per bucket,soft,enough,river/lake,surface,communal standpipe
20488,Hesawa,DWE,Lake Victoria,Kamagambo,True,VWC,True,gravity,other,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe
15598,He,HE,Lake Victoria,Kati,True,VWC,True,nira/tanira,user-group,never pay,soft,enough,shallow well,groundwater,hand pump
36335,Government Of Tanzania,DWE,Lake Nyasa,Matika,True,VWC,False,gravity,user-group,never pay,soft,enough,river/lake,surface,communal standpipe


In [49]:
#categotical data that I think might infulence the data
categorical_features = ["funder", "installer", "scheme_management",'management_group','water_quality','permit','public_meeting']
X_train_categorical = X_train[categorical_features].copy()
X_train_categorical

Unnamed: 0,funder,installer,scheme_management,management_group,water_quality,permit,public_meeting
39364,Private Individual,Da,Company,commercial,soft,True,True
23878,Wua,WU,WUA,user-group,soft,True,True
20488,Hesawa,DWE,VWC,other,soft,True,True
15598,He,HE,VWC,user-group,soft,True,True
36335,Government Of Tanzania,DWE,VWC,user-group,soft,False,True
...,...,...,...,...,...,...,...
54343,Oikos E.Afrika,Community,VWC,user-group,fluoride,True,True
38158,Ndrdp,DED,VWC,user-group,soft,False,True
860,Md,DW,VWC,user-group,soft,True,True
15795,Wfp/tnt/usaid,HAPA SINGIDA,Parastatal,parastatal,soft,True,True


In [50]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

ohe.fit(X_train_categorical)

X_train_ohe = pd.DataFrame(
    ohe.transform(X_train_categorical),
    index=X_train_categorical.index,
    columns=np.hstack(ohe.categories_)
)
X_train_ohe.head()

Unnamed: 0,A/co Germany,Aar,Abas Ka,Abasia,Abc-ihushi Development Cent,Abd,Abdala,Abddwe,Abood,Abs,...,fluoride abandoned,milky,salty,salty abandoned,soft,unknown,False,True,False.1,True.1
39364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
23878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
20488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
15598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
36335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


### numerical data

In [51]:
X_train_numerical = X_train.select_dtypes(exclude=["object",'bool']).copy()
X_train_numerical.head()

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,construction_year
39364,0.0,1012,-3,30,2001
23878,30.0,996,182,200,2004
20488,0.0,873,0,0,0
15598,0.0,876,0,0,0
36335,0.0,995,1389,800,2000


In [52]:
X_test_numerical = X_test.select_dtypes(exclude=["object",'bool']).copy()
X_test_numerical.head()

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,construction_year
8608,20.0,1009,248,40,2003
2349,1000.0,1005,2151,125,2000
43641,5000.0,270,902,76,2013
54981,8000.0,283,1468,0,1997
273,0.0,416,0,0,0


In [53]:
X_train_numerical.shape

(44442, 5)

In [54]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit(X_train_numerical)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_numerical),
    # index is important to ensure we can concatenate with other columns
    index=X_train_numerical.index,
    columns=X_train_numerical.columns
)
X_train_scaled

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,construction_year
39364,0.000000,0.248771,0.021179,0.000984,0.994039
23878,0.000086,0.244838,0.086481,0.006557,0.995529
20488,0.000000,0.214602,0.022238,0.000000,0.000000
15598,0.000000,0.215339,0.022238,0.000000,0.000000
36335,0.000000,0.244592,0.512531,0.026230,0.993542
...,...,...,...,...,...
54343,0.017143,0.007375,0.542887,0.003148,0.997019
38158,0.000000,0.214356,0.022238,0.000000,0.000000
860,0.000000,0.211652,0.022238,0.000000,0.000000
15795,0.000000,0.095870,0.022238,0.000000,0.000000


In [55]:
#let's see how our numerical model does here

model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_train_numerical)
accuracy_score(y_train, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5423023266279645

In [66]:
X_train_full = pd.concat([X_train_scaled, X_train_ohe], axis=1)
X_train_full

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,construction_year,A/co Germany,Aar,Abas Ka,Abasia,Abc-ihushi Development Cent,...,fluoride abandoned,milky,salty,salty abandoned,soft,unknown,False,True,False.1,True.1
39364,0.000000,0.248771,0.021179,0.000984,0.994039,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
23878,0.000086,0.244838,0.086481,0.006557,0.995529,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
20488,0.000000,0.214602,0.022238,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
15598,0.000000,0.215339,0.022238,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
36335,0.000000,0.244592,0.512531,0.026230,0.993542,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,0.017143,0.007375,0.542887,0.003148,0.997019,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
38158,0.000000,0.214356,0.022238,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
860,0.000000,0.211652,0.022238,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
15795,0.000000,0.095870,0.022238,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [None]:
y = X_train.select_dtypes(exclude=["int64", "float64"]).copy()


2. Logistic Regression:
* Choose logistic regression as the baseline model, since it's a simple and interpretable model.
* Fit the model on the training data and evaluate the performance on the testing data.
* Hyperparameters for logistic regression include the regularization term (e.g. L1 or L2) and the regularization strength (e.g. C).
* Grid search or random search can be used to find the best hyperparameters that result in the best performance on the testing data.


In [61]:
logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')
model_log = logreg.fit(X_train_full, y_train)
y_pred = model_log.predict(X_train_full)
accuracy_score(y_train, y_pred)

# Calculate the accuracy score
logreg_score = accuracy_score(y_train, y_pred)
print('Logistic Regression Accuracy Score:', logreg_score)



Logistic Regression Accuracy Score: 0.6861977408757481


In [70]:
y.shape

(59257,)

In [74]:
# Split data into features and target
X = training_data.drop(["status_group"], axis=1)
y = training_data["status_group"]

X_train, X_test, y_train, y_test = train_test_split(X, y ,random_state=42)


In [76]:
# Create a pipeline for preprocessing and modeling
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression())
])

# Define the parameters to search
param_grid = {
    "model__C": [0.1, 1, 10],
    "model__penalty": ["l1", "l2"]
}

# Create the Grid Search object
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fit the Grid Search to the data
grid_search.fit(X, y)

# Get the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

30 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ADMIN\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ADMIN\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\ADMIN\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\ADMIN\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File 

ValueError: could not convert string to float: 'Roman'

In [75]:
# Hyperparameter tuning
logistic_regression = LogisticRegression()
param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
}
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5)
grid_search.fit(X_train_full, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



KeyboardInterrupt: 

In [None]:
X_train.shape

In [None]:
# # Cross-validation
scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)


In [None]:
# Feature selection
rfe = RFE(grid_search.best_estimator_, n_features_to_select=10)
rfe.fit(X, y)
selected_features = X.columns[rfe.get_support()]


In [None]:
# Use the selected features to build the final model
final_model = LogisticRegression(**grid_search.best_params_)
final_model.fit(X[selected_features], y)

3. Decision Trees:
* Since logistic regression may not capture complex relationships between features and the target variable, try a decision tree model.
* Choose the optimal hyperparameters using grid search or random search.
* Evaluate the performance on the testing data.
* Hyperparameters for decision trees include the maximum depth of the tree and the minimum number of samples required to split a node.



In [None]:
# train a decision tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# evaluate the model on the testing data
y_pred = dt.predict(X_test)


4. Random Forest:
* Random Forest is an extension of decision trees, where multiple trees are built and combined to make a prediction.
* Choose the optimal hyperparameters using grid search or random search.
* Evaluate the performance on the testing data.
* Hyperparameters for random forests include the number of trees in the forest, the maximum depth of each tree, and the minimum number of samples required to split a node.


5. Model Comparison:
* Compare the performance of the logistic regression, decision tree, and random forest models to choose the best one.
* Evaluate the performance using metrics such as accuracy, precision, recall, F1-score, AUC-ROC, etc.
