In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
data = pd.read_csv('diabetes.csv')
data = data[(data["SkinThickness"] != 0) & (data["Insulin"] != 0) & (data["Glucose"] != 0)]
data['risk_function'] = data['Glucose'] * data['Glucose'] * data['BMI']

In [None]:
data.head(40)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,risk_function
3,1,89,66,23,94,28.1,0.167,21,0,222580.1
4,0,137,40,35,168,43.1,2.288,33,1,808943.9
6,3,78,50,32,88,31.0,0.248,26,1,188604.0
8,2,197,70,45,543,30.5,0.158,53,1,1183674.5
13,1,189,60,23,846,30.1,0.398,59,1,1075202.1
14,5,166,72,19,175,25.8,0.587,51,1,710944.8
16,0,118,84,47,230,45.8,0.551,31,1,637719.2
18,1,103,30,38,83,43.3,0.183,33,0,459369.7
19,1,115,70,30,96,34.6,0.529,32,1,457585.0
20,3,126,88,41,235,39.3,0.704,27,0,623926.8


In [None]:
X = data.drop(columns=['Outcome'])
y = data['Outcome']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
regressor = DecisionTreeRegressor(random_state = 42)

In [None]:
regressor.fit(X_train, y_train)

In [None]:
y_pred = regressor.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(mean_absolute_error(y_test,y_pred))

Mean Squared Error: 0.26582278481012656
0.26582278481012656


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 393 entries, 3 to 765
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               393 non-null    int64  
 1   Glucose                   393 non-null    int64  
 2   BloodPressure             393 non-null    int64  
 3   SkinThickness             393 non-null    int64  
 4   Insulin                   393 non-null    int64  
 5   BMI                       393 non-null    float64
 6   DiabetesPedigreeFunction  393 non-null    float64
 7   Age                       393 non-null    int64  
 8   Outcome                   393 non-null    int64  
 9   risk_function             393 non-null    float64
dtypes: float64(3), int64(7)
memory usage: 33.8 KB


In [None]:
xgb_model = XGBRegressor(n_estimators = 100, random_state= 42, learning_rate = 0.04)
xgb_model.fit(X_train,y_train)
xgb_preds= xgb_model.predict(X_test)
print(mean_squared_error(y_test, xgb_preds))

0.1703360378742218


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
rf_preds = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_preds)

print(f"Random Forest Regressor - Mean Squared Error: {rf_mse}")

Random Forest Regressor - Mean Squared Error: 0.15412967651195503


In [None]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,risk_function
count,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0
mean,3.292621,122.615776,70.64631,29.129771,155.885496,33.002036,0.52612,30.839695,0.330789,539314.6
std,3.211645,30.822276,12.484668,10.507575,118.738199,7.214395,0.350386,10.199903,0.471097,324073.7
min,0.0,56.0,24.0,7.0,14.0,0.0,0.085,21.0,0.0,0.0
25%,1.0,99.0,62.0,21.0,77.0,28.4,0.27,23.0,0.0,299520.0
50%,2.0,119.0,70.0,29.0,125.0,33.2,0.452,27.0,0.0,457537.5
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0,686201.6
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0,1924560.0


In [None]:
rf_classifier = RandomForestClassifier(n_estimators=300, random_state=42)
rf_classifier.fit(X_train, y_train)

rf_preds = rf_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_preds)

In [None]:
print(f"Random Forest Classifier - Accuracy: {rf_accuracy * 100:.2f}%")
print(classification_report(y_test, rf_preds))

Random Forest Classifier - Accuracy: 78.48%
              precision    recall  f1-score   support

           0       0.79      0.92      0.85        53
           1       0.76      0.50      0.60        26

    accuracy                           0.78        79
   macro avg       0.78      0.71      0.73        79
weighted avg       0.78      0.78      0.77        79



In [None]:
xgb_classifier = XGBClassifier(n_estimators=100, learning_rate=0.04, random_state=42)
xgb_classifier.fit(X_train, y_train)

xgb_preds = xgb_classifier.predict(X_test)

xgb_accuracy = accuracy_score(y_test, xgb_preds)
print(f"XGBoost Classifier - Accuracy: {xgb_accuracy * 100:.2f}%")
print(classification_report(y_test, xgb_preds))

XGBoost Classifier - Accuracy: 75.95%
              precision    recall  f1-score   support

           0       0.77      0.92      0.84        53
           1       0.73      0.42      0.54        26

    accuracy                           0.76        79
   macro avg       0.75      0.67      0.69        79
weighted avg       0.75      0.76      0.74        79



In [None]:
X.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,risk_function
count,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0
mean,3.292621,122.615776,70.64631,29.129771,155.885496,33.002036,0.52612,30.839695,539314.6
std,3.211645,30.822276,12.484668,10.507575,118.738199,7.214395,0.350386,10.199903,324073.7
min,0.0,56.0,24.0,7.0,14.0,0.0,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,77.0,28.4,0.27,23.0,299520.0
50%,2.0,119.0,70.0,29.0,125.0,33.2,0.452,27.0,457537.5
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,686201.6
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1924560.0


In [None]:
final_model = rf_model