# Explore here

In [150]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import uniform, norm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Bank Marketing Campaign dataset from URL
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(url)
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [151]:
# Split the data into x and y 
X = df.drop('Outcome', axis=1) 
y = df['Outcome']  # The target variable (subscription outcome) 

In [152]:
X.shape

(768, 8)

In [153]:
y.shape 

(768,)

In [154]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=315)

In [155]:
# Split 80% of data is train data

print(f"Total rows: {df.shape[0]}")
print(f"Dimensions of X_train: {X_train.shape}")
print(f"Dimensions of y_train: {y_train.shape}")
print(f"Proportion of train set: {X_train.shape[0]} / {df.shape[0]} = {X_train.shape[0]/df.shape[0]:.2f} = {X_train.shape[0]/df.shape[0]*100:.0f}%")    

Total rows: 768
Dimensions of X_train: (614, 8)
Dimensions of y_train: (614,)
Proportion of train set: 614 / 768 = 0.80 = 80%


In [156]:
# Split 20% of data is train data

print(f"Total rows: {df.shape[0]}")
print(f"Dimensions of X_test: {X_test.shape}")
print(f"Dimensions of y_test: {y_test.shape}")
print(f"Proportion of test set: {X_test.shape[0]} / {df.shape[0]} = {X_test.shape[0]/df.shape[0]:.2f} = {X_test.shape[0]/df.shape[0]*100:.0f}%")

Total rows: 768
Dimensions of X_test: (154, 8)
Dimensions of y_test: (154,)
Proportion of test set: 154 / 768 = 0.20 = 20%


In [157]:
# Drop original categorical columns
X_train.drop(["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], axis=1, inplace=True)

In [158]:
# Drop original categorical columns
X_test.drop(["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], axis=1, inplace=True)


In [159]:
X_train.head(10)    
    

765
74
733
740
0
690
152
495
654
700


In [160]:
print(X_train.shape)
print(y_train.shape)

(614, 0)
(614,)


In [161]:
print(df.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [162]:
y = df['Outcome']  # Replace with the correct column name


In [163]:
print(X.dtypes)
print(y.dtypes) 

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object
int64


In [164]:
y = y.astype('int')  # If numeric
    

In [165]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    

In [166]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)


In [167]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report
print(classification_report(y_test, y_pred))
    

Accuracy: 0.7207792207792207
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



In [168]:
# Define Random Forest with modified hyperparameters
rf = RandomForestClassifier(n_estimators=100,  # Number of trees in the forest
                             max_depth=10,    # Set the maximum depth of the tree
                             min_samples_split=5,  # Minimum samples required to split a node
                             random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
    

Accuracy: 0.7337662337662337
              precision    recall  f1-score   support

           0       0.80      0.79      0.79        99
           1       0.62      0.64      0.63        55

    accuracy                           0.73       154
   macro avg       0.71      0.71      0.71       154
weighted avg       0.73      0.73      0.73       154



In [None]:
# Define a range of values for max_depth and min_samples_split
max_depth_values = [5, 10, 15]
min_samples_split_values = [2, 5, 10]

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        rf = RandomForestClassifier(n_estimators=100,
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split,
                                     random_state=42)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"max_depth: {max_depth}, min_samples_split: {min_samples_split}, Accuracy: {accuracy}")

max_depth: 5, min_samples_split: 2, Accuracy: 0.7662337662337663
max_depth: 5, min_samples_split: 5, Accuracy: 0.7727272727272727
max_depth: 5, min_samples_split: 10, Accuracy: 0.7792207792207793
max_depth: 10, min_samples_split: 2, Accuracy: 0.7467532467532467
max_depth: 10, min_samples_split: 5, Accuracy: 0.7337662337662337
max_depth: 10, min_samples_split: 10, Accuracy: 0.7532467532467533
max_depth: 15, min_samples_split: 2, Accuracy: 0.7337662337662337
max_depth: 15, min_samples_split: 5, Accuracy: 0.7272727272727273
max_depth: 15, min_samples_split: 10, Accuracy: 0.7597402597402597
