In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
diabetes_data = pd.read_csv('diabetes.csv')

# Preprocess the data: turn every categorical column into integer codes.
categorical_columns = ['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 
                       'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability',
                       'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity',
                       'class']

# Fit one encoder PER column and keep them all. Re-fitting a single shared
# LabelEncoder discards the mapping of every column except the last one, which
# makes it impossible to encode new samples (or decode predictions for any
# feature) with the same mapping that was used for training.
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    diabetes_data[col] = encoder.fit_transform(diabetes_data[col])
    label_encoders[col] = encoder

# Backward-compatible name: previously `label_encoder` ended up fitted on the
# last column ('class'), so keep it bound to that column's encoder.
label_encoder = label_encoders['class']

# Separate the predictors from the target column.
features = diabetes_data.drop('class', axis=1)
y = diabetes_data['class']

# Split the data into train and test sets BEFORE scaling, so that the held-out
# rows do not contribute to the mean/variance learned by the scaler.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)
X_test = scaler.transform(features_test)

# Keep `X` available as the scaled array of all rows (as it was before), now
# produced with the train-only scaler statistics.
X = scaler.transform(features)

# Hyperparameters for the Random Forest; the depth / split / leaf limits act as
# regularization by restricting how far each tree can grow.
rf_params = {
    'n_estimators': 100,       # trees in the forest
    'max_depth': 5,            # maximum depth of each tree
    'min_samples_split': 5,    # samples needed before a node may be split
    'min_samples_leaf': 2,     # samples that must remain in every leaf
    'max_features': 'sqrt',    # features considered when searching for a split
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training portion
rf_model.fit(X_train, y_train)

# Mean accuracy on the held-out portion
accuracy = rf_model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9519230769230769


In [6]:
import joblib

# Persist the trained model and the label encoder to disk.
# Note: `label_encoder` is a single LabelEncoder whose most recent fit was on
# the 'class' column, so that is the mapping stored in label_encoder.pkl.
for artifact, filename in (
    (rf_model, 'rf_model.pkl'),
    (label_encoder, 'label_encoder.pkl'),
):
    joblib.dump(artifact, filename)

# Persist the scaler last; as the final expression its return value is what
# the cell displays.
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']