# Diabetes Prediction Model Training

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

## 2. Load Data

In [None]:
# Load the diabetes dataset from a path relative to the working directory.
csv_path = 'data/diabetes.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows; as the last expression it becomes the cell output.
df.head(n=5)

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()
# Compute per-column descriptive statistics and show them as the cell output.
summary_stats = df.describe()
summary_stats

In [None]:
# Bar chart of the number of rows in each 'Outcome' class.
outcome_ax = sns.countplot(data=df, x='Outcome')
# countplot returns the Axes it drew on, so the title is set on it directly.
outcome_ax.set_title('Distribution of Outcome')
plt.show()

In [None]:
# Draw a 10-bin histogram for each column of the frame on a 10x10 inch figure
# to inspect the individual distributions.
hist_axes = df.hist(figsize=(10, 10), bins=10)
plt.show()

## 4. Data Preprocessing

In [None]:
# Target vector: the 'Outcome' column.
y = df['Outcome']
# Feature matrix: every remaining column (df itself is left unmodified).
X = df.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split is reproducible. stratify=y keeps the 'Outcome' class proportions the
# same in the train and test sets, so the reported metrics are not distorted
# by a split that happens to over- or under-sample one class.
# NOTE(review): stratification requires at least two rows per class in y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 5. Model Training

### 5.1 Logistic Regression

In [None]:
# Fit a logistic regression classifier on the training split. max_iter=1000
# caps the number of solver iterations. fit() returns the estimator itself,
# so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
log_reg

### 5.2 Random Forest

In [None]:
# Fit a random forest of 100 trees on the training split; random_state fixes
# the seed so repeated runs build the same forest. fit() returns the estimator
# itself, so construction and training are chained.
rand_forest = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
rand_forest

## 6. Model Evaluation

### 6.1 Logistic Regression

In [None]:
# Evaluate the logistic regression model on the held-out test split.
y_pred_log_reg = log_reg.predict(X_test)

# Fraction of test rows predicted correctly.
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print('Logistic Regression Accuracy:', log_reg_accuracy)

# Confusion matrix of true vs. predicted classes.
log_reg_confusion = confusion_matrix(y_test, y_pred_log_reg)
print(log_reg_confusion)

# Text report of per-class metrics.
log_reg_report = classification_report(y_test, y_pred_log_reg)
print(log_reg_report)

### 6.2 Random Forest

In [None]:
# Evaluate the random forest model on the held-out test split.
y_pred_rand_forest = rand_forest.predict(X_test)

# Fraction of test rows predicted correctly.
rand_forest_accuracy = accuracy_score(y_test, y_pred_rand_forest)
print('Random Forest Accuracy:', rand_forest_accuracy)

# Confusion matrix of true vs. predicted classes.
rand_forest_confusion = confusion_matrix(y_test, y_pred_rand_forest)
print(rand_forest_confusion)

# Text report of per-class metrics.
rand_forest_report = classification_report(y_test, y_pred_rand_forest)
print(rand_forest_report)

In [None]:
# Serialize the fitted logistic regression model to a pickle file in the
# current working directory; the context manager closes the file afterwards.
with open('logistic_regression_model.pkl', 'wb') as model_file:
    pickle.dump(log_reg, model_file)

In [None]:
# Serialize the fitted random forest model to a pickle file in the current
# working directory; the context manager closes the file afterwards.
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rand_forest, model_file)