# Diabetes Prediction Model - Data Preprocessing

## Import Libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


## Load Data

In [None]:
# Placeholder location of the raw CSV; point this at the real dataset before running.
file_path = '/path/to/your/dataset.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
diabetes_data = pd.read_csv(filepath_or_buffer=file_path)


## Preprocessing Steps

In [None]:
# Separating features and target variable
X = diabetes_data.drop('diabetes', axis=1)
y = diabetes_data['diabetes']

# Defining numerical and categorical features.
# NOTE: ColumnTransformer defaults to remainder='drop', so any column of X that
# is not listed here is silently left out of the preprocessed output.
numerical_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
categorical_features = ['gender', 'smoking_history']

# Choosing the train/test rows up front, by position, so the preprocessor can be
# fitted on the training rows only. Fitting on the full dataset would leak
# test-set statistics (imputation values, scaling mean/variance, category
# lists) into the training features.
train_positions, test_positions = train_test_split(
    list(range(len(X))), test_size=0.2, random_state=42
)

# Creating transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fitting are encoded as all-zero rows
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combining transformers into a preprocessor.
# sparse_threshold=0 forces a dense array; with the default threshold the
# result can be a SciPy sparse matrix, which pd.DataFrame(..., columns=...)
# below cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0
)

# Fitting on the training rows only, then transforming every row with the
# statistics learned from the training rows
preprocessor.fit(X.iloc[train_positions])
X_preprocessed = preprocessor.transform(X)

# Converting the processed data back to a dataframe.
# Output column order is: numerical features first, then the one-hot columns.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
column_names = numerical_features + list(onehot_encoder.get_feature_names_out(categorical_features))
# Keeping the original row labels so the features stay aligned with y
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=column_names, index=X.index)

# Splitting the data into training and testing sets using the positions chosen above
X_train = X_preprocessed_df.iloc[train_positions]
X_test = X_preprocessed_df.iloc[test_positions]
y_train = y.iloc[train_positions]
y_test = y.iloc[test_positions]


## Saving Preprocessed Data

In [None]:
# Destination CSV for the preprocessed training features.
preprocessed_data_path = '/mnt/data/diabetes_preprocessed_data.csv'

# Only X_train is written here; the row index is omitted from the file.
X_train.to_csv(path_or_buf=preprocessed_data_path, index=False)

## Conclusion

This notebook contains the steps for preprocessing the diabetes prediction dataset. It imputes missing values, one-hot encodes the categorical variables, standardizes the numerical features, splits the data into training and testing sets, and saves the preprocessed training features to a CSV file.