# Preparing dataset2 for training (one-hot encoding and standardization)

## Import libraries and create the needed functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def ohe_and_standized(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the categorical columns and standardize the numeric ones.

    Numeric columns are scaled with ``StandardScaler`` and categorical
    columns (``object`` or ``category`` dtype) are expanded with
    ``OneHotEncoder``. Any other column is passed through untouched.
    Both transformers are fitted on ``df`` itself, so the scaling
    statistics and the category vocabulary come from the frame passed in.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new DataFrame sharing ``df``'s index. Columns are ordered as:
        scaled numeric columns, one-hot columns (named by the encoder's
        ``get_feature_names_out``), then the passed-through columns in
        their original relative order.
    """
    # 'number' matches every numeric width (int8..int64, unsigned, float16..),
    # not only the 32/64-bit types that 'int'/'float' select. pandas treats
    # timedelta64 as a numeric subtype, so it is excluded explicitly.
    numerical_features = df.select_dtypes(
        include='number', exclude=['timedelta']
    ).columns.tolist()
    # 'category' columns are categorical by definition and must be encoded
    # too, otherwise they would silently fall through to the passthrough.
    categorical_features = df.select_dtypes(
        include=['object', 'category']
    ).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # sparse_output=False makes the encoder return a dense array.
            ('cat', OneHotEncoder(sparse_output=False), categorical_features),
        ],
        remainder='passthrough',  # keep columns that are neither numeric nor categorical
    )
    transformed_data = preprocessor.fit_transform(df)

    # Rebuild the column labels in the order the ColumnTransformer emits
    # them: 'num' block, then 'cat' block, then the remainder.
    feature_names = list(numerical_features)
    if categorical_features:
        # ColumnTransformer fits clones, so the fitted encoder has to be
        # fetched from named_transformers_ rather than reused from above.
        fitted_encoder = preprocessor.named_transformers_['cat']
        feature_names.extend(
            fitted_encoder.get_feature_names_out(input_features=categorical_features)
        )

    handled_columns = set(numerical_features) | set(categorical_features)
    feature_names.extend(col for col in df.columns if col not in handled_columns)

    # Re-attach the original index so rows stay aligned with the input frame.
    return pd.DataFrame(transformed_data, columns=feature_names, index=df.index)

In [3]:
# Load the raw dataset2 table from the current working directory.
df = pd.read_csv(filepath_or_buffer='./dataset2.csv')
# df.info()  # uncomment to inspect column dtypes and non-null counts

### Scrolling through the data, I saw that the features 'lsoa11nm' and 'lsoa11nmw' appear to hold the same values, so let's check whether that is true.

In [4]:
# Element-wise inequality between the two name columns; True if any row differs.
are_different = df['lsoa11nm'].ne(df['lsoa11nmw']).any()

# Report the outcome of the comparison.
print(
    "lsoa11nm and lsoa11nmw have different values in some instances."
    if are_different
    else "lsoa11nm and lsoa11nmw have the same values in all instances."
)

lsoa11nm and lsoa11nmw have the same values in all instances.


### Let's check how many different values these features have.

In [5]:
# Print the number of distinct values in each candidate identifier column.
for column in ('lsoa11nm', 'ogc_fid', 'lsoa11cd'):
    print(df[column].nunique())

190
190
190


### 'ogc_fid', 'lsoa11cd', 'lsoa11nm' and 'lsoa11nmw' have a different value for every instance (190 different values), so they don't give us any more information than a basic index.

In [6]:
# Drop the per-row identifier columns, then preview the remaining features.
identifier_columns = ["ogc_fid", "lsoa11cd", "lsoa11nm", "lsoa11nmw"]
df = df.drop(columns=identifier_columns)
df.head()

Unnamed: 0,GPRegPop,Hypertens,Anxiety,Depression,Asthma,Obesity,Diabetes,CHD,Fall,Cancer,CKD,COPD,Stroke_TIA,AF
0,1495,165,163,175,110,102,63,47,51,25,17,36,33,19
1,1457,203,191,173,92,90,62,58,80,38,35,35,33,29
2,1343,190,191,171,104,96,68,53,50,25,24,29,23,21
3,1391,269,131,100,102,125,54,51,33,43,39,31,23,18
4,1459,265,161,133,92,124,96,87,85,41,58,50,51,38


In [18]:
# Replace the raw frame with its encoded/standardized version, then preview it.
df = ohe_and_standized(df=df)
df.head()

Unnamed: 0,GPRegPop,Hypertens,Anxiety,Depression,Asthma,Obesity,Diabetes,CHD,Fall,Cancer,CKD,COPD,Stroke_TIA,AF
0,-0.238206,-1.442131,0.217429,0.456886,0.306791,0.051091,-0.771539,-0.843554,-0.386809,-1.443102,-1.740483,-0.008236,0.056659,-1.128662
1,-0.452517,-0.518492,0.718173,0.425053,-0.486275,-0.271591,-0.821282,-0.212549,0.693433,-0.402687,-0.284404,-0.057136,0.056659,-0.208689
2,-1.095451,-0.834473,0.718173,0.39322,0.042436,-0.11025,-0.522825,-0.49937,-0.424058,-1.443102,-1.17423,-0.350538,-0.784368,-0.944667
3,-0.824742,1.085724,-0.354851,-0.736848,-0.045682,0.669567,-1.219226,-0.614098,-1.057303,-0.002527,0.039169,-0.252737,-0.784368,-1.22066
4,-0.441238,0.988499,0.181661,-0.211605,-0.486275,0.642677,0.869978,1.45101,0.879681,-0.162591,1.576142,0.676369,1.570506,0.619287


In [7]:
# Relabel the Stroke_TIA column as 'Target'.
df = df.rename(columns={'Stroke_TIA': 'Target'})

## Create a CSV file again from the processed data

In [9]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='procesed_dataset_2.csv', index=False)