# Preparing dataset4 for training (OHE and standardization)

## Import libraries and create the needed functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def ohe_and_standized(df):
    """Standardize the numeric columns and one-hot encode the object columns of ``df``.

    Columns picked by ``select_dtypes(include=['int', 'float'])`` are scaled with
    ``StandardScaler``; columns of dtype ``object`` are expanded with a dense
    ``OneHotEncoder``. Every other column is passed through unchanged. All
    transformers are fitted on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess.

    Returns
    -------
    pandas.DataFrame
        Transformed frame indexed like ``df``. Columns are ordered as: scaled
        numeric columns, one-hot encoded columns, then the passed-through columns.
    """
    # Split the columns by dtype; whatever is left over is passed through.
    num_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    handled = num_cols + cat_cols
    passthrough_cols = [name for name in df.columns if name not in handled]

    # sparse_output=False makes the encoder return a dense array.
    column_transformer = ColumnTransformer(
        transformers=[
            ('num', Pipeline([('scaler', StandardScaler())]), num_cols),
            ('cat', Pipeline([('onehot', OneHotEncoder(sparse_output=False))]), cat_cols),
        ],
        remainder='passthrough',
    )
    values = Pipeline([('preprocessor', column_transformer)]).fit_transform(df)

    # Names of the one-hot columns come from the fitted encoder; there are none
    # when the frame has no object columns.
    fitted = column_transformer.named_transformers_
    if 'cat' in fitted and cat_cols:
        encoder = fitted['cat'].named_steps['onehot']
        encoded_names = list(encoder.get_feature_names_out(input_features=cat_cols))
    else:
        encoded_names = []

    # Label the columns in the order the transformer emits them and restore the
    # original row index.
    out_columns = [*num_cols, *encoded_names, *passthrough_cols]
    return pd.DataFrame(values, columns=out_columns).set_index(df.index)

In [3]:
# Load the raw dataset from the working directory and print its column summary.
df = pd.read_csv('./dataset4.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205920 entries, 0 to 205919
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Year                               205920 non-null  int64  
 1   Cause of Death                     205920 non-null  object 
 2   State                              205920 non-null  object 
 3   State FIPS Code                    205920 non-null  object 
 4   HHS Region                         205920 non-null  int64  
 5   Age Range                          205920 non-null  object 
 6   Benchmark                          205920 non-null  object 
 7   Locality                           205920 non-null  object 
 8   Observed Deaths                    195708 non-null  float64
 9   Population                         200640 non-null  float64
 10  Expected Deaths                    195708 non-null  float64
 11  Potentially Excess Deaths          1957

### Of the 205920 instances, only 195708 have actual data; the rest contain only placeholder fields such as Year and State while the number of deaths is missing, so those rows are unnecessary.

In [4]:
# dropna() with its default arguments removes every row that has a missing value
# in any column; the summary below confirms the remaining row count.
df_without_nulls = df.dropna()
df_without_nulls.info()

<class 'pandas.core.frame.DataFrame'>
Index: 195708 entries, 0 to 205919
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Year                               195708 non-null  int64  
 1   Cause of Death                     195708 non-null  object 
 2   State                              195708 non-null  object 
 3   State FIPS Code                    195708 non-null  object 
 4   HHS Region                         195708 non-null  int64  
 5   Age Range                          195708 non-null  object 
 6   Benchmark                          195708 non-null  object 
 7   Locality                           195708 non-null  object 
 8   Observed Deaths                    195708 non-null  float64
 9   Population                         195708 non-null  float64
 10  Expected Deaths                    195708 non-null  float64
 11  Potentially Excess Deaths          195708 no

### We are only interested in the data for Stroke so we will remove the rest of the data.

In [5]:
# Keep only the rows whose cause of death is 'Stroke'.
is_stroke = df_without_nulls['Cause of Death'].eq('Stroke')
df_tidy = df_without_nulls.loc[is_stroke]

In [6]:
# Count the rows where the residual
#   Observed Deaths - Potentially Excess Deaths - Expected Deaths
# is below 0.1% of Observed Deaths. The comparison is done column-wise on the
# whole frame instead of row by row with iterrows(), which gives the same count
# without a Python-level loop.
# NOTE(review): the comparison is one-sided (no abs()), so any negative residual
# counts as "within tolerance" — confirm that this is intended.
residual = (
    df_tidy['Observed Deaths']
    - df_tidy['Potentially Excess Deaths']
    - df_tidy['Expected Deaths']
)
tolerance = df_tidy['Observed Deaths'] / 1000
count = int((residual < tolerance).sum())

print("how many instances are not in the 0.1% of the Observed:")
print(df_tidy.shape[0] - count)

how many instances are not in the 0.1% of the Observed:
3


##### Because we removed the other causes of death, we don't need the 'Cause of Death' feature.
##### Also, the feature 'State FIPS Code' does not give us any information that 'State' does not already provide, so we will remove this feature too.
##### 'Percent Potentially Excess Deaths' is just 'Potentially Excess Deaths'/'Observed Deaths', so we don't need it either.
##### 'Potentially Excess Deaths' is 'Observed Deaths' - 'Expected Deaths' (up to the tolerance checked above), so we will remove it too.

In [7]:
# Drop the four columns listed in the notes above and print the resulting summary.
redundant_columns = [
    "Percent Potentially Excess Deaths",
    "Cause of Death",
    "State FIPS Code",
    "Potentially Excess Deaths",
]
df_tidy = df_tidy.drop(columns=redundant_columns)
df_tidy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37971 entries, 3871 to 202175
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             37971 non-null  int64  
 1   State            37971 non-null  object 
 2   HHS Region       37971 non-null  int64  
 3   Age Range        37971 non-null  object 
 4   Benchmark        37971 non-null  object 
 5   Locality         37971 non-null  object 
 6   Observed Deaths  37971 non-null  float64
 7   Population       37971 non-null  float64
 8   Expected Deaths  37971 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 2.9+ MB


In [8]:
# Scale the numeric columns and one-hot encode the object columns with the helper
# defined above, then preview the first rows.
# NOTE(review): df_tidy is rebound to the transformed frame, so running this cell a
# second time would feed already-transformed data back into the helper.
df_tidy = ohe_and_standized(df_tidy)
df_tidy.head()

Unnamed: 0,Year,HHS Region,Observed Deaths,Population,Expected Deaths,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,State_California,...,Age Range_0-69,Age Range_0-74,Age Range_0-79,Age Range_0-84,Benchmark_2005 Fixed,Benchmark_2010 Fixed,Benchmark_Floating,Locality_All,Locality_Metropolitan,Locality_Nonmetropolitan
3871,-1.579099,1.351281,1.250827,0.930826,1.22023,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4216,-1.579099,-0.476644,0.695579,0.327557,0.820406,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4856,-1.579099,-0.476644,0.4122,0.048934,0.281764,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
5345,-1.579099,-0.476644,0.273148,-0.098378,0.072688,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
5647,-1.579099,1.716865,-0.190518,-0.222832,-0.176778,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [9]:
# Rename the 'Expected Deaths' column to 'Target'. The result is rebound rather
# than produced with inplace=True, so the cell does not mutate an existing frame.
df_tidy = df_tidy.rename(columns={'Expected Deaths': 'Target'})

## Create a CSV file from the processed data

In [10]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df_tidy.to_csv('procesed_dataset_4.csv', index=False)