# Preparing the dataset 1 for training (OHE and standardizarion)

## Import libraries and create the needed functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def ohe_and_standized(df):
    # Separate numerical and categorical features
    numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    
    # Create transformers for preprocessing
    numerical_transformer = Pipeline([('scaler', StandardScaler())])
    categorical_transformer = Pipeline([('onehot', OneHotEncoder(sparse_output=False))])  # Set sparse=False to get dense array
    
    # Combine transformers into a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # Preserve the remaining features
    )
        
    # Create a pipeline to apply preprocessing
    pipeline = Pipeline([('preprocessor', preprocessor)])
    transformed_data = pipeline.fit_transform(df)
    
    # Get feature names after preprocessing
    feature_names = []
    feature_names.extend(numerical_features)
    
    # Add one-hot encoded categorical feature names
    if 'cat' in preprocessor.named_transformers_ and categorical_features:
        one_hot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
        categorical_feature_names = one_hot_encoder.get_feature_names_out(input_features=categorical_features)
        feature_names.extend(categorical_feature_names)
    
    # Add any other remaining columns (e.g., columns you didn't process)
    remaining_cols = [col for col in df.columns if col not in numerical_features + categorical_features]
    feature_names.extend(remaining_cols)
    
    # Convert the transformed data back to a DataFrame for visualization
    transformed_df = pd.DataFrame(transformed_data, columns=feature_names)
    
    # set the indexes as the original dataset
    transformed_df = transformed_df.set_index(df.index)
    
    return transformed_df

In [3]:
df = pd.read_csv('./dataset1.csv', encoding='latin1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2188 entries, 0 to 2187
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Year                      2188 non-null   object 
 1   County                    2186 non-null   object 
 2   Hospital                  2188 non-null   object 
 3   OSHPDID                   2184 non-null   object 
 4   Measure                   2188 non-null   object 
 5   Risk Adjusted Rate        2178 non-null   float64
 6   # of Deaths/Readmissions  2178 non-null   float64
 7   # of Cases                2178 non-null   float64
 8   Hospital Ratings          2170 non-null   object 
 9   Location 1                2180 non-null   object 
dtypes: float64(3), object(7)
memory usage: 171.1+ KB


### Our target will be hospitals rating

In [4]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2170 entries, 2 to 2187
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Year                      2170 non-null   object 
 1   County                    2170 non-null   object 
 2   Hospital                  2170 non-null   object 
 3   OSHPDID                   2170 non-null   object 
 4   Measure                   2170 non-null   object 
 5   Risk Adjusted Rate        2170 non-null   float64
 6   # of Deaths/Readmissions  2170 non-null   float64
 7   # of Cases                2170 non-null   float64
 8   Hospital Ratings          2170 non-null   object 
 9   Location 1                2170 non-null   object 
dtypes: float64(3), object(7)
memory usage: 186.5+ KB


In [5]:

# Mapping dictionary
ratings_mapping = {
    'As Expected': 0,
    'Better': 1,
    'Worse': -1
}

# Apply the mapping and explicitly downcast
df['Hospital Ratings'] = df['Hospital Ratings'].map(ratings_mapping)


### We will also separate the location into latitude and longitude

In [6]:
# Remove parentheses and split into latitude and longitude
df[['latitude', 'longitude']] = df['Location 1'].str.strip('()').str.replace(',', ', ').str.split(', ', expand=True)

# Convert to appropriate data types
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)

# Drop the original 'Location 1' column if no longer needed
df = df.drop(columns=['Location 1'])

In [7]:
df = ohe_and_standized(df)
df.head()

Unnamed: 0,Risk Adjusted Rate,# of Deaths/Readmissions,# of Cases,Hospital Ratings,latitude,longitude,Year_2011-2012,Year_2012-2013,Year_2013-2014,Year_2014-2015,...,OSHPDID_106560492,OSHPDID_106560501,OSHPDID_106560508,OSHPDID_106560525,OSHPDID_106560529,OSHPDID_106571086,OSHPDID_106574010,OSHPDID_106580996,Measure_30-day Mortality,Measure_30-day Readmission
2,2.066951,-0.488349,-0.555044,0.012656,0.875213,-1.205917,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.293607,-0.53174,-0.538221,0.012656,0.875213,-1.205917,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.208244,-0.372639,0.504828,0.012656,0.917671,-1.207936,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,-0.367267,-0.444958,0.538474,0.012656,0.917671,-1.207936,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.332158,-0.235233,1.166547,0.012656,0.846002,-1.126424,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
df.rename(columns={'Hospital Ratings': 'Target'}, inplace=True)

## create a CSV file again from the procesed data

In [9]:
df.to_csv('procesed_dataset_1.csv', index=False)