# Preprocess the data

In [1]:
import os
import pandas as pd 
import numpy as np


# Location of the cleaned dataset that this notebook preprocesses.
CLEAN_DATA_PATH = r'C:/Users/HP/Desktop/Med_Cost_Prediction/Data/Processed/cleaned_data.csv'

# Fail fast with the offending path: merely printing a message here would let
# execution continue and crash below with an unrelated NameError on
# `clean_medcost_df`.
if not os.path.exists(CLEAN_DATA_PATH):
    raise FileNotFoundError(f"File not found: {CLEAN_DATA_PATH}")

# Import the clean dataset
clean_medcost_df = pd.read_csv(CLEAN_DATA_PATH)

# Display first few rows of the dataset
clean_medcost_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Encoding and reformatting 

### 1. Encode

In [2]:
def encode_columns(df):
    """
    Encodes the categorical columns of the DataFrame as numbers.

    'sex' becomes 1 for 'male' and 0 for any other value, 'smoker' becomes 1
    for 'yes' and 0 for any other value, and 'region' is replaced by dummy
    variables (the first category is dropped). The encoding is applied to a
    copy, so the DataFrame passed in is never modified.

    Parameters:
    df (DataFrame): The input DataFrame. It must contain the 'sex', 'smoker'
        and 'region' columns.

    Returns:
    DataFrame or None: A new DataFrame with the encoded columns, or None if
        the encoding failed (the error is printed, not raised).
    """
    try:
        # Work on a copy: assigning into `df` directly would overwrite the
        # caller's 'sex'/'smoker' columns, and a failure part-way through
        # would leave the caller's data half-encoded.
        encoded = df.copy()

        # Vectorized comparison; anything other than the exact label
        # (including missing values) maps to 0.
        encoded['sex'] = (encoded['sex'] == 'male').astype('int64')
        encoded['smoker'] = (encoded['smoker'] == 'yes').astype('int64')

        # Encode 'region' column using dummy variables
        return pd.get_dummies(encoded, columns=['region'], drop_first=True)

    except Exception as e:
        # Errors are reported on stdout and signalled to the caller by a
        # None return value.
        print(f"An error occurred: {e}")
        return None


### 2. Change the data types and save

In [3]:
def convert_to_int64_and_save(df, columns, file_path):
    """
    Converts the specified columns to 'int64' and saves the DataFrame to a CSV file.

    The conversion is done in place on `df`, one column at a time, so the
    caller's DataFrame holds the converted columns afterwards. Any error is
    printed rather than raised; if a conversion fails, columns converted
    before the failure stay converted and the file is not written.

    Parameters:
    df (DataFrame): The input DataFrame (modified in place).
    columns (list or str): The columns to convert. A single column name may
        be passed as a plain string.
    file_path (str): The path to save the CSV file.

    Returns:
    None
    """
    # A bare string would otherwise be iterated character by character and
    # each character looked up as a column name.
    if isinstance(columns, str):
        columns = [columns]

    try:
        # Convert the specified columns to 'int64'
        for column in columns:
            df[column] = df[column].astype('int64')

        # Save the DataFrame to a CSV file; index=False keeps the row index
        # out of the output.
        df.to_csv(file_path, index=False)

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == '__main__':

    # Destination of the model-ready dataset.
    output_csv = r'C:/Users/HP/Desktop/Med_Cost_Prediction/Data/For_Modeling/processed_data.csv'

    # Dummy columns expected from the 'region' encoding.
    # NOTE(review): this assumes the data holds a region that sorts before
    # 'northwest', since the first category is dropped during encoding --
    # confirm against the full dataset.
    region_dummy_columns = ['region_northwest', 'region_southeast', 'region_southwest']

    # Step 1: encode the categorical columns.
    processed_medcost_df = encode_columns(clean_medcost_df)

    # Step 2: cast the dummy columns to 'int64' and write the result to disk.
    convert_to_int64_and_save(processed_medcost_df, region_dummy_columns, output_csv)