In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Preprocess the house-feature table: impute, scale and one-hot encode the
# selected columns, then write the transformed features together with the
# untouched SalePrice target to a new CSV file.
# ---------------------------------------------------------------------------

# Columns sent through the numeric branch (median imputation + scaling).
# The order here determines the order of the 'num' output columns.
numeric_features = [
    'TotalSF',
    'GarageInteraction',
    'TotalBathrooms',
    'TotalPorchArea',
    'HouseAge',
    'RemodeledAge',
    'RoomsPerSF',
    'WoodDeckSF',
    'OverallCond',
    'BsmtUnfSF',
    'MasVnrArea',
    'LotFrontage',
    'TotRmsAbvGrd',
    'GrLivArea',
    'OverallQual',
    'LotArea',
    'BsmtFinSF1',
]

# Columns sent through the categorical branch (mode imputation + one-hot).
categorical_features = ['GarageFinish']

# Load the input table.
file_path = './preprocessed_house_features.csv'
data = pd.read_csv(file_path)

# Separate the target from the predictors; the target is never transformed.
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

# Numeric branch: replace missing values with the column median, then
# standardize each column.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. drop='first' leaves out the indicator column of the
# first category of each feature.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

# Route each column group to its branch. Columns of X that appear in neither
# list are not passed through (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# NOTE: the imputation and scaling statistics are fitted on every row of the
# file, since the whole of X is passed to fit_transform here.
X_preprocessed = preprocessor.fit_transform(X)

# Rebuild a labelled frame from the transformed array, using the feature names
# generated by the fitted transformer.
feature_names = preprocessor.get_feature_names_out()
X_preprocessed_df = pd.DataFrame(data=X_preprocessed, columns=feature_names)

# The new frame has a fresh 0..n-1 index, so the target's index is reset to
# match before the two are placed side by side.
final_data = pd.concat(
    [X_preprocessed_df, y.reset_index(drop=True)],
    axis='columns',
)

# Write the result without the index column.
final_output_path = './final_preprocessed_house_features.csv'
final_data.to_csv(final_output_path, index=False)

print(f"Final preprocessed file saved as {final_output_path}")


Final preprocessed file saved as ./final_preprocessed_house_features.csv
