In [113]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# End-to-end regression cell: load a CSV, impute and label-encode it, ask for
# a target column, then train and score a random forest on an 80/20 split.
# Every intermediate result is kept as a top-level name so it can be inspected
# or reused after the cell has run.

# Load the dataset
data = pd.read_csv('test.csv')

# Step 1: Handle Missing Values
# Drop columns with more than 50% missing values
# (thresh is the minimum number of non-missing values a column must have).
data = data.dropna(thresh=data.shape[0] * 0.5, axis=1)

# Fill remaining missing values: most frequent value for text columns,
# column mean for numeric columns.
# NOTE(review): these statistics are computed over all rows (including rows
# that later land in the test split), and a missing target value is filled
# with the column mean as well -- confirm this is acceptable for the use case.
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].fillna(data[column].mode()[0])
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    data[column] = data[column].fillna(data[column].mean())

# Step 2: Encode Categorical Variables
# One fitted encoder per text column is kept so the integer codes can be
# mapped back to the original labels later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Step 3: Choose the Target Column
# Allow user to specify the target variable
default_target_column = 'GrLivArea'  # Replace with your preferred default target column
# Strip surrounding whitespace so an answer such as "GrLivArea " is accepted;
# an empty (or whitespace-only) answer falls back to the default.
target_column = (
    input(f"Enter the target column (default is '{default_target_column}'): ").strip()
    or default_target_column
)

# Validate up front so nothing is fitted for an unusable target.
if target_column not in data.columns:
    print("Target column not found. Available columns:")
    print(data.columns.tolist())
    raise ValueError(f"Target column '{target_column}' not found. Please specify the correct target column from the list above.")

# Step 4: Split the Dataset
# Split row positions rather than the frames themselves: the partition depends
# only on the number of rows and the seed, so it is the same partition that
# train_test_split(X, y, test_size=0.2, random_state=42) would produce, and it
# is available before the scaler is fitted.
train_rows, test_rows = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

# Step 5: Feature Scaling
# Fit the scaler on the training rows only so that test-set statistics do not
# leak into preprocessing, then apply that transform to every row.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
numeric_data = data[numeric_columns]
scaler = StandardScaler()
scaler.fit(numeric_data.iloc[train_rows])
data_scaled = pd.DataFrame(
    scaler.transform(numeric_data),
    columns=numeric_columns,
    index=data.index,
)

# Features are the scaled numeric columns without the target; the target keeps
# its original (unscaled) values. errors='ignore' covers a target whose dtype
# kept it out of the scaled frame, which would otherwise raise KeyError here.
X = data_scaled.drop(columns=[target_column], errors='ignore')
y = data[target_column]

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Step 6: Train a Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 7: Evaluate the Model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)


Enter the target column (default is 'GrLivArea'):  GrLivArea


Root Mean Squared Error: 59.88612922278139
