<a href="https://colab.research.google.com/github/Deptage/data_mining_project/blob/main/regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# One-Hot Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read the normalized real-estate dataset from the working directory.
ured_normalized = pd.read_csv('usa-real-estate-dataset-normalized.csv')

# Columns that will be expanded into one-hot indicator columns.
categorical_cols = ['state', 'city']

# Step 1: keep only the 200 most frequent cities so the one-hot matrix
# stays at a manageable width.
top_cities = ured_normalized['city'].value_counts().nlargest(200).index

# Every city outside that set is collapsed into a single placeholder category.
is_top_city = ured_normalized['city'].isin(top_cities)
ured_normalized['city'] = ured_normalized['city'].where(is_top_city, other='__other__')

# Step 2: fit the encoder and produce a sparse indicator matrix.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
enc.fit(ured_normalized[categorical_cols])
encoded_features = enc.transform(ured_normalized[categorical_cols])

# Column names of the form "<column>_<category>", e.g. "state_Alabama".
encoded_col_names = enc.get_feature_names_out(categorical_cols)

# Wrap the sparse matrix in a DataFrame carrying those names.
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_features,
    columns=encoded_col_names,
)

# Swap the raw categorical columns for their one-hot counterparts.
ured_encoded = pd.concat(
    [ured_normalized.drop(columns=categorical_cols), encoded_df],
    axis=1,
)

In [2]:
# Show the combined frame: the remaining original columns followed by the
# one-hot state_* / city_* indicator columns. The recorded output is a
# truncated view (first/last rows and columns) of the full frame.
ured_encoded

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,sold_before,years_since_sold,city_population,state_Alabama,...,city_Vancouver,city_Venice,city_Washington,city_West Palm Beach,city_Wilmington,city_Winchester,city_Woodbridge,city_York,city_Yukon,city___other__
0,105000.0,3.0,2.0,1.100000e-06,601.0,0.076813,0,0,0.000227,0,...,0,0,0,0,0,0,0,0,0,1.0
1,80000.0,4.0,2.0,7.000001e-07,601.0,0.108446,0,0,0.000227,0,...,0,0,0,0,0,0,0,0,0,1.0
2,67000.0,2.0,1.0,1.400000e-06,795.0,0.065555,0,0,0.000354,0,...,0,0,0,0,0,0,0,0,0,1.0
3,145000.0,4.0,2.0,9.000001e-07,731.0,0.119433,0,0,0.006060,0,...,0,0,0,0,0,0,0,0,0,1.0
4,179000.0,4.0,3.0,4.500000e-06,612.0,0.141611,0,0,0.000461,0,...,0,0,0,0,0,0,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616930,359900.0,4.0,2.0,3.200000e-06,99354.0,0.162622,1,2,0.003111,0,...,0,0,0,0,0,0,0,0,0,1.0
1616931,350000.0,3.0,2.0,9.000001e-07,99354.0,0.112216,1,2,0.003111,0,...,0,0,0,0,0,0,0,0,0,1.0
1616932,440000.0,6.0,3.0,4.900000e-06,99354.0,0.156118,1,2,0.003111,0,...,0,0,0,0,0,0,0,0,0,1.0
1616933,179900.0,2.0,1.0,8.000001e-07,99354.0,0.077617,1,2,0.003111,0,...,0,0,0,0,0,0,0,0,0,1.0


# Univariate Feature Selection for Regression

In [3]:
# Regression target: the listing price.
y = ured_encoded['price']

# Feature matrix: every column of the encoded frame except the target.
X_encoded = ured_encoded.drop(columns='price')

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score each feature against the price with f_regression.
# k='all' keeps every feature: the selector is used here only to obtain
# the per-feature scores, not to drop columns.
selector = SelectKBest(score_func=f_regression, k='all').fit(X_encoded, y)

# Label the scores with their column names and list them best-first.
feature_scores = pd.Series(data=selector.scores_, index=X_encoded.columns)
print(feature_scores.sort_values(ascending=False))



bath                  167435.796166
house_size            105755.866016
bed                    46826.495396
state_California       28604.914328
city_New York City     23311.510279
                          ...      
city_Chandler              0.107753
city_Dallas                0.088335
city_Bradenton             0.067913
state_Puerto Rico          0.044481
city_Cape Coral            0.039161
Length: 263, dtype: float64


In [1]:
# prompt: OK give me Ordinal Encoding for the columns city, state and zip_code and use a random forest regressor to check for column importance

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Reload the dataset so this cell starts from the un-encoded frame.
ured_normalized = pd.read_csv('usa-real-estate-dataset-normalized.csv')

# Columns that will be replaced by ordinal codes.
categorical_cols = ['city', 'state', 'zip_code']

# Step 1: keep the 200 most frequent cities; all others are folded into
# the '__other__' placeholder.
top_cities = ured_normalized['city'].value_counts().nlargest(200).index
ured_normalized['city'] = ured_normalized['city'].where(
    ured_normalized['city'].isin(top_cities),
    other='__other__',
)

# Step 2: ordinal encoding, overwriting the categorical columns in place.
# The encoder is configured to emit -1 for categories unseen during fit;
# here it is fitted and applied on the same frame.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ured_normalized[categorical_cols] = enc.fit_transform(ured_normalized[categorical_cols])

# Separate the target from the features and hold out 20% of rows for testing.
y = ured_normalized['price']
X = ured_normalized.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2137,
)

# Fit a 100-tree random forest with a fixed seed for reproducibility.
rf_model = RandomForestRegressor(n_estimators=100, random_state=2137)
rf_model.fit(X_train, y_train)

# Pair each feature name with its importance and rank them, highest first.
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)
feature_importance_df


Unnamed: 0,Feature,Importance
0,bed,0.236969
5,zip_code,0.187631
6,house_size,0.167859
1,bath,0.137261
2,acre_lot,0.079442
9,city_population,0.077886
3,city,0.037794
4,state,0.036868
8,years_since_sold,0.034561
7,sold_before,0.003728


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)

# Gradient-boosted tree regressor trained on the split produced by the
# train_test_split call earlier in the notebook (X_train / X_test /
# y_train / y_test must already exist in the kernel).
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=2137,
    enable_categorical=True,
    subsample=0.8,
    n_estimators=1000,
    min_child_weight=3,
    max_depth=7,
    learning_rate=0.01,
    gamma=0,
    colsample_bytree=0.5,
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Evaluate on the held-out rows. mean_squared_error, r2_score and numpy
# were previously used without being imported, which raised NameError.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target (price)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")