In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Load the processed housing dataset produced by the earlier preprocessing step
processed_csv_path = "../data/processed/data_processed_housing_dataset.csv"
df = pd.read_csv(processed_csv_path)

In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(4600, 16)

In [4]:

# Preview the first five rows of the loaded frame
df.head()


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,log_price,zipcode_freq,city_freq
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,12.653962,93,123
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,14.684291,49,1573
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,12.742569,100,185
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,12.948012,50,286
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,13.217675,135,235


In [5]:
# Per-column dtypes and non-null counts (used to confirm there are no missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          4600 non-null   float64
 1   bedrooms       4600 non-null   float64
 2   bathrooms      4600 non-null   float64
 3   sqft_living    4600 non-null   int64  
 4   sqft_lot       4600 non-null   int64  
 5   floors         4600 non-null   float64
 6   waterfront     4600 non-null   int64  
 7   view           4600 non-null   int64  
 8   condition      4600 non-null   int64  
 9   sqft_above     4600 non-null   int64  
 10  sqft_basement  4600 non-null   int64  
 11  yr_built       4600 non-null   int64  
 12  yr_renovated   4600 non-null   int64  
 13  log_price      4600 non-null   float64
 14  zipcode_freq   4600 non-null   int64  
 15  city_freq      4600 non-null   int64  
dtypes: float64(5), int64(11)
memory usage: 575.1 KB


In [6]:
# Columns holding a single distinct value carry no information for modelling
distinct_counts = df.nunique()
constant_cols = distinct_counts[distinct_counts == 1].index.tolist()
print(constant_cols)

[]


In [None]:
# Total square footage of the property: interior living area plus the lot.
# In the rows shown by df.head(), sqft_living equals sqft_above + sqft_basement,
# so adding those two on top of sqft_living would count the interior twice.
df['total_sqft'] = df['sqft_living'] + df['sqft_lot']

In [None]:
# Derived age / renovation features.
# The sale year is optional: the frame described by df.info() has no 'yr_sold'
# column, so every feature that needs it is skipped instead of raising KeyError.
has_sale_year = 'yr_sold' in df.columns

# Age of the house at the time of sale
if has_sale_year and 'yr_built' in df.columns:
    df['age_at_sale'] = df['yr_sold'] - df['yr_built']

if 'yr_renovated' in df.columns:
    # A yr_renovated value of 0 is treated as "never renovated"
    was_renovated = df['yr_renovated'] != 0
    df['renovated'] = was_renovated.astype('int64')

    # Years since renovation; never-renovated houses get 0 rather than
    # (yr_sold - 0), which would otherwise yield a value of roughly 2000 years.
    if has_sale_year:
        df['years_since_renovation'] = (
            (df['yr_sold'] - df['yr_renovated'])
            .where(was_renovated, 0)
            .fillna(0)
        )


In [None]:
# Remove highly correlated features to avoid multicollinearity (|correlation| > 0.9).
# The target columns are kept out of the correlation matrix: 'log_price' has to
# survive for the feature-selection step, and 'price' is the same quantity on
# the raw scale, so neither may be a candidate for (or a cause of) a drop here.
CORR_THRESHOLD = 0.9
target_cols = [col for col in ('price', 'log_price') if col in df.columns]

corr_matrix = df.drop(columns=target_cols).corr(numeric_only=True).abs()
# Strict upper triangle only: each pair is inspected once, and of two correlated
# columns the one that appears later in the frame is the one that gets dropped.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper_tri.columns if (upper_tri[col] > CORR_THRESHOLD).any()]

# Rebind instead of mutating in place so the cell has no hidden side effects
df = df.drop(columns=to_drop)
print(f"Dropped highly correlated features: {to_drop}")


In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

TARGET = 'log_price'
MAX_FEATURES = 30   # upper bound on the number of features to keep
RANDOM_STATE = 42   # fixes the noise mutual_info_regression adds internally

# Split features and target.
# 'price' is the raw-scale version of the target; leaving it in X would leak
# the answer into the feature set, so it is removed together with the target.
leakage_cols = [col for col in ('price',) if col in df.columns]
X = df.drop(columns=[TARGET, *leakage_cols])
y = df[TARGET]


def _mi_score(features, target):
    """Score features by mutual information with a fixed seed for reproducibility."""
    return mutual_info_regression(features, target, random_state=RANDOM_STATE)


# Select the top features using mutual information. k is capped at the number
# of available columns because asking for more than exist raises (or warns,
# depending on the scikit-learn version).
selector = SelectKBest(score_func=_mi_score, k=min(MAX_FEATURES, X.shape[1]))
X_selected = selector.fit_transform(X, y)

# Get selected column names
selected_features = X.columns[selector.get_support()]
X = X[selected_features]

print(f"Selected features:\n{selected_features.tolist()}")


In [None]:
# Re-attach the target to the selected feature matrix (new frame, X is untouched)
final_df = X.assign(log_price=y)

# Write the result into the final-data folder, creating the folder if needed.
# NOTE(review): the file is named 'ames_final.csv' although the input is the
# processed housing dataset — confirm the name with downstream consumers.
output_dir = '../data/final'
os.makedirs(output_dir, exist_ok=True)
final_df.to_csv('../data/final/ames_final.csv', index=False)
print("✅ Final dataset with selected features saved to ../data/final/ames_final.csv")


In [None]:
# A notebook cell renders only its last bare expression, so the preview has to
# be displayed explicitly; otherwise the head() result is silently discarded.
display(final_df.head())
final_df.shape
