# Real Estate Prices
The purpose of this machine learning model is to predict fair market prices for real estate sales and rentals. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
df = pd.read_csv("AmesHousing.csv")

# Data exploration 
print(df.head())
print(df.info())
print(df.columns)

   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0      1  526301100           20        RL         141.0     31770   Pave   
1      2  526350040           20        RH          80.0     11622   Pave   
2      3  526351010           20        RL          81.0     14267   Pave   
3      4  526353030           20        RL          93.0     11160   Pave   
4      5  527105010           60        RL          74.0     13830   Pave   

  Alley Lot Shape Land Contour  ... Pool Area Pool QC  Fence Misc Feature  \
0   NaN       IR1          Lvl  ...         0     NaN    NaN          NaN   
1   NaN       Reg          Lvl  ...         0     NaN  MnPrv          NaN   
2   NaN       IR1          Lvl  ...         0     NaN    NaN         Gar2   
3   NaN       Reg          Lvl  ...         0     NaN    NaN          NaN   
4   NaN       IR1          Lvl  ...         0     NaN  MnPrv          NaN   

  Misc Val Mo Sold Yr Sold Sale Type  Sale Condition  SalePrice  
0       

In [3]:
# One-hot encode one column for each neighborhood and rejoin to dataframe 
ohe = pd.get_dummies(df['Neighborhood'], prefix="Neighborhood")

# Dimensionality reduction to 4 critical factors 
X = df[['Lot Area', 'Year Built', 'Gr Liv Area']]
X = X.join(ohe)

y = df['SalePrice']

print(X.columns)


Index(['Lot Area', 'Year Built', 'Gr Liv Area', 'Neighborhood_Blmngtn',
       'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide',
       'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor',
       'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_Greens',
       'Neighborhood_GrnHill', 'Neighborhood_IDOTRR', 'Neighborhood_Landmrk',
       'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes',
       'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge',
       'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU',
       'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst',
       'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker'],
      dtype='object')


In [4]:
#Training data v Test data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=32)


In [5]:
# Creates a LinearRegression model,
mlr = LinearRegression()

In [6]:
# Finds the coefficients(m) and the intercept value(b)
mlr.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:
# Inputs values calculated by `.fit()` and the `x` values, plugs them into the multiple linear regression equation, and calculates the predicted y values.
y_predict = mlr.predict(X_test)

In [8]:
# Model 
# Housing features to see a fair market price

features = [
    'Lot Area', 'Year Built', 'Gr Liv Area',
    'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale',
    'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
    'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert',
    'Neighborhood_Greens', 'Neighborhood_GrnHill', 'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel',
    'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes',
    'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown',
    'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW',
    'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber',
    'Neighborhood_Veenker'
]

# Ask user for input
neighborhood = input("Enter neighborhood (e.g., ClearCr, NWAmes, Veenker): ")
lot_area = input("Enter lot area (e.g., 3000, 5000, 7000): ")
year_built = input("Enter year built (e.g., 1925, 1975, 2025): ")
gr_liv_area = input("Enter square feet of the house (e.g., 1000, 2000, 3000): ")

# Build a dataframe with zeros
row = pd.DataFrame([[0] * len(features)], columns=features)

# Fill numeric features
row.loc[0, 'Lot Area'] = lot_area
row.loc[0, 'Year Built'] = year_built
row.loc[0, 'Gr Liv Area'] = gr_liv_area

# Activate the correct one-hot neighborhood
col_name = f'Neighborhood_{neighborhood}'
if col_name in row.columns:
    row.loc[0, col_name] = 1
else:
    print(f"Warning: {neighborhood} not a valid neighborhood")

# Predict
prediction = mlr.predict(row)
print("Predicted price:", "$",round(prediction[0], 2))

Predicted price: $ 330882.89


  row.loc[0, 'Lot Area'] = lot_area
  row.loc[0, 'Year Built'] = year_built
  row.loc[0, 'Gr Liv Area'] = gr_liv_area


In [9]:
# Evaluation 
print("Train score:", mlr.score(X_train, y_train))
print("Test score:", mlr.score(X_test, y_test))

Train score: 0.7698621585429828
Test score: 0.7561200284046441


# Next Steps 
## Evaluation 
Competition accuracy should be > 90% 
Objective good: Test R² ≥ 0.80, RMSE ≤ $20k.
For prototype models > 70% is satisfactory 
If train R² ≈ 0.95 but test R² ≈ 0.70, you’re overfitting.
If train R² is +- 5% of the test R², the model generalizes well, even if it’s not state-of-the-art.
Tree-based models (Random Forest, Gradient Boosting, XGBoost, LightGBM, CatBoost) can push R² to 0.85–0.95

## Refactored Selling Price Model 
x = sales_features = ['address', 'city', 'state', 'zip_code', 'latitude', 'longitude', 'acres', 'lot_size', 'square_feet', 'building_age', ['amenities'], ['schools'], ['universities']]
y = selling_price = []

## Income Based Valuation Model 
x = income_features = ['revenue', 'salaries & fees', 'property_taxes', 'insurance', 'repairs_and_maintenance']
y = income_based_valuation = []

## Rent Prediction Model
x = rent_features = ['square_feet', 'bedrooms', 'bathrooms', 'building_age']
y = rent = []