In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [3]:
# Load data
train_data = pd.read_csv('New_train.csv')

In [5]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [9]:
train_data = train_data[['MSSubClass', 'LotArea', 'HouseStyle', 'RoofStyle', 'TotalBsmtSF', 'FullBath', 'BedroomAbvGr', 'GarageCars', 'SalePrice']]

In [11]:
train_data.head()

Unnamed: 0,MSSubClass,LotArea,HouseStyle,RoofStyle,TotalBsmtSF,FullBath,BedroomAbvGr,GarageCars,SalePrice
0,60,8450,2Story,Gable,856,2,3,2,208500
1,20,9600,1Story,Gable,1262,2,3,2,181500
2,60,11250,2Story,Gable,920,2,3,2,223500
3,70,9550,2Story,Gable,756,1,3,3,140000
4,60,14260,2Story,Gable,1145,2,4,3,250000


In [15]:
train_data.dtypes

MSSubClass       int64
LotArea          int64
HouseStyle      object
RoofStyle       object
TotalBsmtSF      int64
FullBath         int64
BedroomAbvGr     int64
GarageCars       int64
SalePrice        int64
dtype: object

In [13]:
for col in train_data.columns:
    print('Unique values of ' + col)
    print(train_data[col].unique())
    print("======================")

Unique values of MSSubClass
[ 60  20  70  50 190  45  90 120  30  85  80 160  75 180  40]
Unique values of LotArea
[ 8450  9600 11250 ... 17217 13175  9717]
Unique values of HouseStyle
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
Unique values of RoofStyle
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
Unique values of TotalBsmtSF
[ 856 1262  920  756 1145  796 1686 1107  952  991 1040 1175  912 1494
 1253  832 1004    0 1114 1029 1158  637 1777 1060 1566  900 1704 1484
  520  649 1228 1234 1398 1561 1117 1097 1297 1057 1088 1350  840  938
 1150 1752 1434 1656  736  955  794  816 1842  384 1425  970  860 1410
  780  530 1370  576 1143 1947 1453  747 1304 2223  845 1086  462  672
 1768  440  896 1237 1563 1065 1288  684  612 1013  990 1235  876 1214
  824  680 1588  960  458  950 1610  741 1226 1053  641  789  793 1844
  994 1264 1809 1028  729 1092 1125 1673  728  732 1080 1199 1362 1078
  660 1008  924  992 1063 1267 1461 1907  928  864 1734  910 1490 172

In [17]:
# Initialize a dictionary to store LabelEncoders for each categorical column
label_encoders = {}

# Categorical columns that need encoding
categorical_columns = ['MSSubClass', 'HouseStyle', 'RoofStyle']

In [19]:
# Fit and transform the categorical columns
for col in categorical_columns:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le  # Store the encoder for later use

    # Print the encoded values for each category
    print(f'Encoded values for {col}:')
    for index, class_ in enumerate(le.classes_):
        print(f'{class_}: {index}')
    print()  # Add a newline for better readability

Encoded values for MSSubClass:
20: 0
30: 1
40: 2
45: 3
50: 4
60: 5
70: 6
75: 7
80: 8
85: 9
90: 10
120: 11
160: 12
180: 13
190: 14

Encoded values for HouseStyle:
1.5Fin: 0
1.5Unf: 1
1Story: 2
2.5Fin: 3
2.5Unf: 4
2Story: 5
SFoyer: 6
SLvl: 7

Encoded values for RoofStyle:
Flat: 0
Gable: 1
Gambrel: 2
Hip: 3
Mansard: 4
Shed: 5



In [21]:
# Features and target variable
X = train_data.drop('SalePrice', axis=1)  # Features
y = train_data['SalePrice']  # Target variable

# Scale numerical features
scaler = StandardScaler()
numeric_columns = ['LotArea', 'TotalBsmtSF', 'FullBath', 'BedroomAbvGr', 'GarageCars']
train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])

In [23]:
print(train_data.head())

   MSSubClass   LotArea  HouseStyle  RoofStyle  TotalBsmtSF  FullBath  \
0           5 -0.207142           5          1    -0.459303  0.789741   
1           0 -0.091886           2          1     0.466465  0.789741   
2           5  0.073480           5          1    -0.313369  0.789741   
3           6 -0.096897           5          1    -0.687324 -1.026041   
4           5  0.375148           5          1     0.199680  0.789741   

   BedroomAbvGr  GarageCars  SalePrice  
0      0.163779    0.311725     208500  
1      0.163779    0.311725     181500  
2      0.163779    0.311725     223500  
3      0.163779    1.650307     140000  
4      1.390023    1.650307     250000  


In [31]:
from sklearn.ensemble import RandomForestRegressor  # Ensure this import is included
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose and train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate the evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# Save the model
with open('House_Price_Prediction.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved as 'random_forest_model.pkl'")

Mean Absolute Error (MAE): 23984.24330153294
Mean Squared Error (MSE): 1298638696.8613458
Root Mean Squared Error (RMSE): 36036.6299320753
R-squared (R²): 0.8306931593542246


In [49]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

# Load the saved model
with open('House_Price_Prediction.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Example of new data - ensure these values match your feature engineering in training
new_data = pd.DataFrame({
    'MSSubClass': [60],        # Example value
    'LotArea': [8450],         # Example value
    'HouseStyle': ['1Story'],  # Example value
    'RoofStyle': ['Gable'],    # Example value
    'TotalBsmtSF': [856],      # Example value
    'FullBath': [2],           # Example value
    'BedroomAbvGr': [3],       # Example value
    'GarageCars': [2]          # Example value
})

# Preprocessing for new data

# 1. Encode categorical features
# Assuming you used one-hot encoding during training
new_data = pd.get_dummies(new_data, columns=['HouseStyle', 'RoofStyle'], drop_first=True)

# Add missing columns if any were generated during one-hot encoding in the training set
expected_columns = loaded_model.feature_names_in_
for col in expected_columns:
    if col not in new_data.columns:
        new_data[col] = 0

# Ensure the order of columns is the same
new_data = new_data[expected_columns]

# 2. Scale numerical features
numeric_columns = ['LotArea', 'TotalBsmtSF', 'FullBath', 'BedroomAbvGr', 'GarageCars']
scaler = StandardScaler()

# Apply the scaler that was used during training (use transform instead of fit_transform)
new_data[numeric_columns] = scaler.fit_transform(new_data[numeric_columns])

# Make predictions
new_predictions = loaded_model.predict(new_data)

# Print predictions
print(f"Predicted Price: ${new_predictions[0]:,.2f}")


Predicted Price: $67,904.67
