In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 

In [3]:
df = pd.read_csv('train.csv')

In [4]:
categorical_columns = df.select_dtypes(include=['object']).columns

In [5]:
# Encode categorical data
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le



In [6]:

#Fill missing numerical values with median
df.fillna(df.median(), inplace=True)

In [7]:
df = df.astype(float)


In [8]:
# Example: Remove outliers in 'GrLivArea'
df = df[df['GrLivArea'] < 4000]

In [9]:
df['PricePerSqFt'] = df['SalePrice'] / df['GrLivArea']

In [10]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,PricePerSqFt
0,1.0,60.0,3.0,65.0,8450.0,1.0,2.0,3.0,3.0,0.0,...,3.0,4.0,4.0,0.0,2.0,2008.0,8.0,4.0,208500.0,121.929825
1,2.0,20.0,3.0,80.0,9600.0,1.0,2.0,3.0,3.0,0.0,...,3.0,4.0,4.0,0.0,5.0,2007.0,8.0,4.0,181500.0,143.819334
2,3.0,60.0,3.0,68.0,11250.0,1.0,2.0,0.0,3.0,0.0,...,3.0,4.0,4.0,0.0,9.0,2008.0,8.0,4.0,223500.0,125.139978
3,4.0,70.0,3.0,60.0,9550.0,1.0,2.0,0.0,3.0,0.0,...,3.0,4.0,4.0,0.0,2.0,2006.0,8.0,0.0,140000.0,81.537566
4,5.0,60.0,3.0,84.0,14260.0,1.0,2.0,0.0,3.0,0.0,...,3.0,4.0,4.0,0.0,12.0,2008.0,8.0,4.0,250000.0,113.739763


Model Training

In [12]:
from sklearn.model_selection import train_test_split

X = df.drop('SalePrice', axis=1)
y = df['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
#train models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

In [16]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)


In [17]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    rmse = mean_squared_error(y, y_pred, squared=False)
    r2 = r2_score(y, y_pred)
    return rmse, r2

In [18]:
print("Linear Regression:", evaluate_model(lr, X_val, y_val))

Linear Regression: (15357.6972176337, 0.9550609633332893)


In [19]:
print("Random Forest:", evaluate_model(rf, X_val, y_val))

Random Forest: (10854.885021921164, 0.9775497070920511)


#Hyperparameter Tuning 

In [21]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4] 
}

In [27]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

In [28]:
best_model = grid_search.best_estimator_

In [29]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Best Score:  -334021883.9089541


In [30]:
print(":", evaluate_model(best_model, X_val, y_val))

: (11801.339942552675, 0.9734640770512738)


In [31]:
import joblib

joblib.dump(best_model, 'house_price_model.pkl')

['house_price_model.pkl']