In [96]:
import  pandas as pd
import  numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from  sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from  sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from  sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import seaborn as sns

In [97]:
price = pd.read_csv("house_Price.csv")


def clean_data(df = pd.DataFrame):
    #turning True to 1 and False to 0
    df["Elevator"] = df["Elevator"].map({True:1, False:0})
    df["Warehouse"] = df["Warehouse"].map({True:1, False:0})
    df["Parking"] = df["Parking"].map({True:1, False:0})

    #Removing unnecessary columns
    df.drop(columns =["Price", "Address"], inplace = True)

    #Removing commas from Area column and converting to integer
    df["Area"] = df["Area"].str.replace(",", "")
    df["Area"] = df["Area"].astype("Int64")

    #adding the mean distance for missing values
    df["Distance_from_Tehran_km"] = df["Distance_from_Tehran_km"].fillna(df["Distance_from_Tehran_km"].mean())

    #removing Area that is 1,000,000 or greater cause it doesn't make sense
    #outliers
    df = df.loc[df["Area"] < 1000000]  
    df = df.copy()
    # feature engineering
    #df["Price_per_Area"] = df["Price(USD)"] / df["Area"]
    df["Facilities"] = df["Elevator"] + df["Warehouse"] + df["Parking"]
    df["Area_per_Room"] = np.where(df["Room"] == 0, 0, df["Area"] / df["Room"])
    df["Accessibility_Score"] = 1 / (df["Distance_from_Tehran_km"] + 1)
    
    return df

data = clean_data(price)


In [98]:
#Making Flash Cards Columns that are to be predicted
x = data.drop(columns=["Price(USD)"])
y = data["Price(USD)"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25, random_state=50)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [99]:
def choose_best_model(x_train, y_train):
    param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]}

    model = RandomForestRegressor()
    grid = GridSearchCV(model, param_grid, cv=5, scoring="r2", n_jobs=-1)
    grid.fit(x_train, y_train)
    print("Best parameters:", grid.best_params_)
    print("Best R² score:", grid.best_score_)

    return grid.best_estimator_

trained_model = choose_best_model(x_train, y_train)


Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best R² score: 0.7594453266046857


In [100]:
#Prediction and evaluation

def evaluate_model(model, x_test, y_test):
    predict = model.predict(x_test)
    mse = mean_squared_error(y_test,predict)
    mae = mean_absolute_error(y_test, predict)
    score  = model.score(x_test,y_test)
    

    return mse, mae,  score

mse,mae, score = evaluate_model(trained_model, x_test, y_test)

Smse = np.sqrt(mse)
Smse = round(Smse, 2)
mae = round(mae, 2)




print(f'mean_squared_error: {Smse:,}')
print(f'mean_absolute_error: {mae:,}')
print(f"score: {score*100:.2f}%  accuracy")


mean_squared_error: 135,066.03
mean_absolute_error: 56,502.19
score: 67.67%  accuracy
