### **1. Import packages**

In [10]:
import pickle

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

### **2. Load data**

In [None]:
# Location of the house dataset, relative to the notebook's working directory.
DATA_PATH = './NY-House-Dataset.csv'

# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

# Show the first rows as the cell output.
df.head()

In [None]:
# Columns to summarize (the target PRICE is included here on purpose).
features = ['BEDS', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE', 'PRICE']

# Print the max, min and mean of each column, one line per column.
for feature in features:
    column = df[feature]
    print(f'Feature: {feature}, Max: {column.max()}, Min: {column.min()}, Mean: {column.mean()}')

### **3. Show information about data**

In [None]:
# Print pandas' structural summary of `df` (DataFrame.info).
df.info()

### **4. Data preprocessing**

In [14]:
# Drop rows whose PRICE lies 3 or more standard deviations from the mean.
#
# nan_policy='omit' keeps a missing PRICE from turning every z-score into NaN:
# with the default 'propagate', a single NaN would make the whole mask False
# and leave df_no_outliers empty. Rows with a missing PRICE still receive a
# NaN z-score, fail the comparison below, and are therefore dropped as well.
z_scores = stats.zscore(df['PRICE'], nan_policy='omit')
threshold = 3
df_no_outliers = df[np.abs(z_scores) < threshold]

In [15]:
# Column to predict and the columns used as model inputs.
target = 'PRICE'
features = ['BEDS', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE']

# Build the input matrix and target vector from the outlier-filtered frame.
X, y = df_no_outliers[features], df_no_outliers[target]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both partitions.
print(f'X_train, y_train shape: {X_train.shape} , {y_train.shape}')
print(f'X_test, y_test shape: {X_test.shape}, {y_test.shape}')

### **5. Define the model**

In [17]:
# Random forest with 100 trees; random_state is fixed for reproducible fits.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

#### **Training Model**

In [None]:
# Fit the regressor on the training split.
rf_regressor.fit(X_train, y_train)

#### **Prediction and evaluation**

In [None]:
# Predict prices for the held-out test split.
y_pred = rf_regressor.predict(X_test)

# Evaluate the model.
# root_mean_squared_error already returns the RMSE, so it must not be passed
# through np.sqrt again; the MSE is recovered by squaring it (exact for a
# single-target regression such as this one).
rmse = root_mean_squared_error(y_test, y_pred)
mse = rmse ** 2
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R2 Score: {r2}')

#### **Saving the model**

In [20]:
# Serialize the fitted regressor to disk (pickle is imported in the top
# import cell, together with the notebook's other dependencies).
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# 'rf_regressor.pkl' from a trusted source.
with open('rf_regressor.pkl', 'wb') as model_file:
    pickle.dump(rf_regressor, model_file)