## Business case
### Help stakeholders make better financial decisions about selling and renovating houses with the potential for big investment returns, using data analysis to set up a forecasting model. Model used: KNN.

## Import Cleaned data 

In [22]:
import warnings

# NOTE(review): this silences every warning for the rest of the notebook, not just
# one specific noisy warning. Consider narrowing the filter (by category or module)
# so that library warnings about misconfiguration stay visible.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [23]:

# train_test_split partitions the data into a training set and a hold-out test set
from sklearn.model_selection import train_test_split
# Seed handed to train_test_split so the train/test partition is reproducible
RSEED = 12

In [24]:
# Load the cleaned modelling table and preview its first rows.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns, which look
# like index columns written out by earlier to_csv calls — confirm, and consider
# dropping them where the file is produced.
model_data_path = 'data/model_data.csv'
data_model = pd.read_csv(model_data_path)
data_model.head()

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,lat,long,sqft_living15,sqft_lot15,sold_year,sold_month,sold_day,new_renovate,Age_house,new_id
0,0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,,...,47.5112,-122.257,1340,5650,2014,10,0,0,61,1
1,1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,...,47.721,-122.319,1690,7639,2014,12,1,1,65,1
2,2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0.0,...,47.7379,-122.233,2720,8062,2015,2,2,2,83,1
3,3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0.0,...,47.5208,-122.393,1360,5000,2014,12,1,0,51,1
4,4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0.0,...,47.6168,-122.045,1800,7503,2015,2,2,0,29,1


## Split Data into Training and Test Sets

In [25]:
# Build the feature matrix and the target, then hold out 25% of the rows for testing.
import sklearn.model_selection

# Predictors kept for the model: the author notes a good correlation between price
# and sqft_living, bedrooms, Age_house and zipcode.
# NOTE(review): zipcode is presumably a categorical code but is used here as a plain
# number, so a distance-based model treats numerically close zip codes as similar —
# confirm this is intended.
feature_columns = ['sqft_living', 'bedrooms', 'Age_house', 'zipcode']
target_column = 'price'

X = data_model[feature_columns]
y = data_model[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RSEED,
)

## Scaling and Normalization

In [26]:
# Standardize the features: the scaling statistics are learned from the training
# split only, and that same transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modeling 
### Model name : KNN 

In [27]:
# Baseline model: 5-nearest-neighbour regressor using Euclidean distance,
# fitted on the standardized training features.
from sklearn.neighbors import KNeighborsRegressor

baseline_params = {'n_neighbors': 5, 'metric': 'euclidean'}
knn = KNeighborsRegressor(**baseline_params)
knn.fit(X_train_scaled, y_train)


## Prediction

In [28]:
# Predict on both splits with the fitted model; the training predictions are kept
# so that train and test error can be compared in the evaluation step.
y_pred_train = knn.predict(X_train_scaled)
y_pred_test = knn.predict(X_test_scaled)

## Evaluation Model

In [38]:
# Print the train/test metrics for the current predictions (the recorded output
# shows RMSE and R2 for each split).
# NOTE(review): y_pred_train / y_pred_test are reassigned further down by the tuned
# model. Run the notebook top to bottom; if this cell is executed after the tuning
# section it reports the tuned model's metrics under this heading.
from function import calculate_metrics

heading = "Results of scale standarization for train data and test data by KNN Model  "
separator = "-" * 30
print(heading)
print(separator)
calculate_metrics(y_train, y_pred_train, y_test, y_pred_test)

Results of scale standarization for train data and test data by KNN Model  
------------------------------
Metrics on training data
RMSE: 14412.435
R2: 0.998
------------------------------
Metrics on test data
RMSE: 214640.856
R2: 0.651
------------------------------


### Notice: the RMSE on the training data is much lower than the RMSE on the test data, which means our model may be overfitting and may not generalize well when predicting new, unseen data.

## Error analysis 

In [30]:
import plotly.express as px
import numpy as np

# Residual = true value minus predicted value, computed on the test split
residuals = np.asarray(y_test) - np.asarray(y_pred_test)

# Scatter the residuals against the predictions
axis_labels = {'x': 'Predicted Values', 'y': 'Residuals'}
fig = px.scatter(
    x=y_pred_test,
    y=residuals,
    title='Residual Values vs. Predicted Values',
    labels=axis_labels,
)

# Dashed red zero line spanning the range of the predictions, as a visual reference
pred_lo, pred_hi = min(y_pred_test), max(y_pred_test)
zero_line_style = dict(color='red', dash='dash')
fig.add_shape(type='line', x0=pred_lo, x1=pred_hi, y0=0, y1=0, line=zero_line_style)

fig.update_layout(width=1200, height=600)
fig.show()

In [31]:
import plotly.express as px
import numpy as np

# Scatter the true test values against the model's predictions
fig = px.scatter(
    x=y_pred_test,
    y=y_test,
    title='True Values vs. Predicted Values',
    labels={'x': 'Predicted Values', 'y': 'True Value'},
)

# Degree-1 least-squares fit of the true values on the predicted values
slope, intercept = np.polyfit(y_pred_test, y_test, 1)
regression_line = intercept + slope * y_pred_test

# Overlay the fitted line; its equation is shown in the legend entry
trend_label = f'Linear Regression Line (y={slope:.2f}x + {intercept:.2f})'
fig.add_scatter(
    x=y_pred_test,
    y=regression_line,
    mode='lines',
    line=dict(color='red', dash='dash'),
    name=trend_label,
)
fig.update_layout(width=1200, height=600)
fig.show()

## Tune Hyperparameter 

In [32]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

# Estimator whose hyperparameters are tuned
model = KNeighborsRegressor()

# Evaluation scheme: 10-fold cross-validation repeated 3 times (30 fits per candidate)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search space (hyperparameters for KNN)
# NOTE(review): `algorithm` only selects how the neighbours are looked up, so it is
# not expected to change model quality — searching over it mainly multiplies the
# number of fits by 4. Consider dropping it from the space.
# NOTE(review): the recorded best n_neighbors is 5, the largest value tried here —
# consider widening the range so the optimum is not on the boundary.
space = {
    'n_neighbors': [1, 2, 3, 4, 5],            # Number of neighbors to consider
    'weights': ['uniform', 'distance'],        # Weighting scheme
    'p': [1, 2],                               # Power parameter for the Minkowski metric
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm for nearest neighbors
}

# Every entry of the space is a finite list, so there are only
# 5 * 2 * 2 * 4 = 80 distinct candidates. Requesting more iterations than that cannot
# evaluate anything new (scikit-learn warns and falls back to the full grid), so the
# request is capped at the real grid size instead of a misleading 500.
n_candidates = len(ParameterGrid(space))
n_iter = min(500, n_candidates)

# Define the randomized search
# NOTE(review): candidates are ranked by mean absolute error, while the evaluation
# cells report RMSE and R2 — consider scoring='neg_root_mean_squared_error' if RMSE
# is the metric that matters.
search = RandomizedSearchCV(
    model,
    space,
    n_iter=n_iter,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)

# Execute the search on the standardized training data
result = search.fit(X_train_scaled, y_train)

# Summarize the result (best_score_ is the negated MAE, so closer to 0 is better)
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)


Best Score: -128866.6149657051
Best Hyperparameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 5, 'algorithm': 'ball_tree'}


## Modeling 
### Model name : KNN 

In [33]:
# Train the tuned model. The hyperparameters are read from the search result
# (`result.best_params_`) instead of being retyped by hand, so the fitted model
# cannot drift from the search outcome if the search is changed or re-run.
# This rebinds `knn`, replacing the baseline model fitted earlier.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(**result.best_params_)
knn.fit(X_train_scaled, y_train)

## Prediction

In [34]:
# Predict on both splits with the tuned model. This overwrites the prediction
# arrays produced by the baseline model.
y_pred_train = knn.predict(X_train_scaled)
y_pred_test = knn.predict(X_test_scaled)

## Evaluation Model

In [35]:
# Print the train/test metrics for the tuned model's predictions (the recorded
# output shows RMSE and R2 for each split).
from function import calculate_metrics

heading = "Results of scale standarization for train data and test data by KNN after enter Hyperparameter"
separator = "-" * 30
print(heading)
print(separator)
calculate_metrics(y_train, y_pred_train, y_test, y_pred_test)

Results of scale standarization for train data and test data by KNN after enter Hyperparameter
------------------------------
Metrics on training data
RMSE: 14412.435
R2: 0.998
------------------------------
Metrics on test data
RMSE: 214640.856
R2: 0.651
------------------------------


## Error Analysis

### Notice: the RMSE on the training data is much lower than the RMSE on the test data, which means our model may be overfitting and may not generalize well when predicting new, unseen data.

In [36]:
import plotly.express as px
import numpy as np

# Residual = true value minus predicted value, computed on the test split
residuals = np.asarray(y_test) - np.asarray(y_pred_test)

# Scatter the residuals against the predictions
axis_labels = {'x': 'Predicted Values', 'y': 'Residuals Values'}
fig = px.scatter(
    x=y_pred_test,
    y=residuals,
    title='Residual Values vs. Predicted Values',
    labels=axis_labels,
)

# Dashed red zero line spanning the range of the predictions, as a visual reference
pred_lo, pred_hi = min(y_pred_test), max(y_pred_test)
zero_line_style = dict(color='red', dash='dash')
fig.add_shape(type='line', x0=pred_lo, x1=pred_hi, y0=0, y1=0, line=zero_line_style)

fig.update_layout(width=1200, height=600)
fig.show()

### TODO: Is the pattern in the residual plot a sign of balanced (unbiased) predictions?

In [37]:
import plotly.express as px
import numpy as np

# Scatter the true test values against the tuned model's predictions
fig = px.scatter(
    x=y_pred_test,
    y=y_test,
    title='True Values vs. Predicted Values',
    labels={'x': 'Predicted Values', 'y': 'True Value'},
)

# Degree-1 least-squares fit of the true values on the predicted values
slope, intercept = np.polyfit(y_pred_test, y_test, 1)
regression_line = intercept + slope * y_pred_test

# Overlay the fitted line; its equation is shown in the legend entry
trend_label = f'Linear Regression Line (y={slope:.2f}x + {intercept:.2f})'
fig.add_scatter(
    x=y_pred_test,
    y=regression_line,
    mode='lines',
    line=dict(color='red', dash='dash'),
    name=trend_label,
)
fig.update_layout(width=1200, height=600)
fig.show()