In [105]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

df = pd.read_csv('rieltor_subways_formatted_2.csv')

df.dropna(subset=['Distance to subway in (m)'], inplace=True)

# df = df[df['rooms'] <= 3]
# df = df[df['price'] <= 200000]

print(len(df))

#Linear regression

df['flat_area'] = df['flat_area'].astype(int)
df['price'] = df['price'].astype(int)
df['rooms'] = df['rooms'].astype(int)
df['Distance to subway in (m)'] = df['Distance to subway in (m)'].astype(float)

bins = [0, 2000, 7000, 15000, 50000, 200000, float('inf')]
# labels = ['very modest', 'modest', 'simple', 'medium', 'high', 'lux']
labels = [1, 2, 3, 4, 5, 6]

# use pd.cut() to create a new column of categories
df['condition'] = pd.cut(df['price'], bins=bins, labels=labels)

# Find the region with the most number of most expensive flats
most_expensive_price = df['price'].max()
most_expensive_flats = df[df['price'] == most_expensive_price]
best_region = most_expensive_flats['district'].value_counts().idxmax()


# print(df.head(5))

label_encoder = LabelEncoder()
df['region_name_encoded'] = label_encoder.fit_transform(df['district'])

X = df[['flat_area', 'region_name_encoded', 'rooms', 'Distance to subway in (m)', 'condition', 'region_rank']]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
score = model.score(X_test, y_test)

print(score)
print('Mean Squared Error (MSE):', mse)


3137
0.5876048100051773
Mean Squared Error (MSE): 401788484.0872408


In [106]:
#Lasso of the model

from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=1)

lasso_model.fit(X_train, y_train)

y_pred = lasso_model.predict(X_test)

lasso_score = lasso_model.score(X_test, y_test)

lasso_score

0.5875981581829528

In [100]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1)

ridge_model.fit(X_train, y_train)

y_pred = ridge_model.predict(X_test)

ridge_score = ridge_model.score(X_test, y_test)

ridge_score

0.6250911407027901

In [101]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

score = regressor.score(X_test, y_test)
print("R2 Score:", score)

R2 Score: 0.49535337046711436


In [102]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor()

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

score = regressor.score(X_test, y_test)
print("R2 Score:", score)

R2 Score: 0.7607388502146711


In [103]:
from sklearn.svm import SVR

regressor = SVR()

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

score = regressor.score(X_test, y_test)
print("R2 Score:", score)

R2 Score: -0.1165101328373459


In [104]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

param_grid = {'fit_intercept': [True, False]}

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)  


best_params = grid_search.best_params_
best_model = LinearRegression(**best_params)
best_model.fit(X_train, y_train)  

# Make predictions on the test data
y_pred = best_model.predict(X_test) 

# Evaluate the model
mse = mean_squared_error(y_test, y_pred) 
r2 = r2_score(y_test, y_pred) 

# Print the evaluation metrics
print('Best Hyperparameters:', best_params)
print('Mean Squared Error:', mse)
print('R2 Score:', r2)


Best Hyperparameters: {'fit_intercept': True}
Mean Squared Error: 99362508.70620668
R2 Score: 0.6250822923581209
