In [10]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# Load the listings dataset from the CSV in the working directory.
df = pd.read_csv('rieltor_subways_new2.csv')

# Optional outlier filters, currently disabled:
# df = df[df['rooms'] <= 3]
# df = df[df['price'] <= 200000]

print(len(df))

# --- Linear regression baseline ---

# Cast the numeric columns to plain integers.
for numeric_col in ('flat_area', 'price', 'rooms'):
    df[numeric_col] = df[numeric_col].astype(int)

# print(df.head(5))

# Integer-encode the district names.
# NOTE(review): 'region_name_encoded' is not part of the feature list below,
# so it currently has no effect on the model — confirm whether it was meant
# to be included.
label_encoder = LabelEncoder()
df['region_name_encoded'] = label_encoder.fit_transform(df['district'])

# Feature matrix and target.
feature_columns = ['flat_area', 'rooms', 'distance_category', 'code type', 'prestigious']
X = df[feature_columns]
y = df['price']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Fit ordinary least squares and predict on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics: the estimator's own score and the mean squared error.
score = model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print(score)
print('Mean Squared Error (MSE):', mse)


5633
0.5944251034213837
Mean Squared Error (MSE): 62451421.53013566


In [16]:
# Lasso regression on the same train/test split

from sklearn.linear_model import Lasso

# Fit a Lasso model with alpha=1 on the current training split.
lasso_model = Lasso(alpha=1).fit(X_train, y_train)

# Predictions for the held-out rows (kept in y_pred like the other cells).
y_pred = lasso_model.predict(X_test)

# Score on the held-out split; the bare name on the last line makes the
# notebook display the value.
lasso_score = lasso_model.score(X_test, y_test)
lasso_score

0.5951561667857158

In [17]:
from sklearn.linear_model import Ridge

# Fit a Ridge model with alpha=1 on the current training split.
ridge_model = Ridge(alpha=1).fit(X_train, y_train)

# Predictions for the held-out rows (kept in y_pred like the other cells).
y_pred = ridge_model.predict(X_test)

# Score on the held-out split; the bare name on the last line makes the
# notebook display the value.
ridge_score = ridge_model.score(X_test, y_test)
ridge_score

0.5951518195493385

In [18]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree fitted on the current training split.
# A fixed random_state is passed because scikit-learn randomly permutes the
# candidate features at each split; without a seed the fitted tree (and the
# score printed below) can change from one run to the next.
regressor = DecisionTreeRegressor(random_state=1)

regressor.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = regressor.predict(X_test)

# Score on the held-out split.
score = regressor.score(X_test, y_test)
print("R2 Score:", score)

R2 Score: 0.44631660933976713


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Random forest fitted on the current training split.
# random_state is fixed so that the forest, the spot-check predictions and the
# cross-validation scores below are reproducible after a kernel restart.
regressor = RandomForestRegressor(
    n_estimators=150,
    max_depth=12,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=1,
)

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

# Spot-check a few hand-written listings.
# Expected column order:
# ['flat_area', 'rooms', 'distance_category', 'code type', 'prestigious']
# The rows are wrapped in a DataFrame carrying the training column names so
# the model receives the same feature names it was fitted with (a bare list
# drops them and triggers a feature-name warning on recent scikit-learn).
sample_listings = pd.DataFrame(
    [
        [80, 2, 1, 1, 1],
        [160, 3, 3, 5, 3],
        [100, 2, 3, 1, 1],
        [50, 2, 2, 1, 1],
    ],
    columns=X_train.columns,
)
# reshape(-1, 1) keeps the one-element-array-per-line output format.
for sample_prediction in regressor.predict(sample_listings).reshape(-1, 1):
    print(sample_prediction)

# Held-out error; previously computed but never displayed.
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error (MSE):', mse)

# 5-fold cross-validation on the training split only.
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print("Cross-validated R2 scores:", scores)
print("Mean R2 score:", scores.mean())



[6277.09959054]
[39362.665585]
[6474.96142708]
[5020.17310933]
54799098.28374712
Cross-validated R2 scores: [0.65468399 0.66953896 0.62450001 0.66403635 0.62647917]
Mean R2 score: 0.6478476990088741


In [13]:
from sklearn.svm import SVR

# Support-vector regression with the library's default settings, fitted on the
# current training split.
# NOTE(review): the features are passed in unscaled and the recorded score for
# this cell is negative — consider whether scaling/tuning was intended.
regressor = SVR().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = regressor.predict(X_test)

# Score on the held-out split.
score = regressor.score(X_test, y_test)
print("R2 Score:", score)

R2 Score: -0.1091736553838838


In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Re-split the data 80/20.
# NOTE(review): this split uses random_state=42 whereas the earlier split used
# random_state=1, so metrics computed after this cell are measured on a
# different test set — confirm this is intended before comparing models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

# The only hyperparameter searched is whether an intercept term is fitted.
param_grid = {'fit_intercept': [True, False]}

# 5-fold cross-validated grid search on the training split.
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# GridSearchCV refits the winning configuration on the whole training split by
# default (refit=True), so reuse that estimator instead of constructing and
# fitting a second, identical model.
best_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print('Best Hyperparameters:', best_params)
print('Mean Squared Error:', mse)
print('R2 Score:', r2)


Best Hyperparameters: {'fit_intercept': True}
Mean Squared Error: 64343227.06591935
R2 Score: 0.5784104078382445


In [31]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# NOTE(review): the CSV is re-read here, but nothing in this cell uses `df`;
# the model is trained on X_train / y_train left over from an earlier cell.
# The reload is kept in case later cells rely on the raw frame — confirm and
# remove if not.
df = pd.read_csv('rieltor_subways_new2.csv')

# Define parameter grid (3 x 3 x 3 x 3 = 81 combinations)
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8]
}

# Train XGBoost model with grid search: 5-fold CV, candidates ranked by
# negative mean squared error.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

grid_search = GridSearchCV(
    xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)

grid_search.fit(X_train, y_train)

# Evaluate model with best parameters
best_params = grid_search.best_params_
# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True). Reusing best_estimator_ avoids a redundant second fit
# and keeps the explicit objective set on xgb_model, which a fresh
# XGBRegressor(**best_params) would not carry over.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Best parameters:', best_params)
print('R-squared:', r2)
print('Mean Squared Error (MSE):', mse)


Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.7}
R-squared: 0.6646918371336576
Mean Squared Error (MSE): 51631576.80199824
