# Linear Regression with pandas and scikit-learn

In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('./data/ml_ready_real_estate_data_with_price_per_m2.csv')
# print(df.head())
print(df.shape)
X = df[['bedroomCount', 'bathroomCount', 'habitableSurface', 'toiletCount', 
        'postCode', 'hasGarden_encoded', 'hasTerrace_encoded','hasSwimmingPool_encoded',
         'hasBasement_encoded', 'hasLift_encoded', 'price_per_m2']]   # independent variables
y = df['price'] # dependent variable   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("Feature names:", X.columns.tolist())


(76103, 29)
Mean Squared Error: 72659545553.583
R² Score: 0.5755234021720265
Intercept: -342343.7561373648
Coefficients: [ 6.71749158e+04  1.25134914e+05  8.55555287e+01  5.81614532e+04
 -4.34663011e+00  1.03593230e+04  1.99972184e+04  3.40827879e+05
 -1.65525831e+02 -3.93486454e+04  1.03852763e+02]
Feature names: ['bedroomCount', 'bathroomCount', 'habitableSurface', 'toiletCount', 'postCode', 'hasGarden_encoded', 'hasTerrace_encoded', 'hasSwimmingPool_encoded', 'hasBasement_encoded', 'hasLift_encoded', 'price_per_m2']


# Data Engineering

In [41]:
import pandas as pd
df_original = pd.read_csv("./data/cleaned_real_estat_data_with_price_per_m2.csv", delimiter=";")
print(df_original.shape)
df = pd.read_csv("./data/cleaned_real_estat_data_with_price_per_m2.csv", on_bad_lines='skip')
print(df.shape)

(76103, 36)
(76073, 1)
