In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/final_dataset.csv")

In [3]:
X = df.drop(columns=['price'])
y = df['price']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.15, random_state=0)

In [5]:
print(X_train.shape)
print(y_train.shape)

(8500, 16)
(8500,)


In [6]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge  # Corrected
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline  # Corrected
from sklearn.metrics import r2_score

In [7]:
num_features = ['squareMeters', 'numberOfRooms', 'floors', 'numPrevOwners', 'ageOfHouse']
ordinal_features = ['cityPartRange']  # Ordinal category (1-10)
bool_features = ['hasYard', 'hasPool', 'isNewBuilt', 'hasStormProtector', 
                 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom']
cat_features = ['cityCode']

# Create column transformer
column_trans = make_column_transformer(
    (StandardScaler(), num_features),  # Scale numerical features
    (OrdinalEncoder(), ordinal_features),  # Ordinal encoding for cityPartRange
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_features),  # One-hot encode categorical
    remainder='passthrough'  # Leave boolean features as they are (0/1)
)

In [8]:
# Define the model
model = LinearRegression()

# Create a full pipeline
pipeline = make_pipeline(column_trans, model)

In [9]:
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [10]:
y_pred = pipeline.predict(X_test)

In [11]:
y_pred

array([1405206.93258827,  503976.35343573, 6276306.89555466, ...,
       7593973.13320032, 9104437.17084057, 1830210.45031316],
      shape=(1500,))

In [12]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.999999521893989

In [13]:
lasso = Lasso()
scaler = StandardScaler()


In [14]:
pipe = make_pipeline(column_trans, scaler, lasso)
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.9999991106223592

In [18]:
print(pipe.named_steps)

{'columntransformer': ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 ['squareMeters', 'numberOfRooms', 'floors',
                                  'numPrevOwners', 'ageOfHouse']),
                                ('ordinalencoder', OrdinalEncoder(),
                                 ['cityPartRange']),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse_output=False),
                                 ['cityCode'])]), 'standardscaler': StandardScaler(), 'lasso': Lasso()}


In [19]:
print("Lasso coefficients:", pipe.named_steps['lasso'].coef_)

Lasso coefficients: [ 2.88436887e+06  3.71615712e+01  1.55963770e+03 ...  2.73278680e+01
 -5.13278808e+00  2.34150243e+00]


In [None]:
ridge = Ridge()
pipeR = make_pipeline(column_trans, scaler, lasso)
pipeR.fit(X_train, y_train)
y_pred_ridge = pipeR.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.9999991106223592

In [20]:
print("Train R2 Score:", r2_score(y_train, pipeline.predict(X_train)))
print("Test R2 Score:", r2_score(y_test, pipeline.predict(X_test)))

Train R2 Score: 0.9999999820246553
Test R2 Score: 0.999999521893989


In [22]:
feature_names = (
    pipe.named_steps['columntransformer']
    .get_feature_names_out()
)

feature_importance = dict(zip(feature_names, pipe.named_steps['lasso'].coef_))

# Sort by absolute importance
sorted_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)

In [23]:
for feature, coef in sorted_features[:10]:
    print(f"{feature}: {coef:.2f}")

standardscaler__squareMeters: 2884368.87
standardscaler__floors: 1559.64
remainder__hasYard: 211.00
remainder__hasPool: 209.83
ordinalencoder__cityPartRange: 171.73
onehotencoder__cityCode_16604: 103.44
onehotencoder__cityCode_54430: 99.91
onehotencoder__cityCode_20234: 99.64
onehotencoder__cityCode_13097: 99.34
onehotencoder__cityCode_29754: 99.11


In [24]:
from sklearn.model_selection import cross_val_score

train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
cv_score = cross_val_score(pipe, X, y, cv=5).mean()

print(f"Train R²: {train_score:.3f}")
print(f"Test R²: {test_score:.3f}")
print(f"Cross-Validation R²: {cv_score:.3f}")

Train R²: 1.000
Test R²: 1.000
Cross-Validation R²: 1.000


In [25]:
data = pd.DataFrame(X, columns=feature_names)
data["price"] = y 

correlation_matrix = data.corr()["price"].sort_values(ascending=False)
print(correlation_matrix)

price                            1.0
standardscaler__squareMeters     NaN
standardscaler__numberOfRooms    NaN
standardscaler__floors           NaN
standardscaler__numPrevOwners    NaN
                                ... 
remainder__basement              NaN
remainder__attic                 NaN
remainder__garage                NaN
remainder__hasStorageRoom        NaN
remainder__hasGuestRoom          NaN
Name: price, Length: 8155, dtype: float64
