<a href="https://colab.research.google.com/github/BognarAndras/corvinus_ds_ws/blob/main/data_scientist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd


# Locations of the two CSV files on GitHub.
training_data_url = "https://github.com/BognarAndras/corvinus_ds_ws/raw/refs/heads/main/training_data_science.csv"
testing_data_url = "https://github.com/BognarAndras/corvinus_ds_ws/raw/refs/heads/main/testing_data_science.csv"

# Read each CSV straight from its URL into its own DataFrame.
training_data_df = pd.read_csv(training_data_url)
testing_data_df = pd.read_csv(testing_data_url)


In [4]:
# Show the training listings ordered from the most to the least expensive.
training_data_df.sort_values(by="price", ascending=False)

Unnamed: 0,price,size,rooms,from_owner,great_view,under_priced,only_here,town,address,district
142,55000000,30,2,0,0,0,0,Budapest,"22. kerület, (Budafok), Vessző utca 11/a",22
245,35000000,15,1,0,0,0,0,Budapest,"19. kerület, (Kispesti lakótelep), Nagysándor ...",19
58,1050060,115,3,0,0,0,0,Budapest,"5. kerület, (Lipótváros), Szabadság tér",5
54,795500,160,4,0,0,0,0,Budapest,"5. kerület, (Lipótváros), Bajcsy-Zsilinszky út",5
55,795500,140,4,0,0,0,0,Budapest,"5. kerület, (Belváros), Párizsi utca",5
...,...,...,...,...,...,...,...,...,...,...
132,39000,12,1,0,0,0,0,Budapest,"21. kerület, (Csepel – Ady Endre úti lakótelep)",21
214,35000,10,1,0,0,0,0,Budapest,"10. kerület, (Kőbánya – Újhegyi lakótelep), Ta...",10
146,35000,25,1,0,0,0,0,Budapest,"22. kerület, (Budatétény), Jókai Mór utca",22
134,30000,18,1,0,0,0,0,Budapest,"21. kerület, (Csepel-Ófalu), II. Rákóczi Feren...",21


#### Note the very high values. What is the reason?


In [11]:
# 5th, 50th, 95th and 99th percentiles of the training prices.
quartiles = training_data_df.loc[:, 'price'].quantile(q=[0.05, 0.5, 0.95, 0.99])
quartiles

Unnamed: 0,price
0.05,55000.0
0.5,140000.0
0.95,468788.25
0.99,915143.2


In [13]:
# List the training rows whose price lies above the 99th percentile.
p99 = training_data_df['price'].quantile(q=0.99)
above_p99_mask = training_data_df['price'] > p99
training_data_df.loc[above_p99_mask]

Unnamed: 0,price,size,rooms,from_owner,great_view,under_priced,only_here,town,address,district
58,1050060,115,3,0,0,0,0,Budapest,"5. kerület, (Lipótváros), Szabadság tér",5
142,55000000,30,2,0,0,0,0,Budapest,"22. kerület, (Budafok), Vessző utca 11/a",22
245,35000000,15,1,0,0,0,0,Budapest,"19. kerület, (Kispesti lakótelep), Nagysándor ...",19


In [14]:
# Upper cut-off: the 99th percentile of the training prices.
p99 = training_data_df['price'].quantile(0.99)

# Keep only the rows priced strictly below the cut-off.
# The explicit .copy() makes the filtered result an independent DataFrame, so
# adding columns to it in later cells does not trigger SettingWithCopyWarning.
# NOTE: this cell overwrites training_data_df, so re-running it recomputes p99 on
# the already-filtered frame and trims further rows.
training_data_df = training_data_df[training_data_df['price'] < p99].copy()

In [15]:
# Upper cut-off: the 99th percentile of the testing prices (computed on the
# testing frame itself, independently of the training cut-off).
p99t = testing_data_df['price'].quantile(0.99)

# Keep only the rows priced strictly below the cut-off.
# The explicit .copy() makes the filtered result an independent DataFrame, so
# adding columns to it in later cells does not trigger SettingWithCopyWarning.
# NOTE: this cell overwrites testing_data_df, so re-running it recomputes p99t on
# the already-filtered frame and trims further rows.
testing_data_df = testing_data_df[testing_data_df['price'] < p99t].copy()

#### Modelling steps

1. Feature selection
2. Feature cleaning, enhancement
3. Model selection

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# Step 1: create target variable (price divided by size).
# .assign returns a new DataFrame instead of writing a column into the frames
# produced by the boolean-mask filtering above, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
training_data_df = training_data_df.assign(
    price_per_m2=training_data_df['price'] / training_data_df['size']
)
testing_data_df = testing_data_df.assign(
    price_per_m2=testing_data_df['price'] / testing_data_df['size']
)

# Step 2: define features and target
features = ['size', 'rooms', 'district', 'great_view', 'from_owner', 'under_priced', 'only_here']
target = 'price_per_m2'

X_train = training_data_df[features]
y_train = training_data_df[target]

X_test = testing_data_df[features]
y_test = testing_data_df[target]

# Step 3: preprocess categorical features
categorical_features = ['district', 'great_view', 'from_owner', 'under_priced', 'only_here']  # all categorical
numeric_features = ['size', 'rooms']  # numeric features (reach the model via remainder='passthrough')

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories unseen during fit do not raise at predict time
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    # every column not listed above is forwarded to the regressor unchanged
    remainder='passthrough'
)

# Step 4: build pipeline with regressor (fixed random_state for reproducible forests)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Step 5: train the model on early data
model.fit(X_train, y_train)

# Step 6: predict on late/testing data
y_pred = model.predict(X_test)

# Step 7: evaluate
print("R² score on test data:", r2_score(y_test, y_pred))
print("Mean Absolute Error on test data:", mean_absolute_error(y_test, y_pred))

# Step 8: estimate overall market price change
# Ratio of the mean observed test target to the mean prediction made by the
# model trained on the training data, expressed as a percentage change.
market_change_pct = (y_test.mean() / y_pred.mean() - 1) * 100
print(f"Estimated market price change: {market_change_pct:.2f}%")


R² score on test data: -0.0033081574588906815
Mean Absolute Error on test data: 6160.273480515005
Estimated market price change: 218.21%


#### Mason, what do the numbers mean?


In [17]:
# 5th, 50th, 95th and 99th percentiles of the training target.
training_data_df.loc[:, 'price_per_m2'].quantile(q=[0.05, 0.5, 0.95, 0.99])

Unnamed: 0,price_per_m2
0.05,1195.378151
0.5,2230.769231
0.95,4563.177372
0.99,5484.785175


#### What could be the problem? Is it the model?

In [19]:
# Imports used by this cell. LinearRegression and numpy are not imported by any
# earlier cell, so without these lines the cell raises NameError on a fresh kernel.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Features and target
features = ['size', 'rooms', 'district', 'great_view', 'from_owner', 'under_priced', 'only_here']
target = 'price_per_m2'

X_train = training_data_df[features]
y_train = training_data_df[target]

X_test = testing_data_df[features]
y_test = testing_data_df[target]

# Preprocessing for categorical features
categorical_features = ['district', 'great_view', 'from_owner', 'under_priced', 'only_here']
numeric_features = ['size', 'rooms']  # reach the model via remainder='passthrough'

preprocessor = ColumnTransformer(
    transformers=[
        # drop='first' omits the first category of each feature from the encoding;
        # handle_unknown='ignore' keeps unseen categories from raising at predict time.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ],
    # every column not listed above is forwarded to the regressor unchanged
    remainder='passthrough'
)

# Build pipeline with Linear Regression
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train on early data
model.fit(X_train, y_train)

# Predict on late data
y_pred = model.predict(X_test)

# Evaluate
print("R² on test data:", r2_score(y_test, y_pred))
print("MAE on test data:", mean_absolute_error(y_test, y_pred))

# Feature relevance (coefficients)
# Read the names from the fitted step inside the pipeline so they are guaranteed
# to line up with the coefficients of the regressor trained in the same pipeline.
feature_names = model.named_steps['preprocessor'].get_feature_names_out()
coefs = model.named_steps['regressor'].coef_

# Sort by absolute coefficient size, largest first.
coef_df = pd.DataFrame({'feature': feature_names, 'beta': coefs})
coef_df = coef_df.sort_values(by='beta', key=np.abs, ascending=False)
print(coef_df)


R² on test data: -0.003342758736341489
MAE on test data: 6141.995741401457
                feature         beta
13     cat__district_19 -1336.690461
3       cat__district_5  1156.536501
12     cat__district_18  -898.852398
20     cat__only_here_1  -882.718725
14     cat__district_20  -878.785331
2       cat__district_4  -864.314808
17    cat__great_view_1   803.524806
8      cat__district_10  -801.275721
0       cat__district_2   656.873308
16     cat__district_22  -625.192094
19  cat__under_priced_1   526.979115
15     cat__district_21  -491.445935
10     cat__district_12   313.225128
9      cat__district_11  -305.408416
18    cat__from_owner_1   274.377295
6       cat__district_8  -211.249944
11     cat__district_14  -206.952886
5       cat__district_7  -166.320496
7       cat__district_9  -156.670270
4       cat__district_6   155.516675
22     remainder__rooms    85.235030
1       cat__district_3    23.808182
21      remainder__size    -5.773537




#### Data!

In [25]:
# Average price_per_m2 for each district, highest first.
(
    training_data_df
    .groupby('district', as_index=False)['price_per_m2']
    .mean()
    .sort_values('price_per_m2', ascending=False)
)

Unnamed: 0,district,price_per_m2
4,5,3896.785142
1,2,3385.820207
5,6,2900.368836
0,1,2852.607821
11,12,2839.851126
2,3,2711.6202
8,9,2629.460744
6,7,2504.889903
12,14,2498.715835
7,8,2464.844654


In [35]:
# Number of training rows in each district, largest group first.
(
    training_data_df
    .groupby('district')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)


Unnamed: 0,district,count
10,11,19
3,4,17
8,9,17
9,10,17
5,6,16
4,5,16
11,12,16
0,1,15
1,2,15
2,3,15
