# Immoweb data analysis 

### Importing libraries

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

### Load dataset

In [9]:
# Read the listings CSV (path is relative to the notebook's working directory)
# into the working DataFrame used by every later cell.
data_path = 'data/data_20240313_modified_minus1.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

### Remove outliers from numeric features (IQR rule)

In [10]:
# Columns that are screened for outliers, in the order they are filtered.
features_to_check = [
    'price',
    'surface_land_sqm',
    'total_area_sqm',
    'nbr_bedrooms',
]

# IQR-based outlier filter for a single column
def remove_outliers(df, feature, k=1.5):
    """Return the rows of ``df`` whose ``feature`` value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1``/``Q3``
    are the 25th/75th percentiles of ``df[feature]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    feature : str
        Name of the column the fences are computed on.
    k : float, default 1.5
        Fence multiplier. The default reproduces the previously hard-coded
        value of 1.5; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``feature`` value is inside the fences.
        Rows where ``feature`` is NaN are dropped as well, because a NaN
        never satisfies the range comparison.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``df``.
    """
    column = df[feature]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # `between` is inclusive on both ends by default, i.e. >= lower and <= upper.
    return df[column.between(lower_bound, upper_bound)]

# Filter one feature after another. Each pass computes its IQR bounds on the
# rows that survived the previous passes, so the result depends on the order
# of `features_to_check`, and re-running this cell filters `df` again.
for feature in features_to_check:
    df = remove_outliers(df=df, feature=feature)

### One-hot encode categorical variables

In [11]:
# Replace the categorical columns with indicator (dummy) columns. The first
# level of each category is dropped to avoid the dummy variable trap.
df = pd.get_dummies(data=df, drop_first=True)


### Split the data into training and testing sets

In [12]:
# 'price' is treated as the target; every other column is a predictor.
y = df['price']
X = df.drop(columns='price')
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Initialize the GBM model

In [13]:
# Gradient-boosted trees: 100 boosting stages, trees up to depth 5,
# shrinkage of 0.1, and a fixed seed so repeated runs give the same model.
gbm_model = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

### Train the model

In [14]:
# Fit on the training split only; the test split stays unseen until evaluation.
gbm_model.fit(X=X_train, y=y_train)

### Make predictions on the test set

In [15]:
# Predicted prices for the held-out rows.
predictions = gbm_model.predict(X=X_test)

### Calculate Evaluation Metrics

In [16]:
# Score the held-out predictions with the standard regression metrics.
mse = mean_squared_error(y_test, predictions)
# RMSE is derived from the MSE instead of calling
# `mean_squared_error(..., squared=False)`: the `squared` keyword was
# deprecated in scikit-learn 1.4 and is slated for removal in 1.6, so the old
# call fails on current releases. For a single target the two are equivalent.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Print the metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R2): {r2}')

Mean Squared Error (MSE): 4149535020.2383494
Root Mean Squared Error (RMSE): 64416.884589666006
Mean Absolute Error (MAE): 46169.193819367145
R-squared (R2): 0.7511648159441869


