# Immoweb data analysis 

### Importing libraries, loading data, and preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset from CSV
data = pd.read_csv('data/data_20240313_modified_2.csv')

# Tukey fences on price: values further than 1.5 * IQR outside the
# first/third quartile are treated as outliers
Q1, Q3 = (data['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose price lies within the fences (both ends inclusive);
# rows with a missing price fail the comparison and are dropped as well
filtered_data = data[data['price'].between(lower_bound, upper_bound)]

# Features are every column except price; price is the regression target
X = filtered_data.drop(columns=['price'])
y = filtered_data['price']

# Partition the feature columns by dtype. Only int64/float64 and object
# columns are selected; ColumnTransformer's default remainder='drop' means
# columns of any other dtype never reach the model.
# NOTE(review): confirm X has no bool/category/datetime columns that should be used.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include='object').columns

# Numeric branch: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


### Building the Model
Now that preprocessing is set up, integrate this with a linear regression model in a pipeline.

In [2]:
# End-to-end estimator: preprocessing followed by ordinary least squares
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])


### Splitting the Data and Model Training
Split the data into training and testing sets, then train the model.

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the regressor on the training portion only
model.fit(X_train, y_train)

### Model Evaluation
Evaluate the model on the test set using Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), and R-squared.

In [4]:

# Predict on the held-out test set and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, so that call fails on current releases.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 10457119094.387495
Root Mean Squared Error: 102260.05620176186
Mean Absolute Error: 75646.52803579994
R^2 Score: 0.4943831379149993


