In [1]:
import re

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats


In [2]:
# Load the listings dataset from the project-relative data folder.
immo_dataset = pd.read_csv('data/fulldata.csv')

# Preview the first rows as a quick sanity check on the load.
immo_dataset.head()


Unnamed: 0,locality,postal_code,region,property_type,subtype_property,price,contruction_year,type_of_sale,number_of_rooms,living_area,...,has_open_fire,has_terrace,terrace_area,has_garden,garden_surface,habitable_surface,plot_land_surface,number_of_facades,has_swimming_pool,building_state
0,Deinze,9800,Flanders,APARTMENT,APARTMENT,395000.0,2023.0,residential_sale,2,22.0,...,0,1,12.0,1,,99.0,,4.0,0,AS_NEW
1,Halle,1500,Flanders,HOUSE,HOUSE,385000.0,,residential_sale,3,,...,0,0,,0,,158.0,,,0,
2,Antwerp,2060,Flanders,APARTMENT,APARTMENT,140000.0,,residential_sale,1,,...,0,1,8.0,0,,44.0,,,0,
3,Antwerp,2018,Flanders,APARTMENT,APARTMENT,215000.0,,residential_sale,2,,...,0,0,,0,,75.0,,,0,
4,Edegem,2650,Flanders,APARTMENT,APARTMENT,349900.0,,residential_sale,2,,...,0,1,,0,,103.0,,,0,


In [3]:
# Summarise column names, non-null counts and dtypes of the loaded data.
immo_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11243 entries, 0 to 11242
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   locality               11243 non-null  object 
 1   postal_code            11243 non-null  object 
 2   region                 11171 non-null  object 
 3   property_type          11243 non-null  object 
 4   subtype_property       11243 non-null  object 
 5   price                  11242 non-null  float64
 6   contruction_year       2406 non-null   float64
 7   type_of_sale           11243 non-null  object 
 8   number_of_rooms        11243 non-null  int64  
 9   living_area            3438 non-null   float64
 10  kitchen_fully_equiped  3438 non-null   float64
 11  is_furnished           11243 non-null  int64  
 12  has_open_fire          11243 non-null  int64  
 13  has_terrace            11243 non-null  int64  
 14  terrace_area           4678 non-null   float64
 15  ha

Step 1: Data Cleaning


Remove spaces from all columns.

In [18]:
# Show only the text (object-dtype) columns, i.e. the ones the whitespace clean-up applies to.
immo_dataset.select_dtypes(['object'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11243 entries, 0 to 11242
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   locality          11243 non-null  object
 1   postal_code       11243 non-null  object
 2   region            11171 non-null  object
 3   property_type     11243 non-null  object
 4   subtype_property  11243 non-null  object
 5   type_of_sale      11243 non-null  object
 6   building_state    3030 non-null   object
dtypes: object(7)
memory usage: 615.0+ KB


In [None]:
# Strip leading/trailing whitespace from every text (object-dtype) column.
immo_dataset_space_removed = immo_dataset.apply(
    lambda col: col.str.strip() if col.dtype == "object" else col
)

# Every later cell reads `immo_dataset`, so the stripped values must be written
# back to it; previously the stripped frame was computed and then discarded.
# A copy is used so that later edits to `immo_dataset` do not leak into
# `immo_dataset_space_removed`.
immo_dataset = immo_dataset_space_removed.copy()

# Preview only a few rows instead of dumping the whole frame.
print(immo_dataset_space_removed.head())

# Postal codes may also contain spaces in the middle of the value; remove those
# too. `regex=False` makes the literal-space replacement explicit.
immo_dataset['postal_code'] = immo_dataset['postal_code'].str.replace(' ', '', regex=False)
print(immo_dataset['postal_code'].unique())

Remove letters from postal_code.


In [None]:
# Remove every alphabetic character from the postal codes in a single pass.
# The previous version only removed the hard-coded pairs 'MK', 'AN', 'KA' and
# 'AG', so any other letters would have survived this cleaning step.
immo_dataset['postal_code'] = immo_dataset['postal_code'].str.replace(
    r'[A-Za-z]+', '', regex=True
)

We drop the following columns from the main dataset because they have a high percentage of missing values.

In [None]:
# Share of missing values per column, expressed as a percentage.
missing_percentage = immo_dataset.isna().mean().mul(100)
print(missing_percentage)

In [None]:
# Columns removed from the analysis. Per the notebook's narrative they are
# dropped for their high share of missing values; NOTE(review): 'locality' has
# no missing values in the info() output, so it is presumably dropped for a
# different reason -- confirm.
drop_columns = [
    'locality',
    'contruction_year',
    'living_area',
    'kitchen_fully_equiped',
    'terrace_area',
    'garden_surface',
    'plot_land_surface',
    'number_of_facades',
    'building_state',
]

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
immo_dataset = immo_dataset.drop(columns=drop_columns, errors='ignore')
immo_dataset.head(10)


In [None]:
# Boolean mask flagging rows that exactly repeat an earlier row.
duplicate_rows = immo_dataset.duplicated()

# The flagged rows themselves, for inspection.
duplicate_data = immo_dataset.loc[duplicate_rows]
print(duplicate_data)

In [None]:
# Replace the current index with a fresh 0..n-1 range; the old labels are discarded.
immo_dataset = immo_dataset.reset_index(
    drop=True,
)

In [None]:
# Remove exact duplicate rows and renumber the index in the same step.
# Removing rows leaves gaps in the index, so resetting it only *before* this
# step does not give a contiguous index; `ignore_index=True` does, and rebinding
# avoids the in-place mutation.
immo_dataset = immo_dataset.drop_duplicates(ignore_index=True)

# Persist the cleaned dataset (the index is written as the first CSV column).
immo_dataset.to_csv('immo_clean1.csv')

In [None]:
# Re-check column counts and dtypes after removing duplicates.
immo_dataset.info()

In [None]:
# Boolean frame marking every missing entry.
empty_values = immo_dataset.isna()

# Number of missing entries in each column.
empty_count = empty_values.sum()

# Report only the columns that have at least one missing entry.
print("Columns with empty values:")
print(empty_count.loc[empty_count.gt(0)])

Step 2: Data Analysis

In [None]:
# Dimensions of the dataset as (rows, columns).
immo_dataset.shape

In [None]:
# Column names, non-null counts and dtypes of the dataset used for the analysis.
immo_dataset.info()


In [None]:
# Percentage of missing values in each remaining column.
missing_percentage = immo_dataset.isna().mean().mul(100)
print(missing_percentage)

In [None]:
# Pearson correlation between the listing price and the habitable surface.
correlation_coefficient = immo_dataset['price'].corr(
    other=immo_dataset['habitable_surface'],
    method='pearson',
)
print(correlation_coefficient)

Find correlation between price and other variables

In [None]:
# Columns that take part in the correlation analysis.
numeric_columns = ['postal_code', 'price', 'number_of_rooms',
                   'is_furnished', 'has_open_fire', 'has_terrace',
                   'has_garden', 'habitable_surface', 'has_swimming_pool']

categorical_columns = ['region', 'property_type', 'subtype_property', 'type_of_sale']

# Build the frame to correlate:
#  * restrict it to the columns listed above, so stray text columns cannot
#    break corr();
#  * postal_code is stored as text (object dtype), so coerce it to numbers --
#    otherwise corr() drops it or raises, depending on the pandas version.
#    Values that are not purely numeric become NaN and are skipped by corr();
#  * one-hot encode the categorical columns as floats so they can be correlated.
encoded_data = pd.get_dummies(
    immo_dataset[numeric_columns + categorical_columns].assign(
        postal_code=lambda frame: pd.to_numeric(frame['postal_code'], errors='coerce')
    ),
    columns=categorical_columns,
    dtype=float,
)

# Keep only the 'price' column of the correlation matrix: one coefficient per feature.
correlation_coefficient = encoded_data.corr()[['price']]

# Show the coefficients from the most positive to the most negative.
print(correlation_coefficient.sort_values(by=['price'], ascending=False))

In [None]:
# Correlation of every feature with price, without price's own (trivial) entry.
price_correlation = correlation_coefficient['price'].drop(labels='price')
print(price_correlation.sort_values(ascending=False))

In [None]:
# Order the features from the most positive to the most negative correlation with price.
sorted_correlation = price_correlation.sort_values(ascending=False)

# Bar chart with one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sorted_correlation.index, sorted_correlation.values)
ax.set_xlabel('Columns')
ax.set_ylabel('Correlation Coefficient')
ax.set_title('Correlation between Price and Other Columns')
# Rotate the feature names so that they do not overlap.
ax.tick_params(axis='x', labelrotation=60)
plt.show()


How are variables correlated to each other? 

In [None]:
# Work on a copy of the dataset without the 'type_of_sale' column.
drop_columns_type = ['type_of_sale']
immo_dataset_no_type_of_sale = immo_dataset.drop(columns=drop_columns_type)
immo_dataset_no_type_of_sale.head(10)

In [None]:


# Columns that take part in the correlation analysis ('type_of_sale' is excluded here).
numeric_columns = ['postal_code', 'price', 'number_of_rooms',
                   'is_furnished', 'has_open_fire', 'has_terrace',
                   'has_garden', 'habitable_surface', 'has_swimming_pool']

categorical_columns = ['region', 'property_type', 'subtype_property']

# Build the frame to correlate:
#  * restrict it to the columns listed above;
#  * postal_code is stored as text (object dtype), so coerce it to numbers --
#    otherwise corr() drops it or raises, depending on the pandas version.
#    Values that are not purely numeric become NaN and are skipped by corr();
#  * one-hot encode the categorical columns as floats so they can be correlated.
encoded_data = pd.get_dummies(
    immo_dataset_no_type_of_sale[numeric_columns + categorical_columns].assign(
        postal_code=lambda frame: pd.to_numeric(frame['postal_code'], errors='coerce')
    ),
    columns=categorical_columns,
    dtype=float,
)

# Pairwise Pearson correlation between all encoded columns.
pearson_corr = encoded_data.corr(method='pearson')

# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(16, 16))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm')
plt.title('Pearson Correlation Heatmap')
plt.show()

In [None]:
# Names of the columns stored as float64 or int64.
numeric_columns = immo_dataset.select_dtypes(include=['float64', 'int64']).columns

# Pairwise Pearson correlation restricted to those columns.
pearson_corr = immo_dataset.loc[:, numeric_columns].corr(method='pearson')

print("Pearson correlation:\n", pearson_corr)

In [None]:

# Columns that take part in the correlation analysis.
numeric_columns = ['postal_code', 'price', 'number_of_rooms',
                   'is_furnished', 'has_open_fire', 'has_terrace',
                   'has_garden', 'habitable_surface', 'has_swimming_pool']

categorical_columns = ['region', 'property_type', 'subtype_property', 'type_of_sale']

# Build the frame to correlate:
#  * restrict it to the columns listed above;
#  * postal_code is stored as text (object dtype), so coerce it to numbers --
#    otherwise corr() drops it or raises, depending on the pandas version.
#    Values that are not purely numeric become NaN and are skipped by corr();
#  * one-hot encode the categorical columns as floats so they can be correlated.
encoded_data = pd.get_dummies(
    immo_dataset[numeric_columns + categorical_columns].assign(
        postal_code=lambda frame: pd.to_numeric(frame['postal_code'], errors='coerce')
    ),
    columns=categorical_columns,
    dtype=float,
)

# Full pairwise correlation matrix of the encoded data.
correlation_coefficient = encoded_data.corr()

print(correlation_coefficient)

In [None]:


# Names of the columns stored as float64 or int64.
numeric_columns = immo_dataset.select_dtypes(include=['float64', 'int64']).columns

# Pairwise Pearson correlation restricted to those columns.
pearson_corr = immo_dataset.loc[:, numeric_columns].corr(method='pearson')

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Pearson Correlation Heatmap')
plt.show()

Which variables have the greatest influence on the price?

Which variables have the least influence on the price?

How many qualitative and quantitative variables are there? How would you transform these values into numerical values?

Percentage of missing values per column?

https://campus.datacamp.com/courses/data-manipulation-with-pandas/creating-and-visualizing-dataframes?ex=6

https://campus.datacamp.com/courses/dealing-with-missing-data-in-python/the-problem-with-missing-data?ex=5

In [None]:
# Answer to the question above: percentage of missing values per column.
missing_percentage = immo_dataset.isna().mean().mul(100)
print(missing_percentage)

Step 3: Data Interpretation

Plot the outliers.

In [None]:
# Prices as a standalone Series.
price_series = pd.Series(immo_dataset['price'])

# One point per listing: x is the listing's position in the data, y is its price.
listing_positions = range(len(price_series))
plt.scatter(listing_positions, price_series)

plt.title('Scatter plot for Price Variation')
plt.xlabel('Listing Index')
plt.ylabel('Price')
plt.show()

In [None]:
# Flag prices lying more than `threshold` standard deviations away from the mean.
threshold = 3  # Choose an appropriate threshold for identifying outliers

# The price column contains a missing value. With the default
# nan_policy='propagate' a single NaN turns every z-score into NaN, so no
# outlier would ever be detected. 'omit' ignores NaNs when computing the mean
# and standard deviation while keeping the result aligned with `price_series`.
z_scores = stats.zscore(price_series.to_numpy(dtype=float), nan_policy='omit')

# Comparisons against NaN are False, so listings without a price are never flagged.
outliers = price_series[np.abs(z_scores) > threshold]

In [None]:
# Prices as a standalone Series.
price_series = pd.Series(immo_dataset['price'])

# A box plot summarises the distribution of the prices and draws values beyond
# the whiskers as individual outlier points.
# Missing prices are dropped first: the box statistics are percentile-based, and
# a NaN in the data makes them NaN, so the box would not be drawn.
plt.boxplot(price_series.dropna())
plt.title('Box Plot for Price')
plt.ylabel('Price')
plt.show()

Which variables would you delete and why ?


Represent the number of properties according to their surface using a histogram.


In [None]:
# Habitable surface of every listing.
surface = immo_dataset['habitable_surface']

# Distribution of the surfaces over 50 equal-width bins.
plt.hist(surface, bins=50, edgecolor='black')

plt.title('Number of properties according to their surface')
plt.xlabel('Surface (sq. meters)')
plt.ylabel('Number of Properties')
plt.show()



In your opinion, which 5 variables are the most important and why?


What are the most expensive municipalities in Belgium? (Average price, median price, price per square meter)


What are the most expensive municipalities in Wallonia? (Average price, median price, price per square meter)


What are the most expensive municipalities in Flanders? (Average price, median price, price per square meter)


What are the least expensive municipalities in Belgium? (Average price, median price, price per square meter)


What are the least expensive municipalities in Wallonia? (Average price, median price, price per square meter)


What are the least expensive municipalities in Flanders? (Average price, median price, price per square meter)
