In [2]:
#Import packages (standard library first, then third-party) and show the working directory
import datetime
import os

import pandas as pd
import plotly.express as px
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

#Print the directory the notebook is running from
wd = os.getcwd()
print(wd)

/Users/Marcel/Desktop/Master Digital Driven Business/Database Management and Digital Tools/Funda


Create postgres connection and read in tables

In [4]:
#Open the connection to the local 'funda' Postgres database
engine = create_engine('postgresql://localhost/funda')

#Load the houses table
houses = pd.read_sql_table("houses", con=engine)

#Prices stored as the text 'NULL' are set to 0 so the whole column can be parsed as numbers
price_is_null = houses['koopprijs'] == 'NULL'
houses.loc[price_is_null, 'koopprijs'] = 0
houses['koopprijs'] = pd.to_numeric(houses['koopprijs'])

#Parse both date columns (year-month-day)
for date_col in ('datum_ondertekening', 'publicatiedatum'):
    houses[date_col] = pd.to_datetime(houses[date_col], format='%Y-%m-%d')

#Month number of the signing date as zero-padded text ('01'-'12'), not the month name
houses['SellingMonth'] = houses['datum_ondertekening'].dt.strftime('%m')


#Load the four region reference tables, each into a dataframe named after its table
municipalities, districts, neighborhoods, zipcodes = (
    pd.read_sql_table(table_name, con=engine)
    for table_name in ("municipalities", "districts", "neighborhoods", "zipcodes")
)


#Load the energy table
energy = pd.read_sql_table("energy", con=engine)

#Trim leading/trailing whitespace from the region columns and the two consumption columns.
#The consumption columns are only trimmed here; they are not converted to numbers.
for energy_col in (
    'soortregio_2',
    'wijkenenbuurten',
    'gemeentenaam_1',
    'gemiddeldelektriciteitsverbruiktotaal_47',
    'gemiddeldaardgasverbruiktotaal_55',
):
    energy[energy_col] = energy[energy_col].str.strip()


#Load the crime table
crime = pd.read_sql_table("crime", con=engine)

crime_region_cols = ['soortregio_2', 'wijkenenbuurten', 'gemeentenaam_1']
crime_count_cols = [
    'totaaldiefstaluitwoningschuured_78',
    'vernielingmisdrijftegenopenbareorde_79',
    'geweldsenseksuelemisdrijven_80',
]

#Trim leading/trailing whitespace from every text column
for crime_col in crime_region_cols + crime_count_cols:
    crime[crime_col] = crime[crime_col].str.strip()

#Drop every row in which one of the count columns holds '.'
for crime_col in crime_count_cols:
    crime = crime.loc[crime[crime_col] != '.']

#Parse the remaining counts as numbers
for crime_col in crime_count_cols:
    crime[crime_col] = pd.to_numeric(crime[crime_col])

#Total Violations = sum of the three count columns
theft, vandalism, violence = (crime[crime_col] for crime_col in crime_count_cols)
crime['Total Violations'] = theft + vandalism + violence



#Load the distances table
distances = pd.read_sql_table("distances", con=engine)

distances_region_cols = ['wijkenenbuurten', 'gemeentenaam_1', 'soortregio_2']
distances_value_cols = [
    'afstandtothuisartsenpraktijk_95',
    'afstandtotgrotesupermarkt_96',
    'afstandtotkinderdagverblijf_97',
    'afstandtotschool_98',
    'scholenbinnen3km_99',
    'matevanstedelijkheid_105',
    'omgevingsadressendichtheid_106',
]

#Trim leading/trailing whitespace from every text column
for distances_col in distances_region_cols + distances_value_cols:
    distances[distances_col] = distances[distances_col].str.strip()

#Drop every row in which one of the value columns holds '.'
for distances_col in distances_value_cols:
    distances = distances.loc[distances[distances_col] != '.']

#Parse the remaining values as numbers
for distances_col in distances_value_cols:
    distances[distances_col] = pd.to_numeric(distances[distances_col])



#Load the age_groups table
age_groups = pd.read_sql_table("age_groups", con=engine)

age_region_cols = ['soortregio_2', 'wijkenenbuurten', 'gemeentenaam_1']
age_count_cols = [
    'k_0tot15jaar_8',
    'k_15tot25jaar_9',
    'k_25tot45jaar_10',
    'k_45tot65jaar_11',
    'k_65jaarofouder_12',
]

#Trim leading/trailing whitespace from every text column
for age_col in age_region_cols + age_count_cols:
    age_groups[age_col] = age_groups[age_col].str.strip()

#Parse the age-group columns as numbers (no rows are filtered out for this table)
for age_col in age_count_cols:
    age_groups[age_col] = pd.to_numeric(age_groups[age_col])


#Load the economic table.
#Only the municipality rows are kept below: too many values are missing for districts and neighborhoods.
economic = pd.read_sql_table("economic", con=engine)

#Trim leading/trailing whitespace
for economic_col in (
    'wijkenenbuurten',
    'gemeentenaam_1',
    'soortregio_2',
    'aantalinkomensontvangers_64',
    'gemiddeldinkomenperinkomensontvanger_65',
    'gemiddeldinkomenperinwoner_66',
    'actieven1575jaar_69',
):
    economic[economic_col] = economic[economic_col].str.strip()

#Drop rows where either income column holds '.'
for economic_col in ('gemiddeldinkomenperinkomensontvanger_65', 'gemiddeldinkomenperinwoner_66'):
    economic = economic.loc[economic[economic_col] != '.']

#Keep the municipality ('Gemeente') rows and only the relevant columns
is_municipality = economic['soortregio_2'] == 'Gemeente'
economic = economic.loc[
    is_municipality,
    ['wijkenenbuurten', 'gemeentenaam_1', 'soortregio_2', 'gemiddeldinkomenperinwoner_66'],
]



#Load the population_density table
population_density = pd.read_sql_table("population_density", con=engine)

#Trim leading/trailing whitespace from the region columns and the density column
for density_col in ('soortregio_2', 'wijkenenbuurten', 'gemeentenaam_1', 'bevolkingsdichtheid_33'):
    population_density[density_col] = population_density[density_col].str.strip()

#A density of '.' is replaced by 0 (the rows are kept), then the column is parsed as numbers
density_is_dot = population_density['bevolkingsdichtheid_33'] == '.'
population_density.loc[density_is_dot, 'bevolkingsdichtheid_33'] = 0
population_density['bevolkingsdichtheid_33'] = pd.to_numeric(population_density['bevolkingsdichtheid_33'])



#Load the cbshousing2018 table
cbshousing2018 = pd.read_sql_table("cbshousing2018", con=engine)

housing_region_cols = ['wijkenenbuurten', 'gemeentenaam_1', 'soortregio_2']
housing_value_cols = [
    'woningvoorraad_34',
    'gemiddeldewoningwaarde_35',
    'bouwjaarvoor2000_45',
    'bouwjaarvanaf2000_46',
]

#Trim leading/trailing whitespace from every text column
for housing_col in housing_region_cols + housing_value_cols:
    cbshousing2018[housing_col] = cbshousing2018[housing_col].str.strip()

#Drop every row in which one of the value columns holds '.'
for housing_col in housing_value_cols:
    cbshousing2018 = cbshousing2018.loc[cbshousing2018[housing_col] != '.']

#Parse the remaining values as numbers
for housing_col in housing_value_cols:
    cbshousing2018[housing_col] = pd.to_numeric(cbshousing2018[housing_col])

Join the municipality, district and neighborhood names to the zipcodes. Duplicate rows, which appear once the house-number column is left out, are removed.

In [42]:
#Build zipp: one row per postcode with the 2019 municipality, district and neighborhood
#codes and their names.

#Keep pc6 and the three region codes, and rename pc6 to postcode.
#rename() replaces the earlier write into columns.values: an Index is immutable, and editing
#its backing array in place can leave later lookups such as zipp['postcode'] stale.
zipcodes2 = (
    zipcodes[['pc6', 'gemeente2019', 'wijk2019', 'buurt2019']]
    .rename(columns={'pc6': 'postcode'})
    #Duplicates exist because the house-number column was left out
    .drop_duplicates()
)

#Look up the names for the municipality, district and neighborhood codes
zipp = pd.merge(zipcodes2, municipalities, left_on='gemeente2019', right_on='gemcode2019')
zipp = pd.merge(zipp, districts, left_on='wijk2019', right_on='wijkcode2019')
zipp = pd.merge(zipp, neighborhoods, left_on='buurt2019', right_on='buurtcode_2019')

#Select the useful columns in a readable order: each code followed by its name
zipp = zipp[['postcode', 'gemeente2019', 'gemeentenaam2019', 'wijk2019', 'wijknaam_2019k_naam', 'buurt2019', 'buurtnaam_2019']]

#Store the result; if_exists='replace' lets the cell be re-run without a
#"table already exists" error from the default if_exists='fail'
zipp.to_sql('ZipcodesComplete', engine, if_exists='replace')



In [43]:
#Attach the municipality, district and neighborhood of each house via its postcode.
#A postcode can occur more than once in zipp (presumably when it spans several
#neighborhoods - TODO confirm), so only the first match per house (globalid) is kept.
housescomplete = (
    pd.merge(houses, zipp, on='postcode', how='inner')
    .drop_duplicates(subset='globalid', keep='first')
)

#Store the result; if_exists='replace' lets the cell be re-run without a
#"table already exists" error from the default if_exists='fail'
housescomplete.to_sql('HousesMuncDistNeigh', engine, if_exists='replace')

In [44]:
#Measure how many characters the gemeente2019, wijk2019 and buurt2019 codes in zipp have

#Output columns, and the zipp column whose code length each one holds
columns = ['LengthOfGemeentecode','LengthOfWijkcode', 'LengthOfBuurtcode']
code_sources = ['gemeente2019', 'wijk2019', 'buurt2019']

codes = pd.DataFrame(
    {
        length_col: zipp[source_col].astype(str).str.len()
        for length_col, source_col in zip(columns, code_sources)
    },
    columns=columns,
)

#Shortest and longest code per level
(mingc, maxgc), (mindc, maxdc), (minnc, maxnc) = (
    (min(codes[length_col]), max(codes[length_col])) for length_col in columns
)

#Print results
print(mingc, maxgc, mindc, maxdc, minnc, maxnc) ##Gemeentecode can consist of 1-4 digits
                                                ##Wijkcode can consist of 3-6 digits
                                                ##Buurtcode can consist of 5-8 digits

1 4 3 6 5 8


Extract gemeentecode, wijkcode and buurtcode from energy, crime, distances, age_groups, cbshousing2018 and population_density and merge with zipp:

This results in clean tables that hold, for every postcode, the municipality name, district name and neighborhood name together with the values measured at the municipality, district and neighborhood level.
These tables can be used to compare a neighborhood with the district and the municipality it belongs to.

In [46]:
#Attach the energy figures of the municipality, district and neighborhood to every postcode.
#
#The region codes in zipp have 1-4 characters (municipality), 3-6 (district) and 5-8
#(neighborhood). wijkenenbuurten is presumably a letter prefix followed by a zero-padded
#number (e.g. 'GM0003') - TODO confirm against the table. The join key is therefore the
#trailing run of digits in wijkenenbuurten with its leading zeros removed.
#
#This replaces the earlier approach of trying every possible suffix length and keeping the
#first duplicate. With that approach a short suffix of one region's key (e.g. '14' taken
#from a key ending in '0114') could stand in for a different region (the one ending in
#'0014') whenever that region's own row was missing or sorted later.
#The energy dataframe itself is no longer given scratch gemcode/wijkcode/buurtcode columns.

#(source column, English label); the region level is appended to the label
energy_value_labels = [
    ('gemiddeldelektriciteitsverbruiktotaal_47', 'Avg Electricity Consumption'),
    ('gemiddeldaardgasverbruiktotaal_55', 'Avg Natural Gas Consumption'),
]

#(value of soortregio_2, name of the join-key column, level name used in the column labels)
energy_levels = [
    ('Gemeente', 'gemcode', 'Municipality'),
    ('Wijk', 'wijkcode', 'District'),
    ('Buurt', 'buurtcode', 'Neighborhood'),
]

energy_lookups = {}
for region_type, code_col, level_name in energy_levels:
    region_rows = energy.loc[energy['soortregio_2'] == region_type]

    #Value columns under their English names. rename() is used because writing into
    #columns.values edits an immutable Index in place.
    lookup = region_rows[[col for col, label in energy_value_labels]].rename(
        columns={col: label + ' ' + level_name for col, label in energy_value_labels}
    )

    #Join key: trailing digits of the region key without leading zeros
    region_code = (
        region_rows['wijkenenbuurten']
        .str.extract(r'(\d+)$', expand=False)
        .str.lstrip('0')
    )
    lookup.insert(0, code_col, region_code)

    #Rows without a numeric key cannot be joined. One row per code is kept so the
    #merges below never multiply postcodes.
    energy_lookups[code_col] = (
        lookup
        .dropna(subset=[code_col])
        .drop_duplicates(subset=code_col, keep='first')
    )

energygcc = energy_lookups['gemcode']
energywcc = energy_lookups['wijkcode']
energybcc = energy_lookups['buurtcode']

#Inner joins: a postcode is kept only when all three levels have energy data
energyzip = pd.merge(zipp, energygcc, left_on='gemeente2019', right_on='gemcode', how='inner')
energyzip = pd.merge(energyzip, energywcc, left_on='wijk2019', right_on='wijkcode', how='inner')
energyzip = pd.merge(energyzip, energybcc, left_on='buurt2019', right_on='buurtcode', how='inner')

#The run with suffix matching kept 453185 of the 488315 rows in zipp (35130 postcodes lost
#because of missing data). Re-check the count: exact-code matching may keep fewer rows.

#Select the postcode, the three region names and the values per level
#(municipality first, then district, then neighborhood)
energy_output_cols = [
    label + ' ' + level_name
    for region_type, code_col, level_name in energy_levels
    for col, label in energy_value_labels
]
EnergyZipClean = energyzip[
    ['postcode', 'gemeentenaam2019', 'wijknaam_2019k_naam', 'buurtnaam_2019'] + energy_output_cols
].rename(columns={
    'postcode': 'Postcode',
    'gemeentenaam2019': 'Municipality',
    'wijknaam_2019k_naam': 'District',
    'buurtnaam_2019': 'Neighborhood',
})

#Store the result; if_exists='replace' lets the cell be re-run without a
#"table already exists" error from the default if_exists='fail'
EnergyZipClean.to_sql('EnergyGasZipClean', engine, if_exists='replace')

In [47]:
#Attach the crime figures of the municipality, district and neighborhood to every postcode.
#
#The region codes in zipp have 1-4 characters (municipality), 3-6 (district) and 5-8
#(neighborhood). wijkenenbuurten is presumably a letter prefix followed by a zero-padded
#number (e.g. 'GM0003') - TODO confirm against the table. The join key is therefore the
#trailing run of digits in wijkenenbuurten with its leading zeros removed.
#
#This replaces the earlier approach of trying every possible suffix length and keeping the
#first duplicate. Rows with '.' were already removed from crime, so with that approach a
#short suffix of another region's key (e.g. '14' taken from a key ending in '0114') could
#stand in for a region whose own row had been dropped, giving it another region's figures.
#The crime dataframe itself is no longer given scratch gemcode/wijkcode/buurtcode columns.

#(source column, English label); the region level is appended to the label
crime_value_labels = [
    ('totaaldiefstaluitwoningschuured_78', 'Avg Theft From Barns'),
    ('vernielingmisdrijftegenopenbareorde_79', 'Avg Crime Against Public Order'),
    ('geweldsenseksuelemisdrijven_80', 'Avg Crime Assault and Sexuality'),
    ('Total Violations', 'Avg Total Violations'),
]

#(value of soortregio_2, name of the join-key column, level name used in the column labels)
crime_levels = [
    ('Gemeente', 'gemcode', 'Municipality'),
    ('Wijk', 'wijkcode', 'District'),
    ('Buurt', 'buurtcode', 'Neighborhood'),
]

crime_lookups = {}
for region_type, code_col, level_name in crime_levels:
    region_rows = crime.loc[crime['soortregio_2'] == region_type]

    #Value columns under their English names. rename() is used because writing into
    #columns.values edits an immutable Index in place.
    lookup = region_rows[[col for col, label in crime_value_labels]].rename(
        columns={col: label + ' ' + level_name for col, label in crime_value_labels}
    )

    #Join key: trailing digits of the region key without leading zeros
    region_code = (
        region_rows['wijkenenbuurten']
        .str.extract(r'(\d+)$', expand=False)
        .str.lstrip('0')
    )
    lookup.insert(0, code_col, region_code)

    #Rows without a numeric key cannot be joined. One row per code is kept so the
    #merges below never multiply postcodes.
    crime_lookups[code_col] = (
        lookup
        .dropna(subset=[code_col])
        .drop_duplicates(subset=code_col, keep='first')
    )

crimegcc = crime_lookups['gemcode']
crimewcc = crime_lookups['wijkcode']
crimebcc = crime_lookups['buurtcode']

#Inner joins: a postcode is kept only when all three levels have crime data
crimezip = pd.merge(zipp, crimegcc, left_on='gemeente2019', right_on='gemcode', how='inner')
crimezip = pd.merge(crimezip, crimewcc, left_on='wijk2019', right_on='wijkcode', how='inner')
crimezip = pd.merge(crimezip, crimebcc, left_on='buurt2019', right_on='buurtcode', how='inner')

#The run with suffix matching kept 447447 of the 488315 rows in zipp (40868 postcodes lost
#because of missing data). Re-check the count: exact-code matching may keep fewer rows.

#Select the postcode, the three region names and the values per level
#(municipality first, then district, then neighborhood).
#NOTE(review): the location columns keep their original names in this table
#(postcode, gemeentenaam2019, ...); confirm whether consumers expect that.
crime_output_cols = [
    label + ' ' + level_name
    for region_type, code_col, level_name in crime_levels
    for col, label in crime_value_labels
]
CrimeZipClean = crimezip[
    ['postcode', 'gemeentenaam2019', 'wijknaam_2019k_naam', 'buurtnaam_2019'] + crime_output_cols
]

#Store the result; if_exists='replace' lets the cell be re-run without a
#"table already exists" error from the default if_exists='fail'
CrimeZipClean.to_sql('CrimeZipClean', engine, if_exists='replace')

In [48]:
#Municipality code can consist of 1-4 numbers. By selecting the last 5 to 8 digits from wijkenenbuurten in energy where soortregio is Buurt, we collect all the neighborhood codes.
#Which we can join later with the houses dataset

#Gemeentecode with 4 digits
distances['gemcode'] = distances['wijkenenbuurten'].str[-4:] ##.str[-4:] selects the last for characters from a column
distancesgc = distances.loc[(distances['soortregio_2'] == 'Gemeente')] 

#Gemeentecode with 3 digits
distances['gemcode'] = distances['wijkenenbuurten'].str[-3:]
distancesgc1 = distances.loc[(distances['soortregio_2'] == 'Gemeente')]

#Gemeentecode with 2 digits
distances['gemcode'] = distances['wijkenenbuurten'].str[-2:]
distancesgc2 = distances.loc[(distances['soortregio_2'] == 'Gemeente')]

#Gemeentecode with 1 digits
distances['gemcode'] = distances['wijkenenbuurten'].str[-1:]
distancesgc3 = distances.loc[(distances['soortregio_2'] == 'Gemeente')]

#Merge with concat (all column names are the same)
distancesgcc = pd.concat([distancesgc, distancesgc1, distancesgc2, distancesgc3])

#Select usable columns
distancesgcc = distancesgcc[['gemcode', 'afstandtothuisartsenpraktijk_95', 'afstandtotgrotesupermarkt_96', 'afstandtotkinderdagverblijf_97', 'afstandtotschool_98',
                             'scholenbinnen3km_99', 'matevanstedelijkheid_105', 'omgevingsadressendichtheid_106']]

#Drop the duplicates ()
distancesgcc = distancesgcc.drop_duplicates(subset='gemcode', keep="first", inplace=False)

#Change column names
distancesgcc.columns.values[0] = "gemcode" #Change colname to english
distancesgcc.columns.values[1] = "Avg Distance to General Practice Municipality" #Change colname to english
distancesgcc.columns.values[2] = "Avg Distance to Grocery Store Municipality" #Change colname to english
distancesgcc.columns.values[3] = "Avg Distance to Daycare Municipality" #Change colname to english
distancesgcc.columns.values[4] = "Avg Distance to School Municipality" #Change colname to english
distancesgcc.columns.values[5] = "Avg Schools Within 3KM Municipality" #Change colname to english
distancesgcc.columns.values[6] = "Avg Degree of Urbanity Municipality" #Change colname to english
distancesgcc.columns.values[7] = "Avg Environmental Address Density Municipality" #Change colname to english

#District code can consist of 3 - 6 numbers. By selecting the last 5 to 8 digits from wijkenenbuurten in energy where soortregio is Buurt, we collect all the neighborhood codes.
#Which we can join later with the houses dataset

#Wijkcode with 6 digits
distances['wijkcode'] = distances['wijkenenbuurten'].str[-6:]
distanceswc = distances.loc[(distances['soortregio_2'] == 'Wijk')] 

#Wijkcode with 5 digits
distances['wijkcode'] = distances['wijkenenbuurten'].str[-5:]
distanceswc1 = distances.loc[(distances['soortregio_2'] == 'Wijk')]

#Wijkcode with 4 digits
distances['wijkcode'] = distances['wijkenenbuurten'].str[-4:]
distanceswc2 = distances.loc[(distances['soortregio_2'] == 'Wijk')]

#Wijkcode with 3 digits
distances['wijkcode'] = distances['wijkenenbuurten'].str[-3:]
distanceswc3 = distances.loc[(distances['soortregio_2'] == 'Wijk')]

#Merge with concat (all column names are the same)
distanceswcc = pd.concat([distanceswc, distanceswc1, distanceswc2, distanceswc3])

#Select usable columns
distanceswcc = distanceswcc[['wijkcode', 'afstandtothuisartsenpraktijk_95', 'afstandtotgrotesupermarkt_96', 'afstandtotkinderdagverblijf_97', 'afstandtotschool_98',
                             'scholenbinnen3km_99', 'matevanstedelijkheid_105', 'omgevingsadressendichtheid_106']]

#Drop the duplicates ()
distanceswcc = distanceswcc.drop_duplicates(subset='wijkcode', keep="first", inplace=False)

#Change column names
distanceswcc.columns.values[0] = "wijkcode" #Change colname to english
distanceswcc.columns.values[1] = "Avg Distance to General Practice District" #Change colname to english
distanceswcc.columns.values[2] = "Avg Distance to Grocery Store District" #Change colname to english
distanceswcc.columns.values[3] = "Avg Distance to Daycare District" #Change colname to english
distanceswcc.columns.values[4] = "Avg Distance to School District" #Change colname to english
distanceswcc.columns.values[5] = "Avg Schools Within 3KM District" #Change colname to english
distanceswcc.columns.values[6] = "Avg Degree of Urbanity District" #Change colname to english
distanceswcc.columns.values[7] = "Avg Environmental Address Density District" #Change colname to english

#Neighborhood code can consist of 5 - 8 numbers. By selecting the last 5 to 8 digits from wijkenenbuurten in energy where soortregio is Buurt, we collect all the neighborhood codes.
#Which we can join later with the houses dataset

#Buurtcodes with 8 digits
distances['buurtcode'] = distances['wijkenenbuurten'].str[-8:]
distancesbc = distances.loc[(distances['soortregio_2'] == 'Buurt')] 

#Buurtcodes with 7 digits
distances['buurtcode'] = distances['wijkenenbuurten'].str[-7:]
distancesbc1 = distances.loc[(distances['soortregio_2'] == 'Buurt')]

#Buurtcodes with 6 digits
distances['buurtcode'] = distances['wijkenenbuurten'].str[-6:]
distancesbc2 = distances.loc[(distances['soortregio_2'] == 'Buurt')]

#Buurtcodes with 5 digits
distances['buurtcode'] = distances['wijkenenbuurten'].str[-5:]
distancesbc3 = distances.loc[(distances['soortregio_2'] == 'Buurt')]

#Merge with concat (all column names are the same)
distancesbcc = pd.concat([distancesbc, distancesbc1, distancesbc2, distancesbc3])

#Select usable columns
distancesbcc = distancesbcc[['buurtcode', 'afstandtothuisartsenpraktijk_95', 'afstandtotgrotesupermarkt_96', 'afstandtotkinderdagverblijf_97', 'afstandtotschool_98',
                             'scholenbinnen3km_99', 'matevanstedelijkheid_105', 'omgevingsadressendichtheid_106']]

#Drop the duplicates ()
distancesbcc = distancesbcc.drop_duplicates(subset='buurtcode', keep="first", inplace=False)

#Change column names
distancesbcc.columns.values[0] = "buurtcode" #Change colname to english
distancesbcc.columns.values[1] = "Avg Distance to General Practice Neighborhood" #Change colname to english
distancesbcc.columns.values[2] = "Avg Distance to Grocery Store Neighborhood" #Change colname to english
distancesbcc.columns.values[3] = "Avg Distance to Daycare Neighborhood" #Change colname to english
distancesbcc.columns.values[4] = "Avg Distance to School Neighborhood" #Change colname to english
distancesbcc.columns.values[5] = "Avg Schools Within 3KM Neighborhood" #Change colname to english
distancesbcc.columns.values[6] = "Avg Degree of Urbanity Neighborhood" #Change colname to english
distancesbcc.columns.values[7] = "Avg Environmental Address Density Neighborhood" #Change colname to english

#Perform joins: attach the municipality, district and neighborhood distance figures to every zipcode.
#All three are inner joins, so a zipcode without a match on any level is dropped.
distancezip = pd.merge(zipp, distancesgcc, left_on = 'gemeente2019', right_on = 'gemcode', how = 'inner')
distancezip = pd.merge(distancezip, distanceswcc, left_on = 'wijk2019', right_on = 'wijkcode', how = 'inner')
distancezip = pd.merge(distancezip, distancesbcc, left_on = 'buurt2019', right_on = 'buurtcode', how = 'inner')

#NOTE(review): the original overview comment here named 'demographiczip' (450086 rows) and looks
#copied from another cell; the row count noted on the to_sql line below is the one for this table.

#Select the zipcode/name columns plus the seven distance metrics on each of the three levels
distance_metrics = ('Avg Distance to General Practice', 'Avg Distance to Grocery Store', 'Avg Distance to Daycare',
                    'Avg Distance to School', 'Avg Schools Within 3KM', 'Avg Degree of Urbanity',
                    'Avg Environmental Address Density')
distance_columns = [f'{metric} {level}'
                    for level in ('Municipality', 'District', 'Neighborhood')
                    for metric in distance_metrics]

#Change colnames with rename(); writing into columns.values mutates an immutable Index and is not reliable
DistanceZipClean = distancezip[['postcode', 'gemeentenaam2019', 'wijknaam_2019k_naam', 'buurtnaam_2019'] + distance_columns].rename(columns={
    'postcode': "Postcode",
    'gemeentenaam2019': "Municipality",
    'wijknaam_2019k_naam': "District",
    'buurtnaam_2019': "Neighborhood",
})

#Write to the database
DistanceZipClean.to_sql('DistanceZipClean', engine) #450516, 488315 rows in zipcodes -> 37799 zipcodes lost because of missing data.

In [49]:
#Municipality (Gemeente) codes: take the last 4, 3, 2 and 1 characters of 'wijkenenbuurten' for every
#row in population_density whose 'soortregio_2' is 'Gemeente'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_gemeente_row = population_density['soortregio_2'] == 'Gemeente'
population_density_gemeente_parts = []
for code_length in (4, 3, 2, 1):
    #population_density itself ends up with the 'gemcode' of the last iteration (1 character), as before
    population_density['gemcode'] = population_density['wijkenenbuurten'].str[-code_length:]
    population_density_gemeente_parts.append(population_density.loc[is_gemeente_row])

#Keep the per-length frames available under their original names
population_density_gc, population_density_gc1, population_density_gc2, population_density_gc3 = population_density_gemeente_parts

#Stack all candidates, keep the usable columns, keep one row per code and rename the value column.
#rename() replaces the unsupported in-place write into columns.values.
population_density_gcc = (
    pd.concat(population_density_gemeente_parts)[['gemcode', 'bevolkingsdichtheid_33']]
    .drop_duplicates(subset='gemcode', keep="first")
    .rename(columns={'bevolkingsdichtheid_33': "population_density_Municipality"})
)

#District (Wijk) codes: take the last 6, 5, 4 and 3 characters of 'wijkenenbuurten' for every
#row in population_density whose 'soortregio_2' is 'Wijk'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_wijk_row = population_density['soortregio_2'] == 'Wijk'
population_density_wijk_parts = []
for code_length in (6, 5, 4, 3):
    #population_density itself ends up with the 'wijkcode' of the last iteration (3 characters), as before
    population_density['wijkcode'] = population_density['wijkenenbuurten'].str[-code_length:]
    population_density_wijk_parts.append(population_density.loc[is_wijk_row])

#Keep the per-length frames available under their original names
population_density_wc, population_density_wc1, population_density_wc2, population_density_wc3 = population_density_wijk_parts

#Stack all candidates, keep the usable columns, keep one row per code and rename the value column.
#rename() replaces the unsupported in-place write into columns.values.
population_density_wcc = (
    pd.concat(population_density_wijk_parts)[['wijkcode', 'bevolkingsdichtheid_33']]
    .drop_duplicates(subset='wijkcode', keep="first")
    .rename(columns={'bevolkingsdichtheid_33': "population_density_District"})
)

#Neighborhood (Buurt) codes: take the last 8, 7, 6 and 5 characters of 'wijkenenbuurten' for every
#row in population_density whose 'soortregio_2' is 'Buurt'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_buurt_row = population_density['soortregio_2'] == 'Buurt'
population_density_buurt_parts = []
for code_length in (8, 7, 6, 5):
    #population_density itself ends up with the 'buurtcode' of the last iteration (5 characters), as before
    population_density['buurtcode'] = population_density['wijkenenbuurten'].str[-code_length:]
    population_density_buurt_parts.append(population_density.loc[is_buurt_row])

#Keep the per-length frames available under their original names
population_density_bc, population_density_bc1, population_density_bc2, population_density_bc3 = population_density_buurt_parts

#Stack all candidates, keep the usable columns, keep one row per code and rename the value column.
#rename() replaces the unsupported in-place write into columns.values.
population_density_bcc = (
    pd.concat(population_density_buurt_parts)[['buurtcode', 'bevolkingsdichtheid_33']]
    .drop_duplicates(subset='buurtcode', keep="first")
    .rename(columns={'bevolkingsdichtheid_33': "population_density_Neighborhood"})
)

#Perform joins: attach the municipality, district and neighborhood population density to every zipcode.
#All three are inner joins, so a zipcode without a match on any level is dropped.
population_density_zip = pd.merge(zipp, population_density_gcc, left_on = 'gemeente2019', right_on = 'gemcode', how = 'inner')
population_density_zip = pd.merge(population_density_zip, population_density_wcc, left_on = 'wijk2019', right_on = 'wijkcode', how = 'inner')
population_density_zip = pd.merge(population_density_zip, population_density_bcc, left_on = 'buurt2019', right_on = 'buurtcode', how = 'inner')

#Select columns and change colnames.
#rename() is used because writing into columns.values mutates an immutable Index and is not reliable.
population_density_zip_clean = population_density_zip[['postcode', 'gemeentenaam2019', 'wijknaam_2019k_naam', 'buurtnaam_2019',
                                                       'population_density_Municipality', 'population_density_District',
                                                       'population_density_Neighborhood']].rename(columns={
    'postcode': "Postcode",
    'gemeentenaam2019': "Municipality",
    'wijknaam_2019k_naam': "District",
    'buurtnaam_2019': "Neighborhood",
})

#Write to the database
population_density_zip_clean.to_sql('PopulationDensityZipClean', engine)

In [51]:
#Municipality (Gemeente) codes: take the last 4, 3, 2 and 1 characters of 'wijkenenbuurten' for every
#row in cbshousing2018 whose 'soortregio_2' is 'Gemeente'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_gemeente_row = cbshousing2018['soortregio_2'] == 'Gemeente'
cbshousing2018_gemeente_parts = []
for code_length in (4, 3, 2, 1):
    #cbshousing2018 itself ends up with the 'gemcode' of the last iteration (1 character), as before
    cbshousing2018['gemcode'] = cbshousing2018['wijkenenbuurten'].str[-code_length:]
    cbshousing2018_gemeente_parts.append(cbshousing2018.loc[is_gemeente_row])

#Keep the per-length frames available under their original names
cbshousing2018gc, cbshousing2018gc1, cbshousing2018gc2, cbshousing2018gc3 = cbshousing2018_gemeente_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#rename() replaces the unsupported in-place write into columns.values.
cbshousing2018gcc = (
    pd.concat(cbshousing2018_gemeente_parts)[['gemcode', 'woningvoorraad_34', 'gemiddeldewoningwaarde_35',
                                              'bouwjaarvoor2000_45', 'bouwjaarvanaf2000_46']]
    .drop_duplicates(subset='gemcode', keep="first")
    .rename(columns={
        'woningvoorraad_34': "Total Number of Houses Municipality",
        'gemiddeldewoningwaarde_35': "Average House Value Municipality",
        'bouwjaarvoor2000_45': "Houses Built Before 2000 Municipality",
        'bouwjaarvanaf2000_46': "Houses Built After 2000 Municipality",
    })
)



#District (Wijk) codes: take the last 6, 5, 4 and 3 characters of 'wijkenenbuurten' for every
#row in cbshousing2018 whose 'soortregio_2' is 'Wijk'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_wijk_row = cbshousing2018['soortregio_2'] == 'Wijk'
cbshousing2018_wijk_parts = []
for code_length in (6, 5, 4, 3):
    #cbshousing2018 itself ends up with the 'wijkcode' of the last iteration (3 characters), as before
    cbshousing2018['wijkcode'] = cbshousing2018['wijkenenbuurten'].str[-code_length:]
    cbshousing2018_wijk_parts.append(cbshousing2018.loc[is_wijk_row])

#Keep the per-length frames available under their original names
cbshousing2018wc, cbshousing2018wc1, cbshousing2018wc2, cbshousing2018wc3 = cbshousing2018_wijk_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#The key column stays 'wijkcode': it used to be overwritten with "gemcode" (copy-paste slip), while the
#join with zipp further down uses right_on = 'wijkcode'.
#rename() replaces the unsupported in-place write into columns.values.
cbshousing2018wcc = (
    pd.concat(cbshousing2018_wijk_parts)[['wijkcode', 'woningvoorraad_34', 'gemiddeldewoningwaarde_35',
                                          'bouwjaarvoor2000_45', 'bouwjaarvanaf2000_46']]
    .drop_duplicates(subset='wijkcode', keep="first")
    .rename(columns={
        'woningvoorraad_34': "Total Number of Houses District",
        'gemiddeldewoningwaarde_35': "Average House Value District",
        'bouwjaarvoor2000_45': "Houses Built Before 2000 District",
        'bouwjaarvanaf2000_46': "Houses Built After 2000 District",
    })
)



#Neighborhood (Buurt) codes: take the last 8, 7, 6 and 5 characters of 'wijkenenbuurten' for every
#row in cbshousing2018 whose 'soortregio_2' is 'Buurt'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_buurt_row = cbshousing2018['soortregio_2'] == 'Buurt'
cbshousing2018_buurt_parts = []
for code_length in (8, 7, 6, 5):
    #cbshousing2018 itself ends up with the 'buurtcode' of the last iteration (5 characters), as before
    cbshousing2018['buurtcode'] = cbshousing2018['wijkenenbuurten'].str[-code_length:]
    cbshousing2018_buurt_parts.append(cbshousing2018.loc[is_buurt_row])

#Keep the per-length frames available under their original names
cbshousing2018bc, cbshousing2018bc1, cbshousing2018bc2, cbshousing2018bc3 = cbshousing2018_buurt_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#The key column stays 'buurtcode': it used to be overwritten with "gemcode" (copy-paste slip), while the
#join with zipp further down uses right_on = 'buurtcode'.
#rename() replaces the unsupported in-place write into columns.values.
cbshousing2018bcc = (
    pd.concat(cbshousing2018_buurt_parts)[['buurtcode', 'woningvoorraad_34', 'gemiddeldewoningwaarde_35',
                                           'bouwjaarvoor2000_45', 'bouwjaarvanaf2000_46']]
    .drop_duplicates(subset='buurtcode', keep="first")
    .rename(columns={
        'woningvoorraad_34': "Total Number of Houses Neighborhood",
        'gemiddeldewoningwaarde_35': "Average House Value Neighborhood",
        'bouwjaarvoor2000_45': "Houses Built Before 2000 Neighborhood",
        'bouwjaarvanaf2000_46': "Houses Built After 2000 Neighborhood",
    })
)


#Perform joins: attach the municipality, district and neighborhood housing figures to every zipcode.
#All three are inner joins, so a zipcode without a match on any level is dropped.
cbshousing2018zip = pd.merge(zipp, cbshousing2018gcc, left_on = 'gemeente2019', right_on = 'gemcode', how = 'inner')
cbshousing2018zip = pd.merge(cbshousing2018zip, cbshousing2018wcc, left_on = 'wijk2019', right_on = 'wijkcode', how = 'inner')
cbshousing2018zip = pd.merge(cbshousing2018zip, cbshousing2018bcc, left_on = 'buurt2019', right_on = 'buurtcode', how = 'inner')

#Overview of dataset
#cbshousing2018zip #Containing 453185 rows, zipp has 488315 rows -> 35130 rows (postcodes) lost because of missing data.

#Change colnames of the zipcode/name columns by position (0, 2, 4 and 6), exactly as before, but by
#assigning a new column list instead of writing into columns.values (an Index is immutable and mutating
#its backing array can leave label lookups stale).
#NOTE(review): presumably these positions hold 'postcode', 'gemeentenaam2019', 'wijknaam_2019k_naam' and
#'buurtnaam_2019' of zipp - confirm against zipp's column order.
renamed_positions = {0: "Postcode", 2: "Municipality", 4: "District", 6: "Neighborhood"}
cbshousing2018zip.columns = [renamed_positions.get(position, name)
                             for position, name in enumerate(cbshousing2018zip.columns)]

#Select the zipcode/name columns plus the four housing metrics on each of the three levels
housing_metrics = ('Total Number of Houses', 'Average House Value', 'Houses Built Before 2000', 'Houses Built After 2000')
housing_columns = [f'{metric} {level}'
                   for level in ('Municipality', 'District', 'Neighborhood')
                   for metric in housing_metrics]
CBShousingCleanZip = cbshousing2018zip[['Postcode', 'Municipality', 'District', 'Neighborhood'] + housing_columns]

#Write to the database
CBShousingCleanZip.to_sql('CBShousingCleanZip', engine)

In [52]:
#Extract gemeentecode, wijkcode and buurtcode from age_groups and merge with zipp

#Municipality (Gemeente) codes: take the last 4, 3, 2 and 1 characters of 'wijkenenbuurten' for every
#row in age_groups whose 'soortregio_2' is 'Gemeente'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_gemeente_row = age_groups['soortregio_2'] == 'Gemeente'
age_groups_gemeente_parts = []
for code_length in (4, 3, 2, 1):
    #age_groups itself ends up with the 'gemcode' of the last iteration (1 character), as before
    age_groups['gemcode'] = age_groups['wijkenenbuurten'].str[-code_length:]
    age_groups_gemeente_parts.append(age_groups.loc[is_gemeente_row])

#Keep the per-length frames available under their original names
age_groups_gc, age_groups_gc1, age_groups_gc2, age_groups_gc3 = age_groups_gemeente_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#rename() replaces the unsupported in-place write into columns.values.
age_groups_gcc = (
    pd.concat(age_groups_gemeente_parts)[['gemcode', 'k_0tot15jaar_8', 'k_15tot25jaar_9', 'k_25tot45jaar_10',
                                          'k_45tot65jaar_11', 'k_65jaarofouder_12']]
    .drop_duplicates(subset='gemcode', keep="first")
    .rename(columns={
        'k_0tot15jaar_8': "0_15_years_Municipality",
        'k_15tot25jaar_9': "15_25_years_Municipality",
        'k_25tot45jaar_10': "25_45_years_Municipality",
        'k_45tot65jaar_11': "45_65_years_Municipality",
        'k_65jaarofouder_12': "65_older_years_Municipality",
    })
)

#District (Wijk) codes: take the last 6, 5, 4 and 3 characters of 'wijkenenbuurten' for every
#row in age_groups whose 'soortregio_2' is 'Wijk'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_wijk_row = age_groups['soortregio_2'] == 'Wijk'
age_groups_wijk_parts = []
for code_length in (6, 5, 4, 3):
    #age_groups itself ends up with the 'wijkcode' of the last iteration (3 characters), as before
    age_groups['wijkcode'] = age_groups['wijkenenbuurten'].str[-code_length:]
    age_groups_wijk_parts.append(age_groups.loc[is_wijk_row])

#Keep the per-length frames available under their original names
age_groups_wc, age_groups_wc1, age_groups_wc2, age_groups_wc3 = age_groups_wijk_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#rename() replaces the unsupported in-place write into columns.values.
age_groups_wcc = (
    pd.concat(age_groups_wijk_parts)[['wijkcode', 'k_0tot15jaar_8', 'k_15tot25jaar_9', 'k_25tot45jaar_10',
                                      'k_45tot65jaar_11', 'k_65jaarofouder_12']]
    .drop_duplicates(subset='wijkcode', keep="first")
    .rename(columns={
        'k_0tot15jaar_8': "0_15_years_District",
        'k_15tot25jaar_9': "15_25_years_District",
        'k_25tot45jaar_10': "25_45_years_District",
        'k_45tot65jaar_11': "45_65_years_District",
        'k_65jaarofouder_12': "65_older_years_District",
    })
)

#Neighborhood (Buurt) codes: take the last 8, 7, 6 and 5 characters of 'wijkenenbuurten' for every
#row in age_groups whose 'soortregio_2' is 'Buurt'. The result is joined with zipp later on.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_buurt_row = age_groups['soortregio_2'] == 'Buurt'
age_groups_buurt_parts = []
for code_length in (8, 7, 6, 5):
    #age_groups itself ends up with the 'buurtcode' of the last iteration (5 characters), as before
    age_groups['buurtcode'] = age_groups['wijkenenbuurten'].str[-code_length:]
    age_groups_buurt_parts.append(age_groups.loc[is_buurt_row])

#Keep the per-length frames available under their original names
age_groups_bc, age_groups_bc1, age_groups_bc2, age_groups_bc3 = age_groups_buurt_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#rename() replaces the unsupported in-place write into columns.values.
age_groups_bcc = (
    pd.concat(age_groups_buurt_parts)[['buurtcode', 'k_0tot15jaar_8', 'k_15tot25jaar_9', 'k_25tot45jaar_10',
                                       'k_45tot65jaar_11', 'k_65jaarofouder_12']]
    .drop_duplicates(subset='buurtcode', keep="first")
    .rename(columns={
        'k_0tot15jaar_8': "0_15_years_Neighborhood",
        'k_15tot25jaar_9': "15_25_years_Neighborhood",
        'k_25tot45jaar_10': "25_45_years_Neighborhood",
        'k_45tot65jaar_11': "45_65_years_Neighborhood",
        'k_65jaarofouder_12': "65_older_years_Neighborhood",
    })
)

#Inner join the datasets age_groups_gcc, age_groups_wcc and age_groups_bcc to zipp.
#A zipcode without a match on any of the three levels is dropped.
age_groups_zip = pd.merge(zipp, age_groups_gcc, left_on = 'gemeente2019', right_on = 'gemcode', how = 'inner')
age_groups_zip = pd.merge(age_groups_zip, age_groups_wcc, left_on = 'wijk2019', right_on = 'wijkcode', how = 'inner')
age_groups_zip = pd.merge(age_groups_zip, age_groups_bcc, left_on = 'buurt2019', right_on = 'buurtcode', how = 'inner')

#Overview of dataset age_groups_zip
#age_groups_zip #Containing 453185 rows, zipp has 488315 rows -> 35130 rows (postcodes) lost because of missing data.

#Select the zipcode/name columns plus the five age groups on each of the three levels
age_group_prefixes = ('0_15_years', '15_25_years', '25_45_years', '45_65_years', '65_older_years')
age_group_columns = [f'{prefix}_{level}'
                     for level in ('Municipality', 'District', 'Neighborhood')
                     for prefix in age_group_prefixes]

#Change colnames with rename(); writing into columns.values mutates an immutable Index and is not reliable
age_groups_zip_clean = age_groups_zip[['postcode', 'gemeentenaam2019', 'wijknaam_2019k_naam', 'buurtnaam_2019'] + age_group_columns].rename(columns={
    'postcode': "Postcode",
    'gemeentenaam2019': "Municipality",
    'wijknaam_2019k_naam': "District",
    'buurtnaam_2019': "Neighborhood",
})

#Write to the database
age_groups_zip_clean.to_sql('AgeGroupsZipClean', engine) #453185, 488315 rows in zipcodes -> 35130 zipcodes lost because of missing data.

#Create table with average house price per age group per municipality, district and neighborhood

In [53]:
#Attach the age-group figures of each zipcode to the houses and keep one row per listing (globalid)
housesagegroups = pd.merge(houses, age_groups_zip_clean, left_on = 'postcode', right_on = 'Postcode', how = 'inner')
housesagegroups = housesagegroups.drop_duplicates(subset='globalid', keep="first")

region_levels = ('Municipality', 'District', 'Neighborhood')

#(prefix of the age-group input column, label used in the output column name).
#The 0 - 15 group is left out on purpose, because at that age you don't buy a house.
buyer_age_groups = (
    ('15_25_years', '15-25'),
    ('25_45_years', '25-45'),
    ('45_65_years', '45-65'),
    ('65_older_years', '65+'),
)

#Total inhabitants per level = the four buyer age groups added up, left to right
for level in region_levels:
    inhabitants = housesagegroups[f'{buyer_age_groups[0][0]}_{level}']
    for prefix, label in buyer_age_groups[1:]:
        inhabitants = inhabitants + housesagegroups[f'{prefix}_{level}']
    housesagegroups[f'Total inhabitants {level}'] = inhabitants

#Average selling price in the Netherlands: one number, repeated on every row and rounded to 2 decimals.
#NOTE(review): the load cell turns koopprijs 'NULL' into 0, so those rows are part of this mean - confirm this is intended.
housesagegroups['Avg Selling Price Netherlands'] = housesagegroups['koopprijs'].mean()
housesagegroups['Avg Selling Price Netherlands'] = housesagegroups['Avg Selling Price Netherlands'].round(2)

#Avg price per age group = (age group / total inhabitants) * national average price, rounded to 2 decimals.
#Columns are created per age group for Municipality, District and Neighborhood, in that order.
for prefix, label in buyer_age_groups:
    for level in region_levels:
        age_group_share = housesagegroups[f'{prefix}_{level}'] / housesagegroups[f'Total inhabitants {level}']
        weighted_price = age_group_share * housesagegroups['Avg Selling Price Netherlands']
        housesagegroups[f'Avg Price per {label} Years {level}'] = weighted_price.round(2)

#Overview of dataset
#housesagegroups #193241 rows, houses contains 211617 rows -> 18376 rows lost becuase of missing data.

#Select useful columns: the three region names followed by the four age groups of each level
price_columns = [f'Avg Price per {label} Years {level}'
                 for level in region_levels
                 for prefix, label in buyer_age_groups]
PricePerAgeGroupMDN = housesagegroups[list(region_levels) + price_columns]

#Keep every distinct combination once
PricePerAgeGroupMDN = PricePerAgeGroupMDN.drop_duplicates(subset=None, keep="first")

#Write to the database
PricePerAgeGroupMDN.to_sql('PricePerAgeGroupInMDN', engine)

Average selling price per municipality ordered by average income

In [54]:
#Municipality (Gemeente) codes: take the last 4, 3, 2 and 1 characters of 'wijkenenbuurten' for every
#row in economic whose 'soortregio_2' is 'Gemeente'.
#The longest suffix is collected first, so its rows are the ones kept by drop_duplicates(keep="first").
is_gemeente_row = economic['soortregio_2'] == 'Gemeente'
economic_gemeente_parts = []
for code_length in (4, 3, 2, 1):
    #economic itself ends up with the 'gemcode' of the last iteration (1 character), as before
    economic['gemcode'] = economic['wijkenenbuurten'].str[-code_length:]
    economic_gemeente_parts.append(economic.loc[is_gemeente_row])

#Keep the per-length frames available under their original names
economicgc, economicgc1, economicgc2, economicgc3 = economic_gemeente_parts

#Stack all candidates, keep the usable columns, keep one row per code and give the columns English names.
#rename() replaces the unsupported in-place write into columns.values.
economicgcc = (
    pd.concat(economic_gemeente_parts)[['gemcode', 'gemeentenaam_1', 'gemiddeldinkomenperinwoner_66']]
    .drop_duplicates(subset='gemcode', keep="first")
    .rename(columns={
        'gemeentenaam_1': "Municipality",
        'gemiddeldinkomenperinwoner_66': "Average Income",
    })
)

#Merge the tables zipp and economicgcc to create a bridge towards the houses table
zippincome = pd.merge(zipp,economicgcc,left_on='gemeente2019',right_on='gemcode', how='inner')

#Join houses with zippincome and keep one row per listing (globalid)
housesincome = pd.merge(houses, zippincome, on = 'postcode', how = 'inner')
housesincome = housesincome.drop_duplicates(subset='globalid', keep="first")

#Select useful columns
housesincome = housesincome[['Municipality', 'Average Income', 'koopprijs']]

#Group by municipality and average income, calculate the mean of koopprijs, give that column its
#English name and sort by average income (highest first).
#rename() replaces the unsupported in-place write into columns.values.
#NOTE(review): the load cell turns koopprijs 'NULL' into 0, so those rows are part of the mean - confirm this is intended.
MuniAvgPriceIncome = (
    housesincome.groupby(['Municipality', 'Average Income']).mean().reset_index()
    .rename(columns={'koopprijs': "Average Selling Price"})
    .sort_values(by=['Average Income'], ascending=False)
)

#Write to the database
MuniAvgPriceIncome.to_sql('AvgPriceMunicipalityAvgIncome', engine)

Percentage change in average selling price per municipality, district and neighborhood

In [69]:
#Create new dataframe which groups by municipality and selling month and calculates the mean selling price per month.
#NOTE(review): SellingMonth only holds the month number, so sales from different years end up
#in the same month - confirm the data covers a single year.
MunicMonthlyAvg = (
    housescomplete[['gemeentenaam2019', 'SellingMonth', 'koopprijs']]
    .groupby(['gemeentenaam2019', 'SellingMonth'])
    .mean()
    .reset_index()
    #Change column names to english (by name; columns.values must not be edited in place)
    .rename(columns={
        'gemeentenaam2019': 'Municipality',
        'SellingMonth': 'Month',
        'koopprijs': 'Average Selling Price',
    })
)

#Get month names from month numbers. The rows stay in month-number order from the groupby.
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
MunicMonthlyAvg['Month'] = MunicMonthlyAvg['Month'].replace(month_names)

#Percentage change to the previous month, calculated within each municipality.
#Without the groupby the first month of a municipality would be compared with the last
#month of the municipality listed above it.
MunicMonthlyAvg['Percentage Change'] = (
    MunicMonthlyAvg.groupby('Municipality')['Average Selling Price'].pct_change() * 100
).round(2)

#Remove NA (the first month of every municipality has no previous month)
MunicMonthlyAvg['Percentage Change'] = MunicMonthlyAvg['Percentage Change'].fillna(0)

#Round Avg Selling Price to 2 decimals
MunicMonthlyAvg['Average Selling Price'] = MunicMonthlyAvg['Average Selling Price'].round(2)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
MunicMonthlyAvg.to_sql('MunicMonthlyAvgPctChange', engine, if_exists='replace')



#Create new dataframe which groups by municipality, district and selling month and calculates the mean selling price per month.
#NOTE(review): SellingMonth only holds the month number, so sales from different years end up
#in the same month - confirm the data covers a single year.
DistMonthlyAvg = (
    housescomplete[['wijknaam_2019k_naam', 'gemeentenaam2019', 'SellingMonth', 'koopprijs']]
    #Change column names to english (by name; columns.values must not be edited in place)
    .rename(columns={
        'wijknaam_2019k_naam': 'District',
        'gemeentenaam2019': 'Municipality',
        'SellingMonth': 'Month',
        'koopprijs': 'Average Selling Price',
    })
    .groupby(['District', 'Municipality', 'Month'])
    .mean()
    .reset_index()
)

#Get month names from month numbers. The rows stay in month-number order from the groupby.
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
DistMonthlyAvg['Month'] = DistMonthlyAvg['Month'].replace(month_names)

#Percentage change to the previous month, calculated within each district of a municipality.
#Without the groupby the first month of a district would be compared with the last month
#of the district listed above it.
DistMonthlyAvg['Percentage Change'] = (
    DistMonthlyAvg.groupby(['District', 'Municipality'])['Average Selling Price'].pct_change() * 100
).round(2)

#Remove NA (the first month of every district has no previous month)
DistMonthlyAvg['Percentage Change'] = DistMonthlyAvg['Percentage Change'].fillna(0)

#Round Avg Selling Price to 2 decimals
DistMonthlyAvg['Average Selling Price'] = DistMonthlyAvg['Average Selling Price'].round(2)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
DistMonthlyAvg.to_sql('DistMonthlyAvgPctChange', engine, if_exists='replace')



#Create new dataframe which groups by municipality, district, neighborhood and selling month and calculates the mean selling price per month.
#NOTE(review): SellingMonth only holds the month number, so sales from different years end up
#in the same month - confirm the data covers a single year.
BuurtMonthlyAvg = (
    housescomplete[['buurtnaam_2019', 'wijknaam_2019k_naam', 'gemeentenaam2019', 'SellingMonth', 'koopprijs']]
    #Change column names to english (by name; columns.values must not be edited in place)
    .rename(columns={
        'buurtnaam_2019': 'Neighborhood',
        'wijknaam_2019k_naam': 'District',
        'gemeentenaam2019': 'Municipality',
        'SellingMonth': 'Month',
        'koopprijs': 'Average Selling Price',
    })
    .groupby(['Neighborhood', 'District', 'Municipality', 'Month'])
    .mean()
    .reset_index()
)

#Get month names from month numbers. The rows stay in month-number order from the groupby.
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
BuurtMonthlyAvg['Month'] = BuurtMonthlyAvg['Month'].replace(month_names)

#Percentage change to the previous month, calculated within each neighborhood.
#Without the groupby the first month of a neighborhood would be compared with the last
#month of the neighborhood listed above it.
BuurtMonthlyAvg['Percentage Change'] = (
    BuurtMonthlyAvg
    .groupby(['Neighborhood', 'District', 'Municipality'])['Average Selling Price']
    .pct_change() * 100
).round(2)

#Remove NA (the first month of every neighborhood has no previous month)
BuurtMonthlyAvg['Percentage Change'] = BuurtMonthlyAvg['Percentage Change'].fillna(0)

#Round Avg Selling Price to 2 decimals
BuurtMonthlyAvg['Average Selling Price'] = BuurtMonthlyAvg['Average Selling Price'].round(2)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
BuurtMonthlyAvg.to_sql('BuurtMonthlyAvgPctChange', engine, if_exists='replace')

Month-to-month difference in median selling price per municipality, district and neighborhood

In [74]:
#Create new dataframe which groups by municipality and selling month and calculates the median selling price per month.
#NOTE(review): SellingMonth only holds the month number, so sales from different years end up
#in the same month - confirm the data covers a single year.
MunicMonthlyMedian = (
    housescomplete[['gemeentenaam2019', 'SellingMonth', 'koopprijs']]
    .groupby(['gemeentenaam2019', 'SellingMonth'])
    .median()
    .reset_index()
    #Change column names to english (by name; columns.values must not be edited in place)
    .rename(columns={
        'gemeentenaam2019': 'Municipality',
        'SellingMonth': 'Month',
        'koopprijs': 'Median Selling Price',
    })
)

#Get month names from month numbers. The rows stay in month-number order from the groupby.
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
MunicMonthlyMedian['Month'] = MunicMonthlyMedian['Month'].replace(month_names)

#Difference to the previous month, calculated within each municipality.
#Without the groupby the first month of a municipality would be compared with the last
#month of the municipality listed above it.
MunicMonthlyMedian['Difference Previous Month'] = (
    MunicMonthlyMedian.groupby('Municipality')['Median Selling Price'].diff().round(2)
)

#Remove NA (the first month of every municipality has no previous month)
MunicMonthlyMedian['Difference Previous Month'] = MunicMonthlyMedian['Difference Previous Month'].fillna(0)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
MunicMonthlyMedian.to_sql('MunicMonthlyMedianDiff', engine, if_exists='replace')


#Create new dataframe which groups by municipality, district and selling month and calculates the median selling price per month.
#NOTE(review): SellingMonth only holds the month number, so sales from different years end up
#in the same month - confirm the data covers a single year.
DistMonthlyMedian = (
    housescomplete[['wijknaam_2019k_naam', 'gemeentenaam2019', 'SellingMonth', 'koopprijs']]
    #Change column names to english (by name; columns.values must not be edited in place)
    .rename(columns={
        'wijknaam_2019k_naam': 'District',
        'gemeentenaam2019': 'Municipality',
        'SellingMonth': 'Month',
        'koopprijs': 'Median Selling Price',
    })
    .groupby(['District', 'Municipality', 'Month'])
    .median()
    .reset_index()
)

#Get month names from month numbers. The rows stay in month-number order from the groupby.
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
DistMonthlyMedian['Month'] = DistMonthlyMedian['Month'].replace(month_names)

#Difference to the previous month, calculated within each district of a municipality.
#Without the groupby the first month of a district would be compared with the last month
#of the district listed above it.
DistMonthlyMedian['Difference Previous Month'] = (
    DistMonthlyMedian.groupby(['District', 'Municipality'])['Median Selling Price'].diff().round(2)
)

#Remove NA (the first month of every district has no previous month)
DistMonthlyMedian['Difference Previous Month'] = DistMonthlyMedian['Difference Previous Month'].fillna(0)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
DistMonthlyMedian.to_sql('DistMonthlyMedianDiff', engine, if_exists='replace')



#Create new dataframe which groups by municipality, district, neighborhood and selling month and calculates the median selling price per month.
#NOTE(review): SellingMonth only holds the month number, so sales from different years end up
#in the same month - confirm the data covers a single year.
BuurtMonthlyMedian = (
    housescomplete[['buurtnaam_2019', 'wijknaam_2019k_naam', 'gemeentenaam2019', 'SellingMonth', 'koopprijs']]
    #Change column names to english (by name; columns.values must not be edited in place)
    .rename(columns={
        'buurtnaam_2019': 'Neighborhood',
        'wijknaam_2019k_naam': 'District',
        'gemeentenaam2019': 'Municipality',
        'SellingMonth': 'Month',
        'koopprijs': 'Median Selling Price',
    })
    .groupby(['Neighborhood', 'District', 'Municipality', 'Month'])
    .median()
    .reset_index()
)

#Get month names from month numbers. The rows stay in month-number order from the groupby.
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
BuurtMonthlyMedian['Month'] = BuurtMonthlyMedian['Month'].replace(month_names)

#Difference to the previous month, calculated within each neighborhood.
#Without the groupby the first month of a neighborhood would be compared with the last
#month of the neighborhood listed above it.
BuurtMonthlyMedian['Difference Previous Month'] = (
    BuurtMonthlyMedian
    .groupby(['Neighborhood', 'District', 'Municipality'])['Median Selling Price']
    .diff()
    .round(2)
)

#Remove NA (the first month of every neighborhood has no previous month)
BuurtMonthlyMedian['Difference Previous Month'] = BuurtMonthlyMedian['Difference Previous Month'].fillna(0)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
BuurtMonthlyMedian.to_sql('BuurtMonthlyMedianDiff', engine, if_exists='replace')

Price per population density for municipality and district

In [78]:
#Perform join population_density_zip_clean and houses
housespopdensity = pd.merge(houses, population_density_zip_clean, left_on='postcode', right_on='Postcode', how='inner')
housespopdensity['koopprijs'] = pd.to_numeric(housespopdensity['koopprijs'])

#Size of the joined dataset as recorded by the author: 206115 rows, houses contains 211617 rows
#-> 5502 rows lost because of missing data.

#Decide on bins for municipality
#first have a look into the difference between min and max of density
maxValue = population_density_zip_clean['population_density_Municipality'].max()
minValue = population_density_zip_clean['population_density_Municipality'].min()
print(maxValue, minValue)

#difference between bins = (difference min and max / number of group_names) = 2135
#The first edge (55) lies just below the printed minimum, so the lowest value falls inside the first bin
binsmunc = [55, 2191, 4326, 6459]

#names for the three groups
group_names_munc = ['low density', 'medium density', 'high density']

#finally discretize density Municipality
housespopdensity['density_group_M'] = pd.cut(
    housespopdensity['population_density_Municipality'],
    binsmunc,
    labels=group_names_munc,
)

#Decide on bins for district
#first have a look into the difference between min and max of density
maxValue = population_density_zip_clean['population_density_District'].max()
minValue = population_density_zip_clean['population_density_District'].min()
print(maxValue, minValue)

#difference between bins = (difference min and max / number of group_names) = 9380
binsdist = [0, 9380, 18760, 28139]

#names for the three groups
group_names_dist = ['low density', 'medium density', 'high density']

#finally discretize density District
#The bins are open on the left by default, and the first edge equals the printed minimum (0).
#include_lowest=True puts districts with density 0 into 'low density' instead of leaving them without a group.
housespopdensity['density_group_D'] = pd.cut(
    housespopdensity['population_density_District'],
    binsdist,
    labels=group_names_dist,
    include_lowest=True,
)



#Calculation Avg. price per density group per Municipality
Avg_munic_density = (
    housespopdensity[['density_group_M', 'Municipality', 'koopprijs']]
    .groupby(['Municipality', 'density_group_M'])
    .mean()
    .reset_index()
    #Drop municipality/density combinations that have no average price
    .dropna()
    #Change column names to english. Renaming by name replaces the positional writes into
    #columns.values, which assigned index 1 twice and edited the Index in place.
    .rename(columns={
        'density_group_M': 'Density Group',
        'koopprijs': 'Average Selling Price Municipality',
    })
)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
Avg_munic_density.to_sql('AvgPricePopDensityMunic', engine, if_exists='replace')



#Calculation Avg. price per density group per District
Avg_dist_density = (
    housespopdensity[['density_group_D', 'District', 'Municipality', 'koopprijs']]
    .groupby(['District', 'Municipality', 'density_group_D'])
    .mean()
    .reset_index()
    #Drop district/density combinations that have no average price
    .dropna()
    #Change column names to english (by name; columns.values must not be edited in place).
    #'District' and 'Municipality' already carry their final names.
    .rename(columns={
        'density_group_D': 'Density Group',
        'koopprijs': 'Average Selling Price District',
    })
)

#Write to database; if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once the table exists)
Avg_dist_density.to_sql('AvgPricePopDensityDist', engine, if_exists='replace')


6459 56
28139 0
