## import libraries

In [None]:
# import modules
import geopandas as gpd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
import pandas as pd
import geopandas as gpd
from scipy.stats import zscore
import rasterio as rio

In [None]:
# Enable matplotlib's interactive mode for the figures created in this notebook.
plt.ion()

# 1)  Social Indicator

Load and clean the data

In [None]:
# Input files for the social indicator, read from the local machine.
admin_path = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data/Stadtteile_Hamburg.shp"
stats_path = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data/statistics_HH21.csv"

# District boundaries (shapefile) and the statistics table (semicolon-separated CSV).
admin = gpd.read_file(admin_path)
stats = pd.read_csv(
    stats_path,
    encoding="utf-8",
    delimiter=";",
    decimal=".",
)
#admin.head()


In [None]:
# English column names for the statistics table. The assignment is positional,
# so the order of this list must match the column order of the CSV.
english_columns = [
    'district',
    'inhabitants',
    'pop<18',
    '%<18',
    'pop>65y',
    '%>65',
    'foreignResidents',
    'migration_backg',
    'hh',
    'pp_size',
    'hh_kids',
    '%hh_kids',
    'areakm2',
    'pop_density',
    'working_pop',
    '%working_pop',
    'unemployed',
    '%unemployed',
    'unemployed<18',
    '%unemployed<18',
    'unemployed>65',
    '%unemployed>65',
    'social_benefits',
    '%social_benefits',
    'social_housing',
    '%social_housing',
]
stats.columns = english_columns
#stats


In [None]:
# Attach the statistics table to the administrative boundaries. The inner join
# keeps only districts whose name occurs in both datasets.
admin_pop = pd.merge(
    admin,
    stats,
    left_on='stadtteil_',
    right_on='district',
    how='inner',
)

# Columns that are not needed for the rest of the analysis.
unused_admin_columns = [
    'OBJECTID', 'bezirk', 'stadttei_1', 'stadttei_2',
    'pp_size', '%<18', 'hh', 'foreignResidents', 'migration_backg',
    'pop<18', 'hh_kids', '%hh_kids', 'pop>65y', 'unemployed<18',
]
admin_pop = admin_pop.drop(columns=unused_admin_columns)

#admin_pop

In [None]:
# Keep only the numeric percentage columns that describe social status and
# discard rows with a missing value in any of them, so the statistics can be computed.
social_columns = ['%unemployed', '%social_benefits', '%social_housing', '%unemployed>65']
social_stats = admin_pop[social_columns].dropna()

# Inspect the distribution (histograms) and the standard deviation of each column.
social_stats.hist()
social_stats.std()

In [None]:
# Standardise each indicator column with scipy's zscore so the columns become comparable.
indicator_columns = ['%unemployed', '%social_benefits', '%social_housing', '%unemployed>65']
z_scores_admin_pop = social_stats[indicator_columns].apply(zscore)

# Standard deviation of every column after the z-score transformation.
z_score_spread = z_scores_admin_pop.std()
print(z_score_spread)

#### Classification of social statistics layer

In [None]:
# Composite social score: the four z-scores are added with equal weight.
composite_score = (
    z_scores_admin_pop['%unemployed']
    + z_scores_admin_pop['%social_benefits']
    + z_scores_admin_pop['%social_housing']
    + z_scores_admin_pop['%unemployed>65']
)

# Split the composite score into five equally populated classes, labelled
# 0 (lowest 20 %) to 4 (highest 20 %), and store the class in a new column.
quintile_labels = [0, 1, 2, 3, 4]
z_scores_admin_pop['qt_soc_stats'] = pd.qcut(composite_score, 5, labels=quintile_labels)


In [None]:
#The function adds a text column to the dataframe based on the classification into percentiles
#The function will be used for all of the different indicators)
def add_status_column(df, zscore_mean_column):
    """Add a text ``'status'`` column describing the class code of each row.

    Parameters
    ----------
    df : DataFrame
        Table to label. It is modified in place; nothing is returned.
    zscore_mean_column : str
        Name of the column in ``df`` that holds the class codes 0-4.

    Notes
    -----
    Codes 0, 1, 2, 3 and 4 become 'very low', 'low', 'medium', 'high' and
    'very high'. Any other value, in particular a missing class code, is
    stored as ``None`` instead of being reported as 'very high'.
    """
    status_by_class = {
        0: 'very low',
        1: 'low',
        2: 'medium',
        3: 'high',
        4: 'very high',
    }
    # dict.get returns None for NaN or unexpected codes, so a row without a
    # class never receives a misleading label.
    df['status'] = [status_by_class.get(x) for x in df[zscore_mean_column]]

# Label every row of the social indicator with the text for its quantile class.
add_status_column(df=z_scores_admin_pop, zscore_mean_column='qt_soc_stats')

In [None]:
z_scores_admin_pop.head() # previews the first rows to check that the new columns were added

In [None]:
# Bring the z-scores, quantile class and status back onto the district data so
# they can be plotted later. z_scores_admin_pop was derived from admin_pop
# without resetting the index, so the join is index-to-index.
admin_socstats = pd.merge(
    admin_pop,
    z_scores_admin_pop,
    left_index=True,
    right_index=True,
)
admin_socstats.head()

# 2) Environmental Indicators
Uses noise and green areas (surface temperature is still an open question) as environmental indicators.

In [None]:
# Input layers for the environmental indicators, read from the local machine.
noise_path = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data/Laermkarten_HH_2018-11-19.shp"
buildings_path = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data/Gebaeude_Hamburg.shp"
green_areas_path = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data/Oeffentliche_Gruenanlage_Hamburg.shp"

noise = gpd.read_file(noise_path)
buildings = gpd.read_file(buildings_path)
green_areas = gpd.read_file(green_areas_path)

In [None]:
# the function checks the crs of different layers and reprojects them to the target EPSG for Hamburg, GE(25832) for spatial operations
def reproject_to_local_epsg(gdf, epsg=25832):
    """Print the CRS of a layer and return it reprojected to ``epsg``.

    Parameters
    ----------
    gdf : GeoDataFrame
        Layer to reproject. Only its ``crs`` attribute and ``to_crs`` method
        are used; the input itself is not reassigned.
    epsg : int, optional
        Target EPSG code. Defaults to 25832, the code this notebook uses for
        all spatial operations on the Hamburg data.

    Returns
    -------
    The object returned by ``gdf.to_crs(epsg=epsg)``.
    """
    # Report the CRS before and after so mismatching layers are easy to spot.
    print('Original CRS:', gdf.crs)

    gdf_reproj = gdf.to_crs(epsg=epsg)

    print('Reprojected CRS:', gdf_reproj.crs)

    return gdf_reproj

# Reproject every input layer to the common CRS, in the same order as the
# names on the left-hand side.
admin_pop_reproj, noise_reproj, housing_reproj, green_areas_reproj = [
    reproject_to_local_epsg(layer)
    for layer in (admin_pop, noise, buildings, green_areas)
]


## a)  Green areas - access to and distribution
Analyses the distribution of green areas per district and the green area per inhabitant.

In [None]:
#green_areas_reproj.columns

In [None]:
# Remove features with usage code 10 (playground according to the ALKIS key,
# the official cadastre information system).
# NOTE(review): the comparison assumes 'nutzung' is stored as a number - confirm.
is_not_playground = green_areas_reproj.nutzung != 10
green_areas_reproj = green_areas_reproj[is_not_playground]

# Columns that are not needed for the green-area statistics.
unused_green_columns = [
    'veroeffent', 'nutzung', 'nutz_code', 'herrichtun', 'gesamtanla',
    'aktualitae', 'idnr', 'belegenh_1', 'belegenhei', 'quelle_dat', 'stand',
]
green_areas_reproj = green_areas_reproj.drop(columns=unused_green_columns)

green_areas_reproj.head()

#### Calculation of statistics of green areas at district level

In [None]:
# All three statistics are computed per district (district = 'stadtteil').
per_district = green_areas_reproj.groupby(['stadtteil'])

# Total green area per district.
green_area_sum = (
    per_district['flaeche_ha']
    .sum()
    .reset_index()
    .rename(columns={'flaeche_ha': 'green_area_total_ha'})
)

# Mean size of a green space per district.
green_area_mean = (
    per_district['flaeche_ha']
    .mean()
    .reset_index()
    .rename(columns={'flaeche_ha': 'green_area_mean_ha'})
)

# Number of green spaces per district ('benennung' = name of the green area).
green_space_count = (
    per_district['benennung']
    .count()
    .reset_index()
    .rename(columns={'benennung': 'green_space_count'})
)

# Combine the three statistics into one table keyed by 'stadtteil'.
green_stats = green_area_sum.merge(green_area_mean, on='stadtteil')
green_stats = green_stats.merge(green_space_count, on='stadtteil')


In [None]:
# Percentage of each district's total area that is green space.
# The previous expression divided per-parcel columns of green_areas_reproj and
# aligned them by row index onto the district-level green_stats table, so the
# values did not describe districts at all.
# District area comes from the reprojected boundaries; EPSG 25832 is metric,
# so geometry.area is in square metres and 10,000 m2 = 1 ha.
# NOTE(review): assumes 'flaeche_ha' (and therefore 'green_area_total_ha') is in hectares - confirm.
district_area_ha = (
    admin_pop_reproj.geometry.area
    .groupby(admin_pop_reproj['stadtteil_'])
    .sum()
    / 10_000
)
green_stats['perc_green_area'] = (
    green_stats['green_area_total_ha']
    / green_stats['stadtteil'].map(district_area_ha)
    * 100
)
green_stats.head()

In [None]:
# Add the number of inhabitants of each district to green_stats. The left join
# keeps every row of green_stats, and the duplicate key column is removed afterwards.
inhabitants_by_district = admin_pop_reproj[['stadtteil_', 'inhabitants']]
green_stats = pd.merge(
    green_stats,
    inhabitants_by_district,
    left_on='stadtteil',
    right_on='stadtteil_',
    how='left',
)
green_stats = green_stats.drop(columns='stadtteil_')

In [None]:
# Green area per inhabitant: total green area divided by the number of inhabitants.
# The previous expression divided inhabitants by area, which is the inverse
# (inhabitants per unit of green area) and reversed the later classification.
# Districts with zero inhabitants get a missing value instead of an infinite ratio.
inhabitants = green_stats['inhabitants'].where(green_stats['inhabitants'] != 0)
green_stats['area_per_inhbt'] = green_stats['green_area_total_ha'] / inhabitants
green_stats.head()

In [None]:
# Shows the standard deviation of the two green-area measures.
green_stats[['area_per_inhbt','perc_green_area' ]].std()


#### Classification of green area layer

In [None]:
# the function classifies the statistics into 5 equal percentiles. The function will be used for all the environmental variables 
# uses the same approach as for the social indicator: quantiles 0-4 (e.g. 0= lowest 20th quantile of mean / to  - 4 within highest 20th quantile)
def percentile_5(gpd, col_name, percentiles):
    """Classify a column into equally populated classes and store the class code.

    Parameters
    ----------
    gpd : DataFrame
        Table to classify. It is modified in place and also returned.
        Note that inside this function the name hides the geopandas alias.
    col_name : str
        Column used for the classification. The result is written to
        ``col_name + '_percentile'``.
    percentiles : int or list-like
        Passed to ``pandas.qcut``: either the number of classes or the list of
        quantile edges.

    Returns
    -------
    The same ``gpd`` object with the new column added.

    Notes
    -----
    Class codes run from 0 (lowest class) upwards. The labels are derived
    from ``percentiles`` so that values other than 5 no longer fail with a
    label-count mismatch; 5 still yields the codes 0-4.
    """
    if pd.api.types.is_integer(percentiles):
        n_classes = percentiles
    else:
        # A list of quantile edges describes one class fewer than it has edges.
        n_classes = len(percentiles) - 1

    gpd[col_name + '_percentile'] = pd.qcut(
        gpd[col_name], percentiles, labels=list(range(n_classes))
    )

    return gpd

In [None]:
# Classify green_stats into five equally populated classes based on the 'area_per_inhbt' column.
percentile_5(green_stats, col_name='area_per_inhbt', percentiles=5)

In [None]:
# Translate the class code in 'area_per_inhbt_percentile' into its text description.
add_status_column(df=green_stats, zscore_mean_column='area_per_inhbt_percentile')
green_stats.head()

In [None]:
# Attach the green-area statistics to the reprojected district boundaries so
# they can be plotted later.
# The two tables are matched on the district name. green_stats is ordered by
# the grouped district names while admin_pop_reproj keeps the order of the
# boundary file, so matching on the row index would pair unrelated districts.
admin_greenstats = pd.merge(
    admin_pop_reproj,
    green_stats,
    left_on='stadtteil_',
    right_on='stadtteil',
    how='inner',
)
admin_greenstats.head()

## b) Noise Pollution 
For the noise indicator, the assessment identifies the residential building area affected by noise, broken down by noise class and district.

Analysis, filter and cleaning of the Housing Dataset

In [None]:
#housing_reproj.columns
# 'gebaeudefu' indicates the usage of a building; store it as integers so it
# can be compared with numeric codes.
building_usage = housing_reproj['gebaeudefu'].astype(int)
housing_reproj['gebaeudefu'] = building_usage

# Columns that are not needed for the noise analysis.
unused_building_columns = ['anzahlDerU', 'lageZurErd', 'dachart', 'SHAPE_Leng']
housing_reproj = housing_reproj.drop(columns=unused_building_columns)


In [None]:
# Keep only residential buildings, identified by the ALKIS keys 1000 and 1010.
residential_codes = [1000, 1010]
is_residential = housing_reproj['gebaeudefu'].isin(residential_codes)
housing_reproj = housing_reproj[is_residential]
#housing_reproj.head()

Analysis, filter and cleaning of the Noise Dataset

In [None]:
# Previews the noise layer, a classified shapefile: the 'name' column holds the
# severity of the noise from 0 (low) to 4 (highest).
noise_reproj.head()

Joins the Housing Data set to the Noise Data set

In [None]:
# Intersect the residential buildings with the noise layer. The result holds
# the parts of the building layer that overlap a noise polygon, with the
# attributes of both layers.
houses_noise = gpd.overlay(
    df1=housing_reproj,
    df2=noise_reproj,
    how='intersection',
)
#houses_noise.head()

In [None]:
# English column names: noise class, number of floors and house footprint area.
english_names = {
    'name': 'noiseclass',
    'anzahlDerO': 'floors',
    'grundflaec': 'house_area',
}
houses_noise = houses_noise.rename(columns=english_names)

In [None]:
# Spatially join the districts with the noise-affected houses so every house
# row carries the attributes of its district (needed for plotting later).
admin_noisestats = gpd.sjoin(left_df=admin_pop_reproj, right_df=houses_noise)

In [None]:
#admin_noisestats.columns

#### Calculation of Statistics

In [None]:
# Number of houses for every combination of district and noise class.
district_class_groups = admin_noisestats.groupby(['district', 'noiseclass'])
houses_noiseclass = district_class_groups['OBJECTID'].count().reset_index()
houses_noiseclass.head()

In [None]:
# Footprint area of each affected house.
# Every row of admin_noisestats is one house record from the spatial join, so
# the house count per row is 1. The previous expression multiplied the area by
# 'OBJECTID', which inflated every value by an arbitrary factor.
# NOTE(review): 'OBJECTID' is assumed to be a record identifier, not a count - confirm.
admin_noisestats['total_area'] = pd.to_numeric(admin_noisestats['house_area'])

In [None]:
# Affected residential floor area: house area multiplied by the number of floors.
floor_count = admin_noisestats['floors'].astype(int)
admin_noisestats['area_floors'] = admin_noisestats['total_area'] * floor_count

In [None]:
#admin_noisestats['total_area_floors'].hist()
#admin_noisestats.head()

In [None]:
# Store the noise class as integers so it can be looked up in the weight table.
noiseclass_as_int = admin_noisestats['noiseclass'].astype(int)
admin_noisestats['noiseclass'] = noiseclass_as_int

In [None]:
# Weight per noise class: severity rises from low to high in steps of 0.5.
weights = {
    0: 0.5,
    1: 1,
    2: 1.5,
    3: 2,
    4: 2.5,
}
# Look up the weight of every row's noise class and scale the floor area with it.
class_weight = admin_noisestats['noiseclass'].map(weights)
admin_noisestats['weighted_area'] = admin_noisestats['area_floors'] * class_weight


In [None]:
# Sum of the weighted area per district.
weighted_by_district = admin_noisestats.groupby('district')['weighted_area']
noisestats_weighted = weighted_by_district.sum().reset_index()
#admin_noisestats.head()

In [None]:
# Classify the district totals into five equally populated classes based on 'weighted_area'.
percentile_5(noisestats_weighted, col_name='weighted_area', percentiles=5)

In [None]:
# Translate the class code in 'weighted_area_percentile' into its text description.
add_status_column(df=noisestats_weighted, zscore_mean_column='weighted_area_percentile')
noisestats_weighted.head()

In [None]:
# Attach the district-level result (weighted area, class and status) to the
# joined noise data to prepare for plotting.
# The tables are matched on 'district'. noisestats_weighted has one row per
# district while admin_noisestats has one row per house, so matching on the
# row index would attach the status of an unrelated district.
# Merging on the key also keeps a single 'district' column, which the plot
# function reads, so no rename is needed afterwards. 'weighted_area' exists in
# both tables and is still suffixed: _x is the per-house value, _y the district total.
noisestats_merged = pd.merge(
    admin_noisestats,
    noisestats_weighted,
    on='district',
    how='inner',
)
noisestats_merged.head()


In [None]:
#saves the final noise pollution statistics as a csv file into the output folder on GitHub

In [None]:
# Previews the first rows of the merged noise statistics.
noisestats_merged.head()


## Visualizations of Indicators
Create maps and plots for the different indicators

Saves the data frames with the calculated indicators as CSV files in the output folder of the GitHub repository.

In [None]:
# Saves the final data frames as CSV files in the output folder of the repository.
from pathlib import Path

# to_csv does not create missing folders, so make sure the target exists first.
Path("./output").mkdir(parents=True, exist_ok=True)

admin_socstats.to_csv("./output/social_stats.csv")
noisestats_merged.to_csv("./output/noise_stats.csv")
admin_greenstats.to_csv("./output/green_stats.csv")

In [None]:
# function to define parameters for plotting od indicators
def plot_stats(gdf, title, color):
    """Draw a map of the ``'status'`` column with one label per district.

    Parameters
    ----------
    gdf : GeoDataFrame
        Layer to plot. It must provide the columns ``'status'`` and
        ``'district'`` and a geometry.
    title : str
        Title shown above the map.
    color : str
        Name of the matplotlib colormap used for the status classes.

    Notes
    -----
    The status classes are passed to the plot in order of severity. Left to
    itself, a text column is ordered alphabetically ('high', 'low', 'medium',
    ...), so a sequential colormap would not follow the ranking.
    """
    status_order = ['very low', 'low', 'medium', 'high', 'very high']

    # Create a figure and axis object with a size of 10x10 inches
    fig, ax = plt.subplots(figsize=(10, 10))

    # Plot the status classes and show the legend
    gdf.plot(
        "status",
        cmap=color,
        legend=True,
        ax=ax,
        categorical=True,
        categories=status_order,
    )

    # The legend can be missing when nothing was drawn
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title("Status")

    ax.set_title(title, fontweight="bold")

    # Write each district name once, even if the layer has several rows per district
    for _, row in gdf.drop_duplicates(subset='district').iterrows():
        geometry = row.geometry
        if geometry is None or geometry.is_empty:
            continue  # nothing to anchor the label to
        ax.annotate(
            text=row['district'],
            xy=geometry.centroid.coords[0],
            ha='center',
            color="indigo",
            size=6,
        )

    # Turn off the axis labels
    ax.set_axis_off()

    # Display the plot
    plt.show()

In [None]:
# Map of the noise pollution classes.
plot_stats(noisestats_merged, title="Noise Pollution - Percentiles", color="Blues")


In [None]:
# Map of the social status classes.
plot_stats(admin_socstats, title="Social Status - Percentiles", color="Oranges")


In [None]:
#plot_stats(admin_greenstats, "Green Areas per Inhabitant - Percentiles", "YlGn")