## import libraries

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
import pandas as pd
import geopandas as gpd
from scipy.stats import zscore

In [None]:
# Switch matplotlib to interactive mode (plt.ion) for the plots created below.
plt.ion()

# 1)  Social Indicator

### Load & clean data 

In [None]:
# Load the input data: the district (Stadtteil) boundary shapefile and the
# semicolon-separated district statistics table.
# Both inputs live in the same folder, so the folder is named once; point
# DATA_DIR at the local copy of the project's data directory.
DATA_DIR = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data"

admin = gpd.read_file(f"{DATA_DIR}/Stadtteile_Hamburg.shp")
stats = pd.read_csv(
    f"{DATA_DIR}/statistics_HH21.csv",
    encoding="utf-8",
    delimiter=";",
    decimal=".",
)
#admin
#print(stats.head())

In [None]:
# Give the statistics table short English column names.
# The assignment is positional: the list must contain one name per column of
# `stats`, in the order in which the columns appear in the CSV file.
stats_column_names = [
    'district',
    'inhabitants',
    'pop<18',
    '%<18',
    'pop>65y',
    '%>65',
    'foreignResidents',
    'migration_backg',
    'hh',
    'pp_size',
    'hh_kids',
    '%hh_kids',
    'areakm2',
    'pop_density',
    'working_pop',
    '%working_pop',
    'unemployed',
    '%unemployed',
    'unemployed<18',
    '%unemployed<18',
    'unemployed>65',
    '%unemployed>65',
    'social_benefits',
    '%social_benefits',
    'social_housing',
    '%social_housing',
]
stats.columns = stats_column_names
#stats


In [None]:
# Attach the district statistics to the administrative boundaries (matching
# 'stadtteil_' in the shapefile with 'district' in the table; inner join), then
# remove the columns that are not needed further on.
admin_pop = pd.merge(
    admin,
    stats,
    how='inner',
    left_on='stadtteil_',
    right_on='district',
)

unused_columns = [
    'OBJECTID', 'bezirk', 'stadttei_1', 'stadttei_2',
    'pp_size', '%<18', 'hh', 'foreignResidents', 'migration_backg',
    'pop<18', 'hh_kids', '%hh_kids', 'pop>65y', 'unemployed<18',
]
admin_pop.drop(columns=unused_columns, inplace=True)

#admin_pop
#fig, ax = plt.subplots(figsize=(24, 18))
#admin_pop.plot(ax=ax, alpha=1,linewidth=2, facecolor="none")

In [None]:
# Report the share of missing values per column (in percent), then drop the
# rows that lack any of the four indicators used to calculate the social status.
# The report is printed explicitly: a bare expression that is not the last
# statement of a notebook cell is evaluated but never displayed.
print(admin_pop.isnull().mean() * 100)
admin_pop.dropna(
    subset=['%unemployed', '%social_benefits', '%social_housing', '%unemployed>65'],
    inplace=True,
)

#admin_pop.dropna(inplace=True)
#admin_pop.plot()
admin_pop.head()

In [None]:
# Keep only the four numeric indicator columns used for the social status
# analysis, draw a histogram per column and show the standard deviations.
social_indicator_columns = [
    '%unemployed',
    '%social_benefits',
    '%social_housing',
    '%unemployed>65',
]
social_stats = admin_pop[social_indicator_columns]

social_stats.hist()
social_stats.std()

In [None]:
# Standardise each indicator column with scipy.stats.zscore so the four
# indicators are on a comparable scale, then print the resulting standard
# deviations as a check.
indicator_values = admin_pop[
    ['%unemployed', '%social_benefits', '%social_housing', '%unemployed>65']
]
z_scores_admin_pop = indicator_values.apply(zscore)

print(z_scores_admin_pop.std())

In [None]:
#z_scores_admin_pop

### Analysis

In [None]:
# Add up the four standardised indicators (all equally weighted) and classify
# the sum into five equal-sized quantile classes, labelled 0 (lowest fifth)
# to 4 (highest fifth). The class is stored in the new column 'qt_soc_stats'.
combined_social_score = (
    z_scores_admin_pop['%unemployed']
    + z_scores_admin_pop['%social_benefits']
    + z_scores_admin_pop['%social_housing']
    + z_scores_admin_pop['%unemployed>65']
)
z_scores_admin_pop['qt_soc_stats'] = pd.qcut(
    combined_social_score,
    5,
    labels=[0, 1, 2, 3, 4],
)


In [None]:
# Function to add a text column to a data frame based on the quantile class
# (it is applied to the different indicators).
def add_status_column(df, zscore_mean_column):
    """Add a textual ``status`` column describing the class in ``zscore_mean_column``.

    The data frame is modified in place (an existing ``status`` column is
    overwritten) and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the ``status`` column.
    zscore_mean_column : str
        Name of the column holding the class codes.

    Notes
    -----
    Codes 0, 1, 2 and 3 become 'very low', 'low', 'medium' and 'high'; every
    other non-missing code becomes 'very high'. Missing codes (NaN/None) stay
    missing instead of being labelled 'very high', so rows that could not be
    classified are not reported as the most extreme class.
    """
    labels = {0: 'very low', 1: 'low', 2: 'medium', 3: 'high'}
    df['status'] = [
        None if pd.isna(code) else labels.get(code, 'very high')
        for code in df[zscore_mean_column]
    ]
    
# Label the social-status classes with their text description.
add_status_column(df=z_scores_admin_pop, zscore_mean_column='qt_soc_stats')


In [None]:
# Apply the function so that 'status' holds the text label of 'qt_soc_stats'.
add_status_column(df=z_scores_admin_pop, zscore_mean_column='qt_soc_stats')

In [None]:
# Preview the first five rows of the standardised indicators and their class.
z_scores_admin_pop.head(n=5)

In [None]:
# Attach the computed social statistics to the district geometries so they can
# be mapped later. Both frames share the same row index, so the rows are
# matched on the index. Column names that occur in both frames receive the
# default '_x' (left) and '_y' (right) suffixes.
admin_socstats = admin_pop.merge(
    z_scores_admin_pop,
    left_index=True,
    right_index=True,
)
admin_socstats.head()

In [None]:
#save the data frame as csv file
#admin_socstats.to_csv("./output/social_stats.csv")

### VISUALIZATION

In [None]:
# Map the social status per district.
# 'status' is a text column; without an explicit order its categories are
# plotted alphabetically ('high', 'low', 'medium', ...), so the sequential
# "Reds" colormap would not follow severity. Passing the ordered category list
# makes the colour ramp and the legend run from 'very low' to 'very high'.
status_order = ['very low', 'low', 'medium', 'high', 'very high']
admin_socstats.plot(
    column="status",
    categorical=True,
    categories=status_order,
    cmap="Reds",
    legend=True,
)

# 2) Environmental Indicators
(uses noise, green areas and possibly surface temperature as indicators)

In [None]:
# Load the environmental layers: noise map, building footprints and public
# green areas. All three shapefiles live in the same folder, so the folder is
# named once; point DATA_DIR at the local copy of the project's data directory.
DATA_DIR = "C:/Users/charl/OneDrive/Desktop/ulster/EGM722_programming/git/egm722_project/data"

noise = gpd.read_file(f"{DATA_DIR}/Laermkarten_HH_2018-11-19.shp")
buildings = gpd.read_file(f"{DATA_DIR}/Gebaeude_Hamburg.shp")
green_areas = gpd.read_file(f"{DATA_DIR}/Oeffentliche_Gruenanlage_Hamburg.shp")

In [None]:
# Function to report the CRS of a layer and reproject it to the local CRS
# (EPSG:25832 by default) so that the layers can be joined later on.
def reproject_to_local_epsg(gdf, epsg=25832):
    """Return a copy of ``gdf`` reprojected to the given EPSG code.

    The CRS before and after the reprojection is printed so the change can be
    checked in the notebook output.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Layer to reproject. It must already have a CRS set.
    epsg : int, optional
        Target EPSG code; defaults to 25832, the code used throughout this
        notebook.

    Returns
    -------
    geopandas.GeoDataFrame
        The reprojected layer, as returned by ``gdf.to_crs``.
    """
    # Check the CRS of the input layer
    print('Original CRS:', gdf.crs)

    # Reproject to the requested EPSG code
    gdf_reproj = gdf.to_crs(epsg=epsg)

    # Confirm the CRS of the output layer
    print('Reprojected CRS:', gdf_reproj.crs)

    return gdf_reproj

# Reproject every layer with the function above; the layers are processed in
# the order districts, noise, buildings, green areas.
admin_pop_reproj, noise_reproj, housing_reproj, green_areas_reproj = [
    reproject_to_local_epsg(layer)
    for layer in (admin_pop, noise, buildings, green_areas)
]


## a) Green areas 

In [None]:
#green_areas_reproj.columns

In [None]:
# Remove the green areas whose 'nutzung' code is 10 (playground according to
# the ALKIS key of the official cadastre information system) and drop the
# columns that are not needed for the analysis.
# NOTE(review): the comparison assumes 'nutzung' holds numbers; if the
# shapefile stores it as text, nothing is filtered - confirm the dtype.
unused_green_columns = [
    'veroeffent', 'nutzung', 'nutz_code', 'herrichtun', 'gesamtanla',
    'aktualitae', 'idnr', 'belegenh_1', 'belegenhei', 'quelle_dat', 'stand',
]
not_playground = green_areas_reproj['nutzung'] != 10

# Build a new frame instead of dropping in place on the filtered selection,
# which avoids modifying a slice of the original layer.
green_areas_reproj = green_areas_reproj.loc[not_playground].drop(columns=unused_green_columns)

green_areas_reproj.head()

In [None]:
#green_areas_reproj.plot()

#### Calculation of statistics of green areas at district level

In [None]:
# Group the green areas once per district (district = 'stadtteil') and derive
# three statistics from the groups.
green_by_district = green_areas_reproj.groupby(['stadtteil'])

# Total green area per district
green_area_sum = (
    green_by_district['flaeche_ha']
    .sum()
    .reset_index()
    .rename(columns={'flaeche_ha': 'green_area_total_ha'})
)

# Mean size of a green space per district
green_area_mean = (
    green_by_district['flaeche_ha']
    .mean()
    .reset_index()
    .rename(columns={'flaeche_ha': 'green_area_mean_ha'})
)

# Number of green spaces per district ('benennung' = name of the green area;
# only non-missing names are counted)
green_space_count = (
    green_by_district['benennung']
    .count()
    .reset_index()
    .rename(columns={'benennung': 'green_space_count'})
)

# Combine the three statistics into one data frame, one row per district
green_stats = green_area_sum.merge(green_area_mean, on='stadtteil')
green_stats = green_stats.merge(green_space_count, on='stadtteil')


In [None]:
# Calculate the percentage of each district's total area that is green space.
# The district area is taken from the reprojected boundaries (EPSG:25832, so
# geometry areas are in square metres) and converted to hectares; the green
# area total is looked up per district name rather than by row position.
# NOTE(review): assumes 'flaeche_ha' is given in hectares - confirm.
district_area_ha = (
    admin_pop_reproj.geometry.area
    .groupby(admin_pop_reproj['stadtteil_'])
    .sum()
    / 10_000
)
green_stats['perc_green_area'] = (
    green_stats['green_area_total_ha']
    / green_stats['stadtteil'].map(district_area_ha)
    * 100
)
green_stats.head()

In [None]:
# Bring the number of inhabitants per district into green_stats so the green
# area per inhabitant can be calculated. Districts are matched by name
# ('stadtteil' vs. 'stadtteil_'); the duplicate key column is removed afterwards.
inhabitants_by_district = admin_pop_reproj[['stadtteil_', 'inhabitants']]
green_stats = pd.merge(
    green_stats,
    inhabitants_by_district,
    how='left',
    left_on='stadtteil',
    right_on='stadtteil_',
)
green_stats = green_stats.drop(columns='stadtteil_')

In [None]:
# Calculate the green area per inhabitant: total green area of the district
# divided by its number of inhabitants (previously the ratio was inverted and
# gave inhabitants per unit of green area).
green_stats['area_per_inhbt'] = green_stats['green_area_total_ha'] / green_stats['inhabitants']
green_stats.head()

In [None]:
#check std of relevant values
#green_stats = green_stats[['area_per_inhbt','perc_green_area' ]].std()
#green_stats.std()

In [None]:
# Classify the green area per inhabitant into five equal-sized quantile
# classes, labelled 0 (lowest fifth) to 4 (highest fifth), and store the class
# in the new column 'qt_stats_area_inhbt'.
area_per_inhabitant = green_stats['area_per_inhbt']
green_stats['qt_stats_area_inhbt'] = pd.qcut(
    area_per_inhabitant,
    5,
    labels=[0, 1, 2, 3, 4],
)


In [None]:
# Label the green-area classes with the text description, using the function
# defined earlier, and preview the result.
add_status_column(df=green_stats, zscore_mean_column='qt_stats_area_inhbt')
green_stats.head()

In [None]:
#save green stats to csv file


In [None]:
# Attach the green statistics to the reprojected district geometries so they
# can be mapped. The rows are matched on the district name: green_stats carries
# a fresh 0..n index, so an index-based merge would pair the statistics with
# the wrong districts. 'inhabitants' is dropped from green_stats first (when
# present) because the district frame already has that column.
admin_greenstats = admin_pop_reproj.merge(
    green_stats.drop(columns='inhabitants', errors='ignore'),
    how='inner',
    left_on='stadtteil_',
    right_on='stadtteil',
)
admin_greenstats.head()

### Quick Visualization green areas

In [None]:
# Map the green-area status per district.
# 'status' is a text column; without an explicit order its categories are
# plotted alphabetically, so the sequential "Greens" colormap would not follow
# the class order. Passing the ordered category list makes the colour ramp and
# the legend run from 'very low' to 'very high'.
status_order = ['very low', 'low', 'medium', 'high', 'very high']
admin_greenstats.plot(
    column="status",
    categorical=True,
    categories=status_order,
    cmap="Greens",
    legend=True,
)

## b) Noise

In [None]:
# For the noise indicator, the assessment identifies the residential buildings affected by noise and classifies them by noise class.

In [None]:
#housing_reproj.columns

In [None]:
# Convert 'gebaeudefu' (building use, e.g. commercial or residential) to
# integer so it can be compared with the numeric use codes, then remove the
# columns that are not needed for the noise analysis.
building_use_codes = housing_reproj['gebaeudefu'].astype(int)
housing_reproj['gebaeudefu'] = building_use_codes

#print(housing_reproj['gebaeudefu'].dtype)
unused_building_columns = ['anzahlDerU', 'lageZurErd', 'dachart', 'SHAPE_Leng']
housing_reproj.drop(columns=unused_building_columns, inplace=True)


In [None]:
# Keep only residential buildings, i.e. those with ALKIS use code 1000 or 1010.
residential_codes = [1000, 1010]
is_residential = housing_reproj['gebaeudefu'].isin(residential_codes)
housing_reproj = housing_reproj[is_residential]
#housing_reproj.head()

In [None]:
# Preview the noise data set, a classified shapefile: the 'name' column holds
# the severity of the noise, from 0 (low) to 4 (highest).
noise_reproj.head(n=5)

In [None]:
# Intersect the residential buildings with the noise layer (geopandas overlay,
# 'intersection') to find the buildings affected by noise; each result row
# carries the attributes of both the building and the noise polygon.
houses_noise = gpd.overlay(
    df1=housing_reproj,
    df2=noise_reproj,
    how='intersection',
)
#houses_noise.head()

In [None]:
# Give the relevant columns English names: noise class, number of floors and
# building area.
english_names = {
    'name': 'noiseclass',
    'anzahlDerO': 'floors',
    'grundflaec': 'house_area',
}
houses_noise = houses_noise.rename(columns=english_names)

In [None]:
# Spatially join the noise-affected buildings to the districts, so that every
# affected building is listed together with the district it falls in.
admin_noisestats = gpd.sjoin(
    left_df=admin_pop_reproj,
    right_df=houses_noise,
)

In [None]:
#admin_noisestats.columns

In [None]:
# Count the affected buildings per district and noise class.
houses_noiseclass = (
    admin_noisestats
    .groupby(['district', 'noiseclass'])['OBJECTID']
    .count()
    .reset_index()
)
houses_noiseclass.head()

In [None]:
# Area of the affected building for each row.
# Every row of admin_noisestats describes one building / noise intersection,
# so the area is simply 'house_area'. The previous version multiplied it by
# 'OBJECTID', which is a feature identifier rather than a number of houses and
# therefore inflated the area arbitrarily.
# NOTE(review): 'house_area' is the renamed 'grundflaec' attribute of the
# building layer - confirm that it is the footprint area wanted here.
admin_noisestats['total_area'] = admin_noisestats['house_area']

In [None]:
# Multiply the area by the number of floors to obtain the affected residential
# floor area of each row.
floor_count = admin_noisestats['floors'].astype(int)
admin_noisestats['area_floors'] = admin_noisestats['total_area'] * floor_count

In [None]:
#admin_noisestats['area_floors'].hist()
# Preview the first five rows of the joined noise statistics.
admin_noisestats.head(n=5)

In [None]:
# Store the noise class as an integer so it matches the integer keys of the
# weighting table used in the next step.
noise_class_codes = admin_noisestats['noiseclass'].astype(int)
admin_noisestats['noiseclass'] = noise_class_codes

In [None]:
# Weight the affected floor area by noise class: the weight grows by 0.5 per
# class, from 0.5 for class 0 up to 2.5 for class 4.
weights = {
    0: 0.5,
    1: 1,
    2: 1.5,
    3: 2,
    4: 2.5,
}
# Series.map looks up the weight that belongs to each row's noise class
class_weight = admin_noisestats['noiseclass'].map(weights)
admin_noisestats['weighted_area'] = admin_noisestats['area_floors'] * class_weight

In [None]:
# Sum the weighted area per district; the result has one row per district with
# the columns 'district' and 'weighted_area'.
weighted_area_by_district = admin_noisestats.groupby('district')['weighted_area'].sum()
admin_noisestats = weighted_area_by_district.reset_index()
#admin_noisestats['weighted_area'].hist()
#admin_noisestats.head()

In [None]:
#now classify according to percentiles
