### Setting notebook and connecting data
Importing all necessary packages.

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
import json

Loading all data files.

In [None]:
def _read_latin1(csv_path):
    """Read one CSV file from disk, decoding it as latin-1."""
    return pd.read_csv(csv_path, encoding='latin-1')


# Housing tables: one file per distance band (400m ... 2000m) plus the totals file.
housing_400 = _read_latin1("../data/Housing_400m_2014_2021.csv")
housing_500 = _read_latin1("../data/Housing_500m_2014_2021.csv")
housing_1000 = _read_latin1("../data/Housing_1000m_2014_2021.csv")
housing_2000 = _read_latin1("../data/Housing_2000m_2014_2021.csv")
housing_total = _read_latin1("../data/Housing_total_2014_2021.csv")

# Median income table.
income = _read_latin1("../data/Income_median_2014_2021.csv")

### Data cleaning and transformation
Removing columns 1 and 2 (zero-based, i.e. the second and third columns) from all tables, because they contain repetitive information.

Subtracting the household counts of each distance band from the next larger one, to ensure each household is counted only once.

In [None]:
def _drop_repeated_columns_and_index_by_region(frame):
    """Return `frame` without the columns at positions 1-2 and indexed by 'region'.

    A new DataFrame is returned instead of mutating `frame` in place. A frame
    that is already indexed by 'region' is returned unchanged, so re-running
    this cell can neither drop further columns nor fail because the 'region'
    column has already been moved into the index.
    """
    if frame.index.name == 'region':
        return frame
    return frame.drop(columns=frame.columns[1:3]).set_index('region')


# Apply the same cleaning to every housing table
housing_400, housing_500, housing_1000, housing_2000, housing_total = (
    _drop_repeated_columns_and_index_by_region(frame)
    for frame in (housing_400, housing_500, housing_1000, housing_2000, housing_total)
)

# Subtract each table from the next larger distance so that every household
# is counted in exactly one distance band
housing_500_new = housing_500 - housing_400
housing_1000_new = housing_1000 - housing_500
housing_2000_new = housing_2000 - housing_1000
no_connection = housing_total - housing_2000

#### Calculating Weighted Connectivity Score
The connectivity score was calculated by giving a higher weight to households that are closer to public transportation.

In [None]:
# Weight applied to the household counts of each distance band
weights = {
    '400m': 1.0,
    '500m': 0.8,
    '1000m': 0.6,
    '2000m': 0.4,
    'more_2000m': 0.2
}

# Household counts per band, keyed exactly like `weights`
frames_by_distance = {
    '400m': housing_400,
    '500m': housing_500_new,
    '1000m': housing_1000_new,
    '2000m': housing_2000_new,
    'more_2000m': no_connection
}

# Weighted sum over all bands. The built-in sum() starts from 0, and
# 0 + DataFrame is the DataFrame itself, so no dynamically created
# variables are needed to add the weighted frames together.
total_score = sum(
    frames_by_distance[band].mul(weight) for band, weight in weights.items()
)

log_score = np.log(total_score)

# Min-max normalisation of the log score. DataFrame.min()/max() work
# column by column, so every column is scaled to [0, 1] independently.
normal_score = (log_score - log_score.min()) / (log_score.max() - log_score.min())
normal_score = normal_score.reset_index()

# The first 4 characters of "region" become the "id" column (to match the
# structure of the GeoJSON file); the rest stays in "region"
normal_score['id'] = normal_score['region'].str[:4]
normal_score['region'] = normal_score['region'].str[4:]

To correctly access the data in the graphs, the DataFrame containing connectivity scores was melted to transform the year columns into rows, the 'year' column was converted to a numeric type, and the result was pivoted into a long table with one row per municipality and year.

In [None]:
# Wide -> long: every column other than 'id' and 'region' becomes a
# ('year', 'score') row; 'year' is then parsed as a number, with anything
# unparseable turned into NaN
melted_score = normal_score.melt(
    id_vars=['id', 'region'],
    var_name='year',
    value_name='score',
).assign(year=lambda long_df: pd.to_numeric(long_df['year'], errors='coerce'))

# One row per (id, region, year) holding its score
pivot_score = pd.pivot_table(
    melted_score,
    values='score',
    index=['id', 'region', 'year'],
    columns=None,
).reset_index()

Connecting the GeoJSON file with geographical data for all municipalities in Sweden.

In [None]:
# Read the municipality shapes from the GeoJSON file
geo_map = gpd.read_file('../data/swedish_municipalities.geojson')

# Attach the long score table to the shapes through the shared "id" column
# (inner join, which is the default for merge)
merged_score = geo_map.merge(right=pivot_score, on="id", how="inner")

### Data Visualization
**Public Transportation Accessibility Map**\
Creating Choropleth Map illustrating the connectivity scores of all municipalities over the years 2014-2021.

In [None]:

# Three-stop colour scale used for the score
score_colors = ['#2133ff', '#cecbd2', '#ea2700']

# Animated choropleth with one frame per year; rows of merged_score are
# matched to their own geometry through the row index
fig = px.choropleth_mapbox(
    data_frame=merged_score,
    geojson=merged_score.geometry,
    locations=merged_score.index,
    color="score",
    color_continuous_scale=score_colors,
    hover_name="region",
    animation_frame='year',
    mapbox_style="carto-positron",
    center=dict(lat=63.0, lon=17.30),
    zoom=3.7,
    opacity=0.7,
    width=600,
    height=800,
)

# Subtitle is drawn as a paper-referenced annotation near the title
subtitle = dict(
    text="For years 2014-2021",
    xref="paper",
    yref="paper",
    x=-0.122,
    y=1.03,
    showarrow=False,
)

# Title and subtitle
fig.update_layout(
    title=dict(
        text="Public Transportation Accessibility in Sweden",
        font=dict(size=16),
        x=0.05,
        y=0.97,
    ),
    annotations=[subtitle],
)

# Thin white outlines between the polygons
fig = fig.update_traces(marker=dict(line=dict(width=0.3, color='white')))

# Render the map
fig.show()


**Median Connectivity Score Over 2014-2021**\
The line graph displays the fluctuations in the median connectivity score across all years from 2014 to 2021.

In [None]:
# Median score per year, taken over all rows of pivot_score.
# NOTE: despite the variable name, the statistic is the median, not the mean.
average_scores = pivot_score.groupby('year')['score'].median().reset_index()

# Line chart of the yearly median
fig = px.line(
    data_frame=average_scores,
    x='year',
    y='score',
    title='Median connectivity score over years',
    markers=True,
    color_discrete_sequence=['#2133ff'],
    template='plotly_white',
)

# Hide both grids and zoom the y axis in on a fixed window.
# NOTE: the [0.34, 0.38] window is hard-coded; values outside it are not
# visible, so it has to be revisited whenever the input data changes.
fig.update_layout(
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    yaxis_range=[0.34, 0.38],
)

# Draw the x-axis line
fig.update_xaxes(showline=True, linewidth=1, linecolor='lightgrey')

fig.show()

**Public Transport Accessibility vs. Income**\
Scatterplot illustrates the correlation between income levels and access to public transportation across municipalities from 2014 to 2021.

In [None]:
# Build a cleaned copy under a new name and leave `income` exactly as loaded,
# so that re-running this cell does not drop further columns or slice the
# 'region' text a second time.
# - columns at positions 1-3 are removed
# - the first 4 characters of "region" become "id" (to match the structure of
#   the GeoJSON file); the rest stays in "region". assign() evaluates its
#   arguments in order, so "id" is taken from the untouched "region" values.
income_clean = (
    income
    .drop(columns=income.columns[1:4])
    .assign(
        id=lambda frame: frame['region'].str[:4],
        region=lambda frame: frame['region'].str[4:],
    )
)

# Melt the DataFrame to transform the remaining columns into rows
melted_income = income_clean.melt(id_vars=['id', 'region'], var_name='year', value_name='income')

# Convert 'year' column to numeric type (unparseable values become NaN)
melted_income['year'] = pd.to_numeric(melted_income['year'], errors='coerce')

# One row per (id, region, year) holding its income value
pivot_income = melted_income.pivot_table(index=['id', 'region', 'year'], columns=None, values='income').reset_index()

In [None]:
# Join income and score rows that share the same id, region and year
merge_income = pivot_income.merge(pivot_score, on=['id', 'region', 'year'])

# Animated scatter (one frame per year): income on x, connectivity score on y,
# points coloured by score
score_colors = ['#2133ff', '#cecbd2', '#ea2700']
fig = px.scatter(
    merge_income,
    x='income',
    y='score',
    color='score',
    color_continuous_scale=score_colors,
    hover_name="region",
    animation_frame='year',
    title="Public Transport Accessibility vs. Income",
    template='plotly_white',
)

# Fixed x-axis window, identical for every animation frame
fig.update_xaxes(range=[180, 420])

fig.show()

**Connectivity Score Changes from 2014 to 2021**\
Bar chart showing the differences in connectivity score between 2014 and 2021 for each municipality.

In [None]:
# Keep only the columns needed for the comparison and add the change in
# score: the 2021 value minus the 2014 value
normal_score_growth = normal_score[['id', 'region', '2014', '2021']].assign(
    score_change=lambda frame: frame['2021'] - frame['2014']
)

# Bar chart with one bar per municipality.
# Bars are coloured by whether the change is strictly positive, so a change
# of exactly zero gets the same colour as a decrease.
fig = px.bar(
    data_frame=normal_score_growth,
    x='region',
    y='score_change',
    title='Connectivity Score Changes from 2014 to 2021',
    template='plotly_white',
    color=normal_score_growth['score_change'] > 0,
    color_discrete_map={True: '#2133ff', False: '#ea2700'},
    category_orders={'region': sorted(normal_score_growth['region'].unique(), reverse=True)},
)

# Removing 'color' from tooltip and legend
fig.update_traces(hovertemplate='region: %{x} <br>score: %{y}')
fig.update_layout(showlegend=False)

fig.update_yaxes(title="Score change")

fig.show()

**Total Number of Households Across Distances (2014-2021)**\
Line chart illustrates the fluctuation in household numbers across various distances. Given that households within a 400m radius constitute the vast majority, they have been excluded to highlight differences more effectively among other distance groups.

In [None]:
# Household tables per distance band, keyed by the column label each band
# gets in the joined table
frames_by_label = {
    '400m': housing_400,
    '500m': housing_500_new,
    '1000m': housing_1000_new,
    '2000m': housing_2000_new,
    '>2000m': no_connection,
}

# Sum every table over its rows (one total per column), place the totals side
# by side with one column per band, then expose the shared index as a 'year'
# column
joined_house = (
    pd.concat(
        [frame.sum() for frame in frames_by_label.values()],
        axis=1,
        keys=list(frames_by_label),
    )
    .rename_axis('year')
    .reset_index()
)

In [None]:
# Line chart of the number of households in each distance band over the years.
# The '400m' column is left out of the plotted series on purpose.
fig = px.line(
    joined_house,
    x='year',
    y=["500m", "1000m", "2000m", ">2000m"],
    markers=True,
    template='plotly_white',
    color_discrete_sequence=["#ea2700", "#eb866d", "#997feb", "#2133ff"],
)

# Subtitle is drawn as a paper-referenced annotation near the title
subtitle = dict(
    text="Excluding 400m distance",
    xref="paper",
    yref="paper",
    x=-0.05,
    y=1.1,
    showarrow=False,
)

# Title, subtitle, legend title and hidden grids in a single layout update
fig.update_layout(
    title_text="Total Number of Households Across Distances (2014-2021)",
    title_font_size=16,
    title_x=0.05,
    title_y=0.97,
    annotations=[subtitle],
    legend_title="Distance",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

# Draw the x-axis line
fig.update_xaxes(showline=True, linewidth=1, linecolor='lightgrey')

fig.update_yaxes(title="Number of households")

fig.show()