In [19]:
import streamlit as st
import plotly.express as px
import pandas as pd
import statsmodels

st.title("Do They Correlate? :thinking_face:")
# st.markdown("### Do 2 World Development Indicators have a relationship?")

st.markdown("#### INTRODUCTION :wave:")

st.markdown("""Do richer countries have higher birth rates? Do older countries smoke more? This app uses World Bank [country-level data](https://data.worldbank.org/) from 2020 to answer these questions and many others! 
            Hopefully giving you a better understanding of the world :globe_with_meridians:.""")

# Create a DataFrame
df = pd.read_parquet('WDIData1960_to_2024.parquet')

st.divider()

st.markdown("#### SELECT TWO VARIABLES TO COMPARE :mag_right:")

# Create a dropdown menu for the x-axis
options = df['Indicator Name'].unique()
x_axis = st.selectbox("Select x-axis:", options)

# Create a dropdown menu for the y-axis
y_axis = st.selectbox("Select y-axis:", options)

# Calculating the latest year where these 2 indicators share data
x_axis_years = set(df[df['Indicator Name'] == x_axis]['Year'].values)
y_axis_years = set(df[df['Indicator Name'] == y_axis]['Year'].values)
years_overlap = x_axis_years.intersection(y_axis_years)
latest_data_year = max(years_overlap)

df_corr = df[(df['Indicator Name'].isin([x_axis, y_axis])) & (df['Year'] == 
    latest_data_year)].pivot_table(values='Value', index=['Country Name', 'Region', 'Year'], columns='Indicator Name').reset_index()
df_corr = df_corr[[x_axis, y_axis]]
# x_axis_latest_data = df[(df['Indicator Name'] == x_axis) & (df['Year'] == latest_data_year)]['Value']
# y_axis_latest_data = df[(df['Indicator Name'] == y_axis) & (df['Year'] == latest_data_year)]['Value']


st.divider()

# Add a summary sentence with the correlation coefficient
r = df_corr[x_axis].corr(df_corr[y_axis]).round(2)


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [23]:
df_corr[x_axis].iloc[:,0]

0       79.735
1       14.460
2       11.564
3       29.989
4        5.910
        ...   
212     29.905
213     42.901
214     52.497
215    116.091
216     92.566
Name: Adolescent fertility rate (births per 1,000 women ages 15-19), Length: 217, dtype: float64

In [21]:
df_corr[x_axis].corr(df_corr[y_axis]).round(2)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [20]:
df_corr

Indicator Name,"Adolescent fertility rate (births per 1,000 women ages 15-19)","Adolescent fertility rate (births per 1,000 women ages 15-19).1"
0,79.735,79.735
1,14.460,14.460
2,11.564,11.564
3,29.989,29.989
4,5.910,5.910
...,...,...
212,29.905,29.905
213,42.901,42.901
214,52.497,52.497
215,116.091,116.091


In [15]:
import streamlit as st
import plotly.express as px
import pandas as pd
import statsmodels

st.title("Do They Correlate? :thinking_face:")
# st.markdown("### Do 2 World Development Indicators have a relationship?")

st.markdown("#### INTRODUCTION :wave:")

st.markdown("""Do richer countries have higher birth rates? Do older countries smoke more? This app uses World Bank [country-level data](https://data.worldbank.org/) from 2020 to answer these questions and many others! 
            Hopefully giving you a better understanding of the world :globe_with_meridians:.""")

# Create a DataFrame
df = pd.read_parquet('WDIData1960_to_2024.parquet')

st.divider()

st.markdown("#### SELECT TWO VARIABLES TO COMPARE :mag_right:")

# Create a dropdown menu for the x-axis
options = df['Indicator Name'].unique()
x_axis = st.selectbox("Select x-axis:", options)

# Create a dropdown menu for the y-axis
y_axis = st.selectbox("Select y-axis:", options)

# Calculating the latest year where these 2 indicators share data
x_axis_years = set(df[df['Indicator Name'] == x_axis]['Year'].values)
y_axis_years = set(df[df['Indicator Name'] == y_axis]['Year'].values)
years_overlap = x_axis_years.intersection(y_axis_years)
latest_data_year = max(years_overlap)

df_corr = df[(df['Indicator Name'].isin([x_axis, y_axis])) & (df['Year'] == 
    latest_data_year)].pivot_table(values='Value', index=['Country Name', 'Region', 'Year'], columns='Indicator Name').reset_index()

# x_axis_latest_data = df[(df['Indicator Name'] == x_axis) & (df['Year'] == latest_data_year)]['Value']
# y_axis_latest_data = df[(df['Indicator Name'] == y_axis) & (df['Year'] == latest_data_year)]['Value']


st.divider()

# Add a summary sentence with the correlation coefficient
r = df_corr[x_axis].corr(df_corr[y_axis]).round(2)


def categorise_correlation_coefficient(r):
    """Categorise the correlation coefficient into weak, moderate or strong."""
    if abs(r) > 0.7:
        strength = 'strong'
    elif abs(r) > 0.3:
        strength = 'moderate'
    else:
        strength = 'weak'

    direction = 'positive' if r > 0 else 'negative'

    return f"{strength} {direction}"


st.markdown("#### RESULTS IN PLAIN ENGLISH :white_check_mark:")

st.markdown(f"""The correlation coefficient is **{r}**, which indicates a {categorise_correlation_coefficient(r)} relationship between the two variables.
    The R-squared value is **{round(r**2, 2)}**, meaning approximately **{round(r**2*100)}%** of the variation in one variable can be explained by the other.""")

st.divider()

st.markdown("#### CHART :chart_with_upwards_trend:")

# Option for log or linear scale
scale_x = st.radio("Log Scale for x-axis:", (False, True), horizontal=True)
scale_y = st.radio("Log Scale for y-axis:", (False, True), horizontal=True)

df_graph = df[(df['Indicator Name'].isin([x_axis, y_axis])) & (df['Year'].isin(
    years_overlap))].pivot_table(values='Value', index=['Country Name', 'Region', 'Year'], columns='Indicator Name').reset_index()

# Create a scatterplot

In [17]:
x_axis

'Adolescent fertility rate (births per 1,000 women ages 15-19)'

In [18]:
df_graph[x_axis].min()

0.876

In [14]:
import streamlit as st
import plotly.express as px
import pandas as pd
import statsmodels

st.title("Do They Correlate? :thinking_face:")
# st.markdown("### Do 2 World Development Indicators have a relationship?")

st.markdown("#### INTRODUCTION :wave:")

st.markdown("""Do richer countries have higher birth rates? Do older countries smoke more? This app uses World Bank [country-level data](https://data.worldbank.org/) from 2020 to answer these questions and many others! 
            Hopefully giving you a better understanding of the world :globe_with_meridians:.""")

# Create a DataFrame
df = pd.read_parquet('WDIData1960_to_2024.parquet')

st.divider()

st.markdown("#### SELECT TWO VARIABLES TO COMPARE :mag_right:")

# Create a dropdown menu for the x-axis
options = df['Indicator Name'].unique()
x_axis = st.selectbox("Select x-axis:", options)

# Create a dropdown menu for the y-axis
y_axis = st.selectbox("Select y-axis:", options)

# Calculating the latest year where these 2 indicators share data
x_axis_years = set(df[df['Indicator Name'] == x_axis]['Year'].values)
y_axis_years = set(df[df['Indicator Name'] == y_axis]['Year'].values)
years_overlap = x_axis_years.intersection(y_axis_years)
latest_data_year = max(years_overlap)

x_axis_latest_data = df[(df['Indicator Name'] == x_axis) & (df['Year'] == latest_data_year)]['Value']
y_axis_latest_data = df[(df['Indicator Name'] == y_axis) & (df['Year'] == latest_data_year)]['Value']

df_graph = df[(df['Indicator Name'].isin([x_axis, y_axis])) & (df['Year'].isin(
    years_overlap))].pivot_table(values='Value', index=['Country Name', 'Region', 'Year'], columns='Indicator Name').reset_index()

st.divider()


# Add a summary sentence with the correlation coefficient
r = x_axis_latest_data.corr(y_axis_latest_data).round(2)


def categorise_correlation_coefficient(r):
    """Categorise the correlation coefficient into weak, moderate or strong."""
    if abs(r) > 0.7:
        strength = 'strong'
    elif abs(r) > 0.3:
        strength = 'moderate'
    else:
        strength = 'weak'

    direction = 'positive' if r > 0 else 'negative'

    return f"{strength} {direction}"


st.markdown("#### RESULTS IN PLAIN ENGLISH :white_check_mark:")

st.markdown(f"""The correlation coefficient is **{r}**, which indicates a {categorise_correlation_coefficient(r)} relationship between the two variables.
    The R-squared value is **{round(r**2, 2)}**, meaning approximately **{round(r**2*100)}%** of the variation in one variable can be explained by the other.""")

st.divider()

st.markdown("#### CHART :chart_with_upwards_trend:")

# Option for log or linear scale
scale_x = st.radio("Log Scale for x-axis:", (False, True), horizontal=True)
scale_y = st.radio("Log Scale for y-axis:", (False, True), horizontal=True)

# Create a scatterplot
fig = px.scatter(df_graph, x=x_axis, y=y_axis, hover_name='Country Name',
                 log_x=scale_x, log_y=scale_y, trendline="ols", color='Region', trendline_scope='overall', opacity=0.8)

In [3]:
import streamlit as st
import plotly.express as px
import pandas as pd
import statsmodels

st.title("Do They Correlate? :thinking_face:")
# st.markdown("### Do 2 World Development Indicators have a relationship?")

st.markdown("#### INTRODUCTION :wave:")

st.markdown("""Do richer countries have higher birth rates? Do older countries smoke more? This app uses World Bank [country-level data](https://data.worldbank.org/) from 2020 to answer these questions and many others! 
            Hopefully giving you a better understanding of the world :globe_with_meridians:.""")

# Create a DataFrame
df = pd.read_parquet('WDIData1960_to_2024.parquet')

st.divider()

st.markdown("#### SELECT TWO VARIABLES TO COMPARE :mag_right:")

# Create a dropdown menu for the x-axis
options = df['Indicator Name'].unique()
x_axis = st.selectbox("Select x-axis:", options)

# Create a dropdown menu for the y-axis
y_axis = st.selectbox("Select y-axis:", options)

# Calculating the latest year where these 2 indicators share data
x_axis_years = set(df[df['Indicator Name'] == x_axis]['Year'].values)
y_axis_years = set(df[df['Indicator Name'] == y_axis]['Year'].values)
years_overlap = x_axis_years.intersection(y_axis_years)
latest_data_year = max(years_overlap)

x_axis_latest_data = df[(df['Indicator Name'] == x_axis) & (df['Year'] == latest_data_year)]['Value']
y_axis_latest_data = df[(df['Indicator Name'] == y_axis) & (df['Year'] == latest_data_year)]['Value']

st.divider()


# Add a summary sentence with the correlation coefficient
r = x_axis_latest_data.corr(y_axis_latest_data).round(2)


def categorise_correlation_coefficient(r):
    """Categorise the correlation coefficient into weak, moderate or strong."""
    if abs(r) > 0.7:
        strength = 'strong'
    elif abs(r) > 0.3:
        strength = 'moderate'
    else:
        strength = 'weak'

    direction = 'positive' if r > 0 else 'negative'

    return f"{strength} {direction}"


st.markdown("#### RESULTS IN PLAIN ENGLISH :white_check_mark:")

st.markdown(f"""The correlation coefficient is **{r}**, which indicates a {categorise_correlation_coefficient(r)} relationship between the two variables.
    The R-squared value is **{round(r**2, 2)}**, meaning approximately **{round(r**2*100)}%** of the variation in one variable can be explained by the other.""")

st.divider()

st.markdown("#### CHART :chart_with_upwards_trend:")

# Option for log or linear scale
scale_x = st.radio("Log Scale for x-axis:", (False, True), horizontal=True)
scale_y = st.radio("Log Scale for y-axis:", (False, True), horizontal=True)

In [6]:
latest_data_year

'2022'

In [8]:
x_axis

'Adolescent fertility rate (births per 1,000 women ages 15-19)'

In [13]:
df[(df['Indicator Name'].isin([x_axis, y_axis])) & (df['Year'].isin(
    years_overlap))].pivot_table(values='Value', index=['Country Name', 'Region', 'Year'], columns='Indicator Name').reset_index()

Indicator Name,Country Name,Region,Year,"Adolescent fertility rate (births per 1,000 women ages 15-19)"
0,Afghanistan,South Asia,1960,138.876
1,Afghanistan,South Asia,1961,138.717
2,Afghanistan,South Asia,1962,138.494
3,Afghanistan,South Asia,1963,138.173
4,Afghanistan,South Asia,1964,140.107
...,...,...,...,...
13666,Zimbabwe,Sub-Saharan Africa,2018,98.507
13667,Zimbabwe,Sub-Saharan Africa,2019,97.354
13668,Zimbabwe,Sub-Saharan Africa,2020,96.180
13669,Zimbabwe,Sub-Saharan Africa,2021,94.312
