In [72]:
import pandas as pd
import numpy as np
import sklearn as sn
from flask import Flask, render_template
import plotly.express as px
import plotly.io as pio
import json


mort= pd.read_csv('/workspaces/child_mortality/Datasets/child_mortality.csv')
edu = pd.read_csv('/workspaces/child_mortality/Datasets/female_education.csv')
san = pd.read_csv('/workspaces/child_mortality/Datasets/sanitation_services.csv') 

In [65]:
# Define a function to reshape each
def reshape_wide_to_long(df, value_name):
    return df.melt(
        id_vars=['Country', 'Indicator Name'], 
        value_vars=['2018','2019','2020','2021','2022'],
        var_name='year',
        value_name=value_name
    ).drop(columns=['Indicator Name'])

mort_long = reshape_wide_to_long(mort, 'Mortality rate, under-5 (per 1,000 live births)')
edu_long = reshape_wide_to_long(edu, 'Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)')
san_long = reshape_wide_to_long(san, 'People using at least basic sanitation services (% of population)')

In [66]:
df = (mort_long
      .merge(edu_long, on=['Country','year'], how='inner')
      .merge(san_long, on=['Country','year'], how='inner'))



In [67]:
# Convert year to integer
df['year'] = df['year'].astype(int)

print(df.head(), df.shape)

    Country  year  Mortality rate, under-5 (per 1,000 live births)  \
0  Ethiopia  2018                                             56.4   
1     Kenya  2018                                             44.3   
2   Nigeria  2018                                            120.2   
3  Ethiopia  2019                                             54.1   
4     Kenya  2019                                             43.3   

   Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)  \
0                                                NaN                                                     
1                                                NaN                                                     
2                                           37.01556                                                     
3                                            7.49156                                                     
4                                           55.18

In [68]:
df.isna().sum()

Country                                                                                                0
year                                                                                                   0
Mortality rate, under-5 (per 1,000 live births)                                                        0
Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)    5
People using at least basic sanitation services (% of population)                                      0
dtype: int64

In [69]:
df = df.rename(columns={
    'Mortality rate, under-5 (per 1,000 live births)': 'U5MR',
    'Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)': 'FemaleEdu',
    'People using at least basic sanitation services (% of population)': 'Sanitation'
})

print(df.columns)

Index(['Country', 'year', 'U5MR', 'FemaleEdu', 'Sanitation'], dtype='object')


In [70]:
# Sort and reset index
df = df.sort_values(['Country', 'year']).reset_index(drop=True)

# Interpolate, forward-fill, backward-fill FemaleEdu
df['FemaleEdu'] = (
    df.groupby('Country')['FemaleEdu']
      .transform(lambda g: g.interpolate(method='linear'))
      .fillna(method='ffill')  # legacy usage
      .fillna(method='bfill')
)

# Verify no remaining missing values
print("Remaining missing in FemaleEdu:", df['FemaleEdu'].isna().sum())

Remaining missing in FemaleEdu: 0



Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [74]:
# Save DataFrame to CSV
df.to_csv('processed_data.csv', index=False)

In [73]:
import plotly.express as px

# Line chart U5MR 2018–2022
fig = px.line(df, x='year', y='U5MR', color='Country',
              title='Under‑5 Mortality Trends (2018–2022)')
st.plotly_chart(fig)

# Choropleth map for 2022
fig2 = px.choropleth(df[df.year==2022], locations='Country',
                     color='U5MR', title='2022 U5MR in Sub‑Saharan Africa')
st.plotly_chart(fig2)




DeltaGenerator()

In [75]:
import pandas as pd

# Load and process data
mort = pd.read_csv('/workspaces/child_mortality/Datasets/child_mortality.csv')
edu = pd.read_csv('/workspaces/child_mortality/Datasets/female_education.csv')
san = pd.read_csv('/workspaces/child_mortality/Datasets/sanitation_services.csv')

# Reshape data
def reshape_wide_to_long(df, value_name):
    return df.melt(
        id_vars=['Country', 'Indicator Name'],
        value_vars=['2018', '2019', '2020', '2021', '2022'],
        var_name='year',
        value_name=value_name
    ).drop(columns=['Indicator Name'])

mort_long = reshape_wide_to_long(mort, 'Mortality rate, under-5 (per 1,000 live births)')
edu_long = reshape_wide_to_long(edu, 'Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)')
san_long = reshape_wide_to_long(san, 'People using at least basic sanitation services (% of population)')

# Merge data
df = (mort_long
      .merge(edu_long, on=['Country', 'year'], how='inner')
      .merge(san_long, on=['Country', 'year'], how='inner'))

# Convert year to integer
df['year'] = df['year'].astype(int)

# Rename columns
df = df.rename(columns={
    'Mortality rate, under-5 (per 1,000 live births)': 'U5MR',
    'Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)': 'FemaleEdu',
    'People using at least basic sanitation services (% of population)': 'Sanitation'
})

# Handle missing values in FemaleEdu
df = df.sort_values(['Country', 'year']).reset_index(drop=True)
df['FemaleEdu'] = (
    df.groupby('Country')['FemaleEdu']
      .transform(lambda g: g.interpolate(method='linear'))
      .ffill()
      .bfill()
)

# Compute and display correlation matrix
corr_matrix = df[['U5MR', 'FemaleEdu', 'Sanitation']].corr()
print("Correlation Matrix:")
print(corr_matrix)

# Optional: Visualize the correlation matrix as a heatmap
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap: U5MR, FemaleEdu, and Sanitation')
plt.show()

Correlation Matrix:
                U5MR  FemaleEdu  Sanitation
U5MR        1.000000   0.167993    0.586153
FemaleEdu   0.167993   1.000000    0.768120
Sanitation  0.586153   0.768120    1.000000
