In [11]:
import pandas as pd
import matplotlib as plt
import numpy as np

In [12]:
# Bind the name to an empty frame up front so it exists even if the read fails.
interpolated_anemia_df = pd.DataFrame()

try:
    # Load the cleaned yearly anemia observations.
    interpolated_anemia_df = pd.read_csv("DataSources/Clean_Data_Anemia.csv")
except (FileNotFoundError, pd.errors.EmptyDataError) as read_error:
    # Both known failure modes get their own message; the frame stays empty.
    if isinstance(read_error, FileNotFoundError):
        print("Error: The data source file was not found. Check the path and filename.")
    else:
        print("Error: The file exists but is empty.")
except Exception as e:
    # Anything else (parse errors, permissions, ...) is reported, not raised.
    print(f"An unexpected error occurred while reading the file: {e}")

# Preview of the frame as loaded, followed by a separator line.
print(interpolated_anemia_df.head(), '*' * 50, sep='\n')


  REF_AREA REF_AREA_LABEL  TIME_PERIOD  OBS_VALUE
0      AFG    Afghanistan         2000       44.4
1      AFG    Afghanistan         2001       43.9
2      AFG    Afghanistan         2002       43.4
3      AFG    Afghanistan         2003       42.9
4      AFG    Afghanistan         2004       42.4
**************************************************


In [13]:
# Columns the interpolation step reads from the loaded frame.
REQUIRED_COLUMNS = ['REF_AREA', 'REF_AREA_LABEL', 'TIME_PERIOD', 'OBS_VALUE']


def _build_monthly_frame(country, country_data):
    """Expand one country's yearly observations into a month-start series.

    Parameters
    ----------
    country : scalar
        The REF_AREA value shared by every row of ``country_data``.
    country_data : pandas.DataFrame
        Rows for a single country. Must contain the columns 'date'
        (datetime), 'TIME_PERIOD', 'REF_AREA_LABEL' and 'OBS_VALUE'.

    Returns
    -------
    pandas.DataFrame
        One row per month start between the country's earliest and latest
        'date', with columns 'date', 'REF_AREA', 'REF_AREA_LABEL',
        'OBS_VALUE' (linearly interpolated) and 'TIME_PERIOD' (monthly Period).

    Raises
    ------
    pandas.errors.MergeError
        If the country has more than one row for the same 'date'.
    """
    country_data = country_data.sort_values('TIME_PERIOD')

    # Month-start grid spanning the first to the last observed date.
    date_range = pd.date_range(
        start=country_data['date'].min(),
        end=country_data['date'].max(),
        freq='MS'  # Month Start
    )

    new_df = pd.DataFrame({'date': date_range})
    new_df['REF_AREA'] = country
    # After the sort above, row 0 is the earliest period; its label is used throughout.
    new_df['REF_AREA_LABEL'] = country_data['REF_AREA_LABEL'].iloc[0]

    # Attach the known observations to their month; other months stay NaN.
    # validate='one_to_one' fails loudly on duplicate dates instead of silently
    # emitting duplicated month rows.
    new_df = new_df.merge(
        country_data[['date', 'OBS_VALUE']],
        on='date',
        how='left',
        validate='one_to_one'
    )

    # Fill only the gaps *between* known observations. Without limit_area='inside',
    # NaNs after the last valid value would be flat-filled with that value
    # (extrapolation rather than interpolation).
    new_df['OBS_VALUE'] = new_df['OBS_VALUE'].interpolate(method='linear', limit_area='inside')

    # Replace the timestamp with a monthly period for the output column.
    new_df['TIME_PERIOD'] = new_df['date'].dt.to_period('M')

    return new_df


# Fail with a clear message when the load step left an empty/incomplete frame,
# instead of an opaque KeyError or "No objects to concatenate" further down.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in interpolated_anemia_df.columns]
if missing_columns or interpolated_anemia_df.empty:
    raise ValueError(
        "Cannot interpolate: the anemia data is empty or is missing columns "
        f"{missing_columns}. Check that the source CSV was loaded correctly."
    )

# Convert TIME_PERIOD to datetime; format='%Y' parses each value as a year,
# which yields the 1 January timestamp of that year.
interpolated_anemia_df['date'] = pd.to_datetime(interpolated_anemia_df['TIME_PERIOD'], format='%Y')

# One monthly frame per country. groupby(sort=False) walks the frame once and
# yields countries in order of first appearance (the same order .unique() gives).
post_interpolated_dfs = [
    _build_monthly_frame(country, country_data)
    for country, country_data in interpolated_anemia_df.groupby('REF_AREA', sort=False)
]

# Combine all countries
result = pd.concat(post_interpolated_dfs, ignore_index=True)

# Reorder columns to match original
result = result[['REF_AREA', 'REF_AREA_LABEL', 'TIME_PERIOD', 'OBS_VALUE']]

print(result.head(20))




   REF_AREA REF_AREA_LABEL TIME_PERIOD  OBS_VALUE
0       AFG    Afghanistan     2000-01  44.400000
1       AFG    Afghanistan     2000-02  44.358333
2       AFG    Afghanistan     2000-03  44.316667
3       AFG    Afghanistan     2000-04  44.275000
4       AFG    Afghanistan     2000-05  44.233333
5       AFG    Afghanistan     2000-06  44.191667
6       AFG    Afghanistan     2000-07  44.150000
7       AFG    Afghanistan     2000-08  44.108333
8       AFG    Afghanistan     2000-09  44.066667
9       AFG    Afghanistan     2000-10  44.025000
10      AFG    Afghanistan     2000-11  43.983333
11      AFG    Afghanistan     2000-12  43.941667
12      AFG    Afghanistan     2001-01  43.900000
13      AFG    Afghanistan     2001-02  43.858333
14      AFG    Afghanistan     2001-03  43.816667
15      AFG    Afghanistan     2001-04  43.775000
16      AFG    Afghanistan     2001-05  43.733333
17      AFG    Afghanistan     2001-06  43.691667
18      AFG    Afghanistan     2001-07  43.650000


In [14]:
# Destination for the monthly-interpolated data, inside the DataSources directory.
output_path = "DataSources/Interpolated_Data_Anemia.csv"

try:
    # Write without the positional index so the CSV holds only the data columns.
    result.to_csv(output_path, index=False)
except Exception as e:
    # Report the failure instead of raising.
    print(f"Failed to save CSV: {e}")
else:
    print(f"File saved to {output_path}")

File saved to DataSources/Interpolated_Data_Anemia.csv
