# Extracting the required data

In [3]:
import pandas as pd

# Load the Excel file
file_path = "WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_FULL.xlsx"

# Read the "Estimates" sheet (real header starts at row 17)
df = pd.read_excel(file_path, sheet_name="Estimates", header=16)

# Keep only countries/areas (exclude regions like World, Asia, etc.)
df = df[df["Type"] == "Country/Area"]

# Extract required columns and rename them
training_data = df[
    [
        "Region, subregion, country or area *",
        "Year",
        "Total Fertility Rate (live births per woman)",
        "Life Expectancy at Birth, both sexes (years)",
        "Net Number of Migrants (thousands)",
        "Population Growth Rate (percentage)",
        "Crude Birth Rate (births per 1,000 population)",
        "Total Population, as of 1 July (thousands)"
    ]
].rename(columns={
    "Region, subregion, country or area *": "country",
    "Year": "year",
    "Total Fertility Rate (live births per woman)": "tfr",
    "Life Expectancy at Birth, both sexes (years)": "life_expectancy",
    "Net Number of Migrants (thousands)": "net_migrants",
    "Population Growth Rate (percentage)": "pop_growth_rate",
    "Crude Birth Rate (births per 1,000 population)": "crude_birth_rate",
    "Total Population, as of 1 July (thousands)": "population"
})

# Drop rows with missing actual values
training_data = training_data.dropna()

# Convert population and migrants from thousands â†’ full numbers
training_data["net_migrants"] = training_data["net_migrants"] * 1000
training_data["population"] = training_data["population"] * 1000

# Sort by country (alphabetically) and year
training_data = training_data.sort_values(by=["country", "year"]).reset_index(drop=True)

# Save as CSV
training_data.to_csv("training_data.csv", index=False)

print("training_data.csv created successfully!")
print(training_data.head())

training_data.shape

training_data.csv created successfully!
       country    year    tfr life_expectancy net_migrants pop_growth_rate  \
0  Afghanistan  1950.0  7.248          28.156       6161.0           1.275   
1  Afghanistan  1951.0   7.26          28.584       4903.0            1.36   
2  Afghanistan  1952.0   7.26          29.014        145.0           1.374   
3  Afghanistan  1953.0  7.266          29.452      -8867.0           1.335   
4  Afghanistan  1954.0  7.254          29.698      -6837.0           1.394   

  crude_birth_rate population  
0            49.38  7776176.0  
1           49.624  7879339.0  
2           49.784  7987783.0  
3           49.979  8096698.0  
4           50.004  8207950.0  


(17538, 8)