In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import matplotlib.pyplot as plt
from pmdarima import auto_arima


In [2]:
import warnings
warnings.filterwarnings("ignore", message="Maximum Likelihood optimization failed to")
warnings.filterwarnings("ignore", category=UserWarning) 

In [3]:
# Load the prepared WDI fertility dataset.
# The path uses a forward slash: a backslash separator is Windows-only, and
# "\w" inside a normal string literal is an invalid escape sequence.
df = pd.read_csv('data/wdi_fertility_data_prepped.csv')

In [16]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0.1,Unnamed: 0,year,year_code,country_name,country_code,total_fertility_rate,gdp_2015_dollar,gdp_per_capita_2015_dollar,gdp_growth,physicians_per_1000,hospital_beds_per_1000,arable_land_hectare_per_person,cpi,crop_production_index,food_production_index,percent_of_household_expenditure_in_gdp,life_expectancy_at_birth,infant_mortality_rate,child_mortality_rate,female_percentage_in_labor
0,160,1990,YR1990,Argentina,ARG,3.034,266000000000.0,8144.494294,-2.467214,2.68,4.5943,0.814243,,39.14,50.98,77.139373,71.784,25.4,28.8,29.11
1,161,1990,YR1990,Brazil,BRA,2.905,917000000000.0,6086.077553,-4.35,1.074,3.3453,0.314121,0.000893,41.03,38.46,59.302498,65.985,52.5,63.0,44.951
2,162,1990,YR1990,France,FRA,1.77,1660000000000.0,28617.44357,2.923935,3.118,9.7,0.306398,71.188126,98.6,101.62,55.205943,76.6,7.5,9.0,46.701
3,163,1990,YR1990,Germany,DEU,1.45,2340000000000.0,29485.70997,5.255006,,10.4,0.150706,67.47511,89.56,99.54,56.346298,75.085634,7.0,8.5,45.355
4,164,1990,YR1990,Poland,POL,2.06,195000000000.0,5111.34533,,2.145,5.7,0.377531,6.983705,128.14,116.13,,70.890244,15.2,17.4,


In [5]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(320, 20)

In [6]:
# List the column labels of the loaded dataset.
df.columns

Index(['Unnamed: 0', 'year', 'year_code', 'country_name', 'country_code',
       'total_fertility_rate', 'gdp_2015_dollar', 'gdp_per_capita_2015_dollar',
       'gdp_growth', 'physicians_per_1000', 'hospital_beds_per_1000',
       'arable_land_hectare_per_person', 'cpi', 'crop_production_index',
       'food_production_index', 'percent_of_household_expenditure_in_gdp',
       'life_expectancy_at_birth', 'infant_mortality_rate',
       'child_mortality_rate', 'female_percentage_in_labor'],
      dtype='object')

In [7]:
# Distinct countries present in the dataset.
df['country_name'].unique()

array(['Argentina', 'Brazil', 'France', 'Germany', 'Poland', 'Nigeria',
       'Kenya', 'United Kingdom', 'China', 'Japan'], dtype=object)

### Prognozy

In [8]:
# Columns of the dataset that are forecast, one entry per indicator.
variables = [
    'total_fertility_rate',
    'gdp_2015_dollar',
    'gdp_per_capita_2015_dollar',
    'gdp_growth',
    'physicians_per_1000',
    'hospital_beds_per_1000',
    'arable_land_hectare_per_person',
    'cpi',
    'crop_production_index',
    'food_production_index',
    'percent_of_household_expenditure_in_gdp',
    'life_expectancy_at_birth',
    'infant_mortality_rate',
    'child_mortality_rate',
    'female_percentage_in_labor',
]

# Number of years to forecast beyond each country's last year in the data.
forecast_years = 5

In [None]:
# Maps (country, variable) -> text describing the selected ARIMA order, or the
# reason why no model could be fitted.
arima_models_info = {}

countries = df['country_name'].unique()
results = []

for country in countries:
    country_data = df[df['country_name'] == country].sort_values(by='year')
    max_year = country_data['year'].max()

    # Nothing to forecast: leave `results` and `arima_models_info` untouched.
    if forecast_years < 1:
        continue

    forecast_year_values = [max_year + year_offset for year_offset in range(1, forecast_years + 1)]

    # One output row per forecast year; every variable below fills in its
    # "<var>_Linear Regression" and "<var>_ARIMA" columns for all rows at once.
    country_rows = [
        {'country_name': country, 'year': forecast_year}
        for forecast_year in forecast_year_values
    ]

    for var in variables:
        lin_key = f'{var}_Linear Regression'
        arima_key = f'{var}_ARIMA'

        variable_data = country_data[['year', var]].dropna()
        X = variable_data['year'].values.reshape(-1, 1)
        y = variable_data[var].values

        # Not enough observations to fit anything
        if len(y) < 2:
            for row in country_rows:
                row[lin_key] = None
                row[arima_key] = None
            arima_models_info[(country, var)] = 'Brak danych do modelowania'
            continue

        # Linear regression on the calendar year: fitted once per series and
        # evaluated for every forecast year in a single call.
        lin_reg = LinearRegression()
        lin_reg.fit(X, y)
        lin_forecast = lin_reg.predict(np.array(forecast_year_values).reshape(-1, 1))
        for row, lin_value in zip(country_rows, lin_forecast):
            row[lin_key] = lin_value

        # ARIMA: the order search is expensive, so it also runs once per series.
        try:
            arima_model = auto_arima(y, seasonal=False, trace=False, error_action='ignore', suppress_warnings=True)
            best_order = arima_model.order
            arima_models_info[(country, var)] = f'ARIMA({best_order[0]}, {best_order[1]}, {best_order[2]})'

            # ARIMA steps count from the last *observed* value of this series.
            # When the series ends before the country's last year, the first
            # `gap` steps cover already-elapsed years and must be skipped so
            # that each value lands on the forecast year it belongs to.
            # NOTE(review): assumes one observation per year; missing years
            # inside the series are still collapsed by dropna().
            gap = int(max_year - variable_data['year'].max())
            arima_forecast = np.asarray(arima_model.predict(n_periods=gap + forecast_years))
            for year_offset, row in enumerate(country_rows, start=1):
                row[arima_key] = arima_forecast[gap + year_offset - 1]
        except Exception as e:
            # Best effort: keep the regression forecast and record why ARIMA failed.
            for row in country_rows:
                row[arima_key] = None
            arima_models_info[(country, var)] = f'Błąd: {str(e)}'

    results.extend(country_rows)




In [21]:
# Collect the per-country, per-year forecast rows into one table and show it.
forecast_df = pd.DataFrame(results)
display(forecast_df)

Unnamed: 0,country_name,year,total_fertility_rate_Linear Regression,total_fertility_rate_ARIMA,gdp_2015_dollar_Linear Regression,gdp_2015_dollar_ARIMA,gdp_per_capita_2015_dollar_Linear Regression,gdp_per_capita_2015_dollar_ARIMA,gdp_growth_Linear Regression,gdp_growth_ARIMA,...,percent_of_household_expenditure_in_gdp_Linear Regression,percent_of_household_expenditure_in_gdp_ARIMA,life_expectancy_at_birth_Linear Regression,life_expectancy_at_birth_ARIMA,infant_mortality_rate_Linear Regression,infant_mortality_rate_ARIMA,child_mortality_rate_Linear Regression,child_mortality_rate_ARIMA,female_percentage_in_labor_Linear Regression,female_percentage_in_labor_ARIMA
0,Argentina,2022,1.930744,1.832769,635558500000.0,579806500000.0,14104.620851,12444.31827,0.26015,2.512104,...,62.040861,62.990083,77.476185,75.506323,6.680645,8.3,7.439113,9.4,52.849785,49.339462
1,Argentina,2023,1.897846,1.800071,646179400000.0,589612900000.0,14256.721314,12444.31827,0.123668,2.512104,...,61.721144,62.990083,77.636428,75.622645,6.14254,8.0,6.831067,9.1,53.27807,50.868659
2,Argentina,2024,1.864948,1.762982,656800400000.0,599419400000.0,14408.821776,12444.31827,-0.012814,2.512104,...,61.401427,62.990083,77.79667,75.738968,5.604435,7.7,6.223021,8.8,53.706355,50.917447
3,Argentina,2025,1.83205,1.725893,667421400000.0,609225800000.0,14560.922239,12444.31827,-0.149296,2.512104,...,61.08171,62.990083,77.956912,75.85529,5.066331,7.4,5.614974,8.5,54.134639,51.787144
4,Argentina,2026,1.799152,1.688805,678042300000.0,619032300000.0,14713.022702,12444.31827,-0.285778,2.512104,...,60.761993,62.990083,78.117155,75.971613,4.528226,7.1,5.006928,8.2,54.562924,52.201635
5,Brazil,2022,1.434903,1.637691,1998131000000.0,1869774000000.0,9306.615149,8684.348171,1.547865,2.106493,...,62.938806,61.707867,76.101341,71.491,4.620565,12.921611,4.1625,14.181899,55.652378,52.333358
6,Brazil,2023,1.395164,1.636728,2033012000000.0,1899548000000.0,9412.846558,8765.544128,1.514008,2.106493,...,62.977094,61.82707,76.394125,70.232,3.333743,13.130852,2.611932,14.155813,55.804151,52.301112
7,Brazil,2024,1.355426,1.63811,2067893000000.0,1929323000000.0,9519.077967,8846.740084,1.480152,2.106493,...,63.015382,60.360142,76.686909,68.973,2.046921,13.427721,1.061364,14.222136,55.955923,52.274881
8,Brazil,2025,1.315687,1.641838,2102774000000.0,1959097000000.0,9625.309376,8927.936041,1.446296,2.106493,...,63.05367,60.794469,76.979692,67.714,0.760099,13.81222,-0.489205,14.380866,56.107695,52.253543
9,Brazil,2026,1.275948,1.647912,2137655000000.0,1988871000000.0,9731.540785,9009.131998,1.41244,2.106493,...,63.091958,61.388933,77.272476,66.455,-0.526723,14.284348,-2.039773,14.632005,56.259468,52.236186


In [None]:
# Show, for every (country, variable) pair, the selected ARIMA order or the
# reason why no model was fitted.
arima_models_info

{('Argentina', 'total_fertility_rate'): 'ARIMA(0, 1, 2)',
 ('Argentina', 'gdp_2015_dollar'): 'ARIMA(0, 1, 0)',
 ('Argentina', 'gdp_per_capita_2015_dollar'): 'ARIMA(0, 1, 0)',
 ('Argentina', 'gdp_growth'): 'ARIMA(0, 0, 0)',
 ('Argentina', 'physicians_per_1000'): 'ARIMA(1, 0, 0)',
 ('Argentina', 'hospital_beds_per_1000'): 'ARIMA(1, 0, 0)',
 ('Argentina', 'arable_land_hectare_per_person'): 'ARIMA(0, 1, 0)',
 ('Argentina', 'cpi'): 'Brak danych do modelowania',
 ('Argentina', 'crop_production_index'): 'ARIMA(2, 1, 2)',
 ('Argentina', 'food_production_index'): 'ARIMA(2, 1, 0)',
 ('Argentina', 'percent_of_household_expenditure_in_gdp'): 'ARIMA(0, 1, 0)',
 ('Argentina', 'life_expectancy_at_birth'): 'ARIMA(0, 1, 0)',
 ('Argentina', 'infant_mortality_rate'): 'ARIMA(0, 2, 0)',
 ('Argentina', 'child_mortality_rate'): 'ARIMA(0, 2, 0)',
 ('Argentina', 'female_percentage_in_labor'): 'ARIMA(1, 1, 0)',
 ('Brazil', 'total_fertility_rate'): 'ARIMA(0, 2, 1)',
 ('Brazil', 'gdp_2015_dollar'): 'ARIMA(0, 1, 0

### Wykresy

In [41]:
# Indicators to chart in the report.
# NOTE(review): this repeats the list defined before the forecasting loop;
# keep the two in sync or drop one of them.
variables = [
    'total_fertility_rate',
    'gdp_2015_dollar',
    'gdp_per_capita_2015_dollar',
    'gdp_growth',
    'physicians_per_1000',
    'hospital_beds_per_1000',
    'arable_land_hectare_per_person',
    'cpi',
    'crop_production_index',
    'food_production_index',
    'percent_of_household_expenditure_in_gdp',
    'life_expectancy_at_birth',
    'infant_mortality_rate',
    'child_mortality_rate',
    'female_percentage_in_labor',
]

In [None]:
from docx import Document
from docx.shared import Inches
import os
import matplotlib.pyplot as plt

# Temporary folder where chart images are written before being embedded in the report
temp_folder = "temp_images"
os.makedirs(temp_folder, exist_ok=True)

# Report document that the plotting function appends headings, text and charts to
doc = Document()
doc.add_heading('Raport z wynikami', level=1)

def plot_pred_and_save_to_doc(country_name, hist_full_df, pred_full_df, variables, arima_info,
                              document=None, image_dir=None):
    """Add one report section per variable for a single country.

    For every variable the section contains a heading, a line describing the
    ARIMA model, and a chart with the historical series plus the linear
    regression and ARIMA forecasts.

    Parameters
    ----------
    country_name : str
        Value of the ``country_name`` column selecting the rows to plot.
    hist_full_df : pandas.DataFrame
        Historical data with ``country_name``, ``year`` and one column per variable.
    pred_full_df : pandas.DataFrame
        Forecasts with ``country_name``, ``year`` and, for each variable, the
        columns ``"<var>_Linear Regression"`` and ``"<var>_ARIMA"``.
    variables : iterable of str
        Names of the variables to chart.
    arima_info : dict
        Maps ``(country_name, variable)`` to a model description; pairs that
        are missing are reported as "BRAK modelu ARIMA".
    document : optional
        Report document to append to. Defaults to the notebook-level ``doc``.
    image_dir : str, optional
        Folder for the intermediate PNG files. Defaults to the notebook-level
        ``temp_folder``.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if document is None:
        document = doc
    if image_dir is None:
        image_dir = temp_folder
    os.makedirs(image_dir, exist_ok=True)

    # Sort by year so the lines are always drawn in chronological order.
    hist_df = hist_full_df[hist_full_df['country_name'] == country_name].sort_values('year')
    pred_df = pred_full_df[pred_full_df['country_name'] == country_name].sort_values('year')

    for var in variables:
        arima_details = arima_info.get((country_name, var), "BRAK modelu ARIMA")
        document.add_heading(f"Kraj: {country_name}, Zmienna: {var}", level=2)
        document.add_paragraph(f"Model: {arima_details}")

        # Flag in the legend when there is no ARIMA forecast at all for this series.
        arima_label = 'Prognoza ARIMA'
        if pred_df[f'{var}_ARIMA'].isnull().all():
            arima_label += ' - BRAK'

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            # Historical observations
            ax.plot(
                hist_df['year'],
                hist_df[var],
                label='Dane',
                marker='o', color='blue', linestyle='-'
            )

            # Linear regression forecast
            ax.plot(
                pred_df['year'],
                pred_df[f'{var}_Linear Regression'],
                label='Prognoza KMNK',
                marker='o', color='green', linestyle='-'
            )

            # ARIMA forecast
            ax.plot(
                pred_df['year'],
                pred_df[f'{var}_ARIMA'],
                label=arima_label,
                marker='o', color='orange', linestyle='-'
            )

            ax.set_title(f'Dane historyczne i prognozy: {var} ({country_name})')
            ax.set_xlabel('Rok')
            ax.set_ylabel(var)
            ax.legend()
            ax.grid(True)

            # Save the chart so it can be embedded in the document
            image_path = os.path.join(image_dir, f"{country_name}_{var}.png")
            fig.savefig(image_path, format='png', bbox_inches='tight')
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

        document.add_picture(image_path, width=Inches(6))


output_file = "raport.docx"

try:
    # Append one section per country and variable to the report
    for country in countries:
        plot_pred_and_save_to_doc(
            country_name=country,
            hist_full_df=df,
            pred_full_df=forecast_df,
            variables=variables,
            arima_info=arima_models_info
        )

    # Save the document
    doc.save(output_file)
finally:
    # Remove the temporary chart images even when plotting or saving failed
    # (for example when the report file is locked by another program).
    for file in os.listdir(temp_folder):
        os.remove(os.path.join(temp_folder, file))
    os.rmdir(temp_folder)

print(f"Raport zapisano do pliku: {output_file}")


Kraj: Argentina, Zmienna: total_fertility_rate -> ARIMA(0, 1, 2)
Kraj: Argentina, Zmienna: gdp_2015_dollar -> ARIMA(0, 1, 0)
Kraj: Argentina, Zmienna: gdp_per_capita_2015_dollar -> ARIMA(0, 1, 0)
Kraj: Argentina, Zmienna: gdp_growth -> ARIMA(0, 0, 0)
Kraj: Argentina, Zmienna: physicians_per_1000 -> ARIMA(1, 0, 0)
Kraj: Argentina, Zmienna: hospital_beds_per_1000 -> ARIMA(1, 0, 0)
Kraj: Argentina, Zmienna: arable_land_hectare_per_person -> ARIMA(0, 1, 0)
Kraj: Argentina, Zmienna: cpi -> Brak danych do modelowania
Kraj: Argentina, Zmienna: crop_production_index -> ARIMA(2, 1, 2)
Kraj: Argentina, Zmienna: food_production_index -> ARIMA(2, 1, 0)
Kraj: Argentina, Zmienna: percent_of_household_expenditure_in_gdp -> ARIMA(0, 1, 0)
Kraj: Argentina, Zmienna: life_expectancy_at_birth -> ARIMA(0, 1, 0)
Kraj: Argentina, Zmienna: infant_mortality_rate -> ARIMA(0, 2, 0)
Kraj: Argentina, Zmienna: child_mortality_rate -> ARIMA(0, 2, 0)
Kraj: Argentina, Zmienna: female_percentage_in_labor -> ARIMA(1, 1,