# PARSING PDF FILE TO EXTRACT INFORMATION ABOUT VARIABLES AND INSERT INTO METADATA TABLE

In [82]:
import pandas as pd

In [83]:
import os

In [84]:
import openpyxl

In [85]:
import numpy as np
import cvxpy as cp

In [86]:
file_path = "../../GIT/demo-Turin-EuSilc/02_DataSet/" # Set the directory of the EU-SULC files

In [87]:
df = pd.read_excel(os.path.join(file_path,"Calibration_variables.xlsx"))

In [88]:
df.head

<bound method NDFrame.head of               id  classe_eta  genere
0     1571500001           3       2
1     1571500002           5       2
2     1571560001           4       1
3     1571560002           4       2
4     1571680001           4       2
...          ...         ...     ...
4271  2425930002           5       1
4272  2425960001           3       2
4273  2425960002           1       2
4274  2425960003           1       1
4275  2425960004           4       1

[4276 rows x 3 columns]>

In [145]:
np.random.seed(42)
n = len(df)
print(n)

4276


In [90]:
df['design_weight'] = 1.0

In [91]:
df['classe_eta'] = df['classe_eta'].astype(str)
df['genere'] = df['genere'].astype(str)
df['group'] = df['classe_eta'] + "_" + df['genere']

In [92]:
X = pd.get_dummies(df['group'])
print(X)

      1_1  1_2  2_1  2_2  3_1  3_2  4_1  4_2  5_1  5_2
0       0    0    0    0    0    1    0    0    0    0
1       0    0    0    0    0    0    0    0    0    1
2       0    0    0    0    0    0    1    0    0    0
3       0    0    0    0    0    0    0    1    0    0
4       0    0    0    0    0    0    0    1    0    0
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
4271    0    0    0    0    0    0    0    0    1    0
4272    0    0    0    0    0    1    0    0    0    0
4273    0    1    0    0    0    0    0    0    0    0
4274    1    0    0    0    0    0    0    0    0    0
4275    0    0    0    0    0    0    1    0    0    0

[4276 rows x 10 columns]


In [93]:
calibration_vars = X.columns.tolist()

In [223]:
population_totals = {
    '1_1': 120,
    '1_2': 110,
    '2_1': 100,
    '2_2': 130,
    '3_1': 140,
    '3_2': 150,
    '4_1': 125,
    '4_2': 135,
    '5_1': 115,
    '5_2': 145
}
t = np.array([population_totals[var] for var in calibration_vars])
print(t)

[120 110 100 130 140 150 125 135 115 145]


In [224]:
# Logit bounds: 0.01 ≤ g ≤ 4
lower = 0.01
upper = 4

In [225]:
g = cp.Variable(n)
print(g)

var505


In [226]:
A = X.values  # (n x k)
print(A)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [227]:
w = df['design_weight'].values  # (n, )
print(w)

[1. 1. 1. ... 1. 1. 1.]


In [228]:
constraints = [
    g >= lower,
    g <= upper,
    A.T @ cp.multiply(g, w) == t
]
print(constraints)

[Inequality(Constant(CONSTANT, NONNEGATIVE, ())), Inequality(Variable((4276,), var505)), Equality(Expression(AFFINE, UNKNOWN, (10,)), Constant(CONSTANT, NONNEGATIVE, (10,)))]


In [236]:
objective = cp.Minimize(cp.sum(cp.kl_div(g, np.ones(n))))

In [235]:
problem = cp.Problem(objective, constraints)
problem.solve(verbose=True)

                                     CVXPY                                     
                                     v1.6.4                                    
(CVXPY) Apr 12 07:57:22 PM: Your problem has 4276 variables, 8562 constraints, and 0 parameters.
(CVXPY) Apr 12 07:57:22 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Apr 12 07:57:22 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Apr 12 07:57:22 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Apr 12 07:57:22 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Apr 12 07:57:22 PM: Compiling problem (target solver=CLARAB

1659.7388954082844

In [234]:
df['calibrated_weight'] = g.value * df['design_weight']

In [233]:
print("\nConfronto tra margini campionari e obiettivi:")
sample_totals = A.T @ w
calibrated_totals = A.T @ df['calibrated_weight'].values
results = pd.DataFrame({
    'Group': calibration_vars,
    'Target': t,
    'Original': sample_totals,
    'Calibrated': calibrated_totals
})
print(results)



Confronto tra margini campionari e obiettivi:
  Group  Target  Original  Calibrated
0   1_1     120     228.0       120.0
1   1_2     110     236.0       110.0
2   2_1     100     122.0       100.0
3   2_2     130     122.0       130.0
4   3_1     140     438.0       140.0
5   3_2     150     430.0       150.0
6   4_1     125     632.0       125.0
7   4_2     135     668.0       135.0
8   5_1     115     594.0       115.0
9   5_2     145     806.0       145.0
