# Data included

- isocntry: ISO 3166-1 alpha-3 country code
- d11: age exact
- sd18a: democracy satisfaction in country
- d72_2: my voice counts in country
- d1: left-right self-placement
- d25: type of community (urban/rural)
- netuse: internet use
- d8r1: age education ended
- qa8_2: trust in the European Commission

In [47]:
import pandas as pd

pd_ds = pd.read_stata('./ZA7848_v1-0-0.dta', convert_categoricals=False)#.astype(str)

# Print the labels of the columns
print(f"Columns: {pd_ds.columns.tolist()}")

LABEL = 'isocntry'
FEATURES = ['sd18a', 'd72_2', 'd1', 'd11', 'd25', 'netuse', 'qa8_2']
COLS = FEATURES + [LABEL]
NUMERIC_FEATURES = ['d11']
CATEGORICAL_FEATURES = [x for x in FEATURES if x not in NUMERIC_FEATURES]

pd_ds = pd_ds.drop(columns=[col for col in pd_ds.columns if col not in COLS])
original_ds = pd_ds.copy()
# Replace 'Not mentioned' with -1, "Yes" with 1, "No" with 0
# pd_ds = pd_ds.replace({'Not mentioned': -1, 'Yes': 1, 'No': 0})
#ds = datasets.Dataset.from_pandas(pd_ds)
TOTAL_NUMBER_OF_COUNTRIES = len(pd_ds['isocntry'].unique().tolist())
print(f"Total number of countries: {TOTAL_NUMBER_OF_COUNTRIES}")


Columns: ['studyno1', 'studyno2', 'doi', 'version', 'edition', 'survey', 'respondent_serial', 'serialid', 'uniqid', 'tnscntry', 'country', 'isocntry', 'eu27b', 'd11', 'd11r1', 'd11r2', 'q1_1', 'q1_2', 'q1_3', 'q1_4', 'q1_5', 'q1_6', 'q1_7', 'q1_8', 'q1_9', 'q1_10', 'q1_11', 'q1_12', 'q1_13', 'q1_14', 'q1_15', 'q1_16', 'q1_17', 'q1_18', 'q1_19', 'q1_20', 'q1_21', 'q1_22', 'q1_23', 'q1_24', 'q1_25', 'q1_26', 'q1_27', 'q1_28', 'q1_29', 'q1_30', 'q1_31', 'q1_32', 'q1_33', 'q1_34', 'q1_35', 'q1_36', 'q1_37', 'q1_38', 'q1_39', 'q1_40', 'q1_41', 'd70', 'd70a', 'd71_1', 'd71_2', 'd71_3', 'polintr_1', 'polintr_2', 'polintr_3', 'polintr_4', 'qa1_1', 'qa1_2', 'qa1_3', 'qa1_4', 'qa1_5', 'qa1_6', 'qa1_7', 'qa2_1', 'qa2_2', 'qa2_3', 'qa2_4', 'qa2_5', 'qa2_6', 'qa2_7', 'qa3_1', 'qa3_2', 'qa3_3', 'qa3_4', 'qa3_5', 'qa3_6', 'qa3_7', 'qa3_8', 'qa3_9', 'qa3_10', 'qa3_11', 'qa3_12', 'qa3_13', 'qa3_14', 'qa3_15', 'qa3_16', 'qa3_17', 'qa3_18', 'qa4_1', 'qa4_2', 'qa4_3', 'qa4_4', 'qa4_5', 'qa4_6', 'qa4_7', '

In [48]:
import torch
import torch.nn as nn

#pd_ds = pd_ds[:100]

# Transform the dataset into a format that can be used by the model
pd_ds = pd_ds[~pd_ds.isin(['DK (SPONT.)']).any(axis=1)]
# Remove the categories 'Refusal/Implausible', '15 years', '98 years' from the column 'd11'
pd_ds = pd_ds[~pd_ds['d11'].isin(['Refusal/Implausible', '15 years', '98 years'])]
# Remove the category 'Refusal (SPONT.)' from the column 'd1'
pd_ds = pd_ds[~pd_ds['d1'].isin(['Refusal (SPONT.)'])]

mappings = {}

with pd.option_context('display.max_colwidth', None, 'display.max_rows', 3, 'display.max_columns', None):
    display(pd_ds.head(5))
    for col in pd_ds.columns:
        #display(pd_ds.value_counts(col))
        # Display the string of the category and the corresponding code in a table
        display(pd_ds[col].astype('category').cat.categories)
        original_series = pd_ds[col]
        converted_series = pd_ds[col].astype('category').cat.codes
        # Store the mapping of the original series and the converted series
        mappings[col] = dict(zip(original_series, converted_series))
        # Display the original series and the converted series side by side, sorted by the converted series
        # Only display unique values on the first column
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            display(pd.concat([original_series, converted_series], axis=1).drop_duplicates(subset=converted_series.name, keep='first').sort_values(by=0))

Unnamed: 0,isocntry,d11,qa8_2,sd18a,d72_2,d1,d25,netuse
0,AL,62,1,2,3,5,3.0,2
...,...,...,...,...,...,...,...,...
4,AL,52,2,3,1,1,3.0,1


Index(['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CY', 'CY-TCC', 'CZ', 'DE-E',
       'DE-W', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE',
       'IS', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT',
       'RO', 'RS', 'RS-KM', 'SE', 'SI', 'SK', 'TR'],
      dtype='object')

Unnamed: 0,isocntry,0
0,AL,0
1014,AT,1
2033,BA,2
3039,BE,3
3912,BG,4
28683,CH,5
4948,CY,6
26119,CY-TCC,7
31330,CZ,8
5462,DE-E,9


Int64Index([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
            66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
            83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99],
           dtype='int64')

Unnamed: 0,d11,0
1053,15,0
1137,16,1
396,17,2
30,18,3
25,19,4
15,20,5
7,21,6
47,22,7
189,23,8
31,24,9


Int64Index([1, 2, 3], dtype='int64')

Unnamed: 0,qa8_2,0
0,1,0
1,2,1
105,3,2


Int64Index([1, 2, 3, 4, 5], dtype='int64')

Unnamed: 0,sd18a,0
21,1,0
0,2,1
2,3,2
26,4,3
1250,5,4


Int64Index([1, 2, 3, 4, 5], dtype='int64')

Unnamed: 0,d72_2,0
4,1,0
1,2,1
0,3,2
22,4,3
1024,5,4


Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 97, 98], dtype='int64')

Unnamed: 0,d1,0
2,1,0
20,2,1
19,3,2
27,4,3
0,5,4
18,6,5
23,7,6
31,8,7
17,9,8
3,10,9


Float64Index([1.0, 2.0, 3.0], dtype='float64')

Unnamed: 0,d25,0
8664,,-1
44,1.0,0
5,2.0,1
0,3.0,2


Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64')

Unnamed: 0,netuse,0
1,1,0
0,2,1
243,3,2
336,4,3
733,5,4
398,6,5
245,7,6


In [49]:
# Replace the string values with their corresponding codes
for col in CATEGORICAL_FEATURES:
    pd_ds[col].astype('category').cat.remove_unused_categories()
    pd_ds[col] = pd_ds[col].astype('category').cat.codes

pd_ds[LABEL].astype('category').cat.remove_unused_categories()
pd_ds[LABEL] = pd_ds[LABEL].astype('category').cat.codes

# Convert the dataframe column 'isocntry' into a one-hot encoded tensor
#pd_ds['isocntry'] = pd_ds['isocntry'].map(lambda x: torch.nn.functional.one_hot(torch.tensor(x), num_classes=28))

# Move the column 'country' to the end of the dataframe
#countries = pd_ds['country']
#pd_ds = pd_ds.drop(columns=['country'])
#pd_ds['country'] = countries

# Covert the age column 'd11' into an integer
for col in NUMERIC_FEATURES:
    pd_ds[col] = pd_ds[col].astype('category').cat.remove_unused_categories()
    pd_ds[col] = pd_ds[col].astype('int')

with pd.option_context('display.max_colwidth', None, 'display.max_rows', None, 'display.max_columns', None):
    display(pd_ds.head(1))

Unnamed: 0,isocntry,d11,qa8_2,sd18a,d72_2,d1,d25,netuse
0,0,62,0,1,2,4,2,1


In [50]:
# Perform normalization of the columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for col in FEATURES:
    pd_ds[col] = scaler.fit_transform(pd_ds[col].values.reshape(-1, 1))

with pd.option_context('display.max_colwidth', None, 'display.max_rows', None, 'display.max_columns', None):
    display(pd_ds.head(5))

Unnamed: 0,isocntry,d11,qa8_2,sd18a,d72_2,d1,d25,netuse
0,0,0.559524,0.0,0.25,0.5,0.363636,1.0,0.166667
1,0,0.535714,0.5,0.25,0.25,0.363636,1.0,0.0
2,0,0.25,0.5,0.5,0.25,0.0,1.0,0.0
3,0,0.5,0.5,0.25,0.25,0.818182,1.0,0.0
4,0,0.440476,0.5,0.5,0.0,0.0,1.0,0.0


In [51]:
# Delete the age and political party columns
#pd_ds = pd_ds.drop(columns=['d11', 'd1', 'd25', 'd17', 'd72_2', 'sd18a', 'netuse'])
#pd_ds['country'] = pd_ds['isocntry'] / 28
# Generate a random number between 0 and 27 for each row
#pd_ds['country'] = pd_ds['isocntry'].map(lambda x: torch.randint(0, 28, (1,)).item())
#pd_ds['isocntry'] = pd_ds['country']

In [52]:
# Create a NN with 2 hidden layers of 50 neurons each
# The model outputs a one-hot vector of size 28
# It takes as input a vector of 6 features

DROPOUT = 0.2

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Number of columns in the dataset minus the column 'isocntry'
        num_features = len(pd_ds.columns) - 1
        self.fc1 = nn.Linear(num_features, 150)
        self.fc2 = nn.Linear(150, 150)
        self.fc3 = nn.Linear(150, 150)
        self.fc4 = nn.Linear(150, TOTAL_NUMBER_OF_COUNTRIES)
        self.activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)
        self.dropout = nn.Dropout(p=DROPOUT)

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        #x = self.dropout(x)
        x = self.fc2(x)
        x = self.activation(x)
        #x = self.dropout(x)
        #x = self.fc3(x)
        #x = self.activation(x)
        #x = self.dropout(x)
        x = self.fc4(x)
        #x = self.activation(x)
        return x
    
net = Net()

In [53]:
# Generate train and test splits
from sklearn.model_selection import train_test_split
train, test = train_test_split(pd_ds, test_size=0.2)

In [54]:
# Train the net
import torch.optim as optim
torch.manual_seed(0)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(net.parameters(), lr=0.001, weight_decay=0.01)

MINIBATCH_SIZE = 100

for epoch in range(3):  # loop over the dataset multiple times

    for i in range(0, len(train), MINIBATCH_SIZE):
        loss = 0
        # zero the parameter gradients
        optimizer.zero_grad()
        for j in range(i, min(i + MINIBATCH_SIZE, len(train))):
            # get the inputs; data is a list of [inputs, labels]
            inputs = train.iloc[j, 1:].tolist()
            inputs = torch.tensor(inputs, dtype=torch.float)
            label = train.iloc[j, 0]
            label = torch.tensor(label, dtype=torch.long)

            # forward + backward + optimize
            outputs = net(inputs)
            loss += criterion(outputs, label)
        #print(inputs, outputs, labels, loss)
        loss.backward()
        optimizer.step()

        # print statistics
        if (i / MINIBATCH_SIZE) % 15 == 0:
            print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, loss / MINIBATCH_SIZE))
    
    # Evaluate
    correct = 0
    total = 0
    with torch.no_grad():
        for i in range(len(test)):
            inputs = test.iloc[i, 1:].tolist()
            inputs = torch.tensor(inputs, dtype=torch.float)
            label = test.iloc[i, 0]
            label = torch.tensor(label, dtype=torch.long)
            outputs = net(inputs)
            # Get the three highest values
            _, predicted = torch.topk(outputs.data, 1, dim=0)
            # Check if the correct label is in the three highest values
            if label in predicted:
                correct += 1
            total += 1
    
    print('Accuracy of the network on the test set: %d %%' % (
        100 * correct / total))

print('Finished Training')

[1,     1] loss: 3.691
[1,  1501] loss: 3.654
[1,  3001] loss: 3.627
[1,  4501] loss: 3.642
[1,  6001] loss: 3.604
[1,  7501] loss: 3.604
[1,  9001] loss: 3.517
[1, 10501] loss: 3.605
[1, 12001] loss: 3.485
[1, 13501] loss: 3.509
[1, 15001] loss: 3.442
[1, 16501] loss: 3.522
[1, 18001] loss: 3.445
[1, 19501] loss: 3.426
[1, 21001] loss: 3.420
[1, 22501] loss: 3.518
[1, 24001] loss: 3.364
[1, 25501] loss: 3.467
[1, 27001] loss: 3.311
[1, 28501] loss: 3.317
Accuracy of the network on the test set: 10 %
[2,     1] loss: 3.363
[2,  1501] loss: 3.243
[2,  3001] loss: 3.239
[2,  4501] loss: 3.382
[2,  6001] loss: 3.380
[2,  7501] loss: 3.409
[2,  9001] loss: 3.304
[2, 10501] loss: 3.467
[2, 12001] loss: 3.345
[2, 13501] loss: 3.382
[2, 15001] loss: 3.286
[2, 16501] loss: 3.435
[2, 18001] loss: 3.318
[2, 19501] loss: 3.324
[2, 21001] loss: 3.283
[2, 22501] loss: 3.440
[2, 24001] loss: 3.289
[2, 25501] loss: 3.387
[2, 27001] loss: 3.246
[2, 28501] loss: 3.228
Accuracy of the network on the tes

In [55]:
# Test the net
dataiter = iter(pd_ds)

In [41]:
# Export the model to a torchscript model
traced_script_module = torch.jit.trace(net, inputs)
traced_script_module.save("model.pt")

In [42]:
test.head(15)

Unnamed: 0,isocntry,d11,qa8_2,sd18a,d72_2,d1,d25,netuse
9296,15,0.559524,0.5,0.75,0.75,0.363636,0.666667,0.0
992,0,0.261905,0.0,0.5,0.25,0.363636,1.0,0.0
29034,12,0.47619,0.0,0.25,0.5,0.363636,1.0,0.0
28456,35,0.166667,0.0,0.5,0.25,0.363636,0.333333,0.0
7331,11,0.559524,1.0,0.0,0.0,0.363636,0.666667,0.0
12359,19,0.833333,0.0,0.25,0.5,0.454545,0.666667,0.833333
24667,38,0.47619,0.5,0.5,0.25,0.363636,0.666667,0.0
4245,4,0.392857,0.0,0.5,0.25,0.454545,0.333333,0.0
212,0,0.142857,0.0,0.5,0.5,0.363636,1.0,0.0
34942,32,0.511905,0.0,0.5,0.25,0.363636,0.666667,0.0


In [73]:
# Build an interactive widget to test the model
import ipywidgets as widgets
from IPython.display import display

"""
Original features:
- isocntry: ISO 3166-1 alpha-3 country code
- d11: age exact
- sd18a: democracy satisfaction in country
- d72_2: my voice counts in country
- d1: left-right self-placement
- d25: type of community (urban/rural)
- netuse: internet use
- d8r1: age education ended
- qa8_2: trust in the European Commission
"""

# Create a slider for the age
age_slider = widgets.IntSlider(
    value=20,
    min=18,
    max=98,
    step=1,
    description='Age:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)
display(age_slider)

categorical_dropdowns = []
for col in CATEGORICAL_FEATURES:
    # Create a dropdown for the column
    col_dropdown = widgets.Dropdown(
        # The options are the unique values of the column 'd1'
        options=mappings[col].keys(),
        description=f'{col}:',
        disabled=False,
    )
    display(col_dropdown)
    categorical_dropdowns.append(col_dropdown)

# Create a button to run the model
button = widgets.Button(description="Run model")
display(button)

# Create a text widget to display the result
text = widgets.Textarea(
    value='',
    placeholder='',
    description='',
    disabled=True,
    layout=widgets.Layout(width='60%', height='500px')
)
display(text)

# Create a function to run the model
def on_button_clicked(b):
    text.value = f"Predicting..."

    # Get the values of the widgets
    age = age_slider.value
    # Normalize the values
    age = (age - original_ds['d11'].min()) / (original_ds['d11'].max() - original_ds['d11'].min())
    categorical_values = [x.value for x in categorical_dropdowns]
    # Use the mapping to convert the string values to their corresponding codes
    categorical_values = [mappings[col][x] for col, x in zip(CATEGORICAL_FEATURES, categorical_values)]
    # Normalize the values
    categorical_values = [(x - original_ds[col].min()) / (original_ds[col].max() - original_ds[col].min()) for col, x in zip(CATEGORICAL_FEATURES, categorical_values)]
    # Create a list of the values
    inputs = [age] + categorical_values
    print(f"Inputs: {inputs}")
    inputs_tensor = torch.tensor(inputs, dtype=torch.float)
    
    # Run the model
    outputs = net(inputs_tensor)
    # Get the three highest values
    _, predicted = torch.topk(outputs.data, 3, dim=0)
    #predicted = torch.argmax(outputs.data)
    #predicted = predicted.item()

    countries_names_ordered_by_mapping = list(mappings[LABEL].keys())
    countries_names_ordered_by_mapping.sort(key=lambda x: mappings[LABEL][x])
    all_probabilities = [f"{x} ({mappings[LABEL][x]}): {round(y, 2)}" for x, y in zip(countries_names_ordered_by_mapping, outputs.data.tolist())]
    formatted_probabilities = '\n'.join(all_probabilities)
    
    # Display the result
    # Use the mappings to convert the codes back to their corresponding string values. This is a reverse lookup: we need to find the key by the value
    predicted_str = [list(mappings[LABEL].keys())[list(mappings[LABEL].values()).index(x)] for x in predicted]
    text.value = f"Predicted countries: {predicted_str}\n\nAll probabilities:\n{formatted_probabilities}"

button.on_click(on_button_clicked)

IntSlider(value=20, continuous_update=False, description='Age:', max=98, min=18)

Dropdown(description='sd18a:', options=(2, 3, 1, 4, 5), value=2)

Dropdown(description='d72_2:', options=(3, 2, 1, 4, 5), value=3)

Dropdown(description='d1:', options=(5, 1, 10, 9, 6, 3, 2, 7, 4, 8, 98, 97), value=5)

Dropdown(description='d25:', options=(3.0, 2.0, 1.0, nan, nan, nan, nan, nan), value=3.0)

Dropdown(description='netuse:', options=(2, 1, 3, 7, 4, 6, 5), value=2)

Dropdown(description='qa8_2:', options=(1, 2, 3), value=1)

Button(description='Run model', style=ButtonStyle())

Textarea(value='', disabled=True, layout=Layout(height='500px', width='60%'), placeholder='')

Inputs: [0.05952380952380952, 0.0, 0.25, 0.030927835051546393, 0.5, 0.0, -0.5]
Inputs: [0.42857142857142855, 0.0, 0.0, 0.030927835051546393, -0.5, 0.3333333333333333, -0.5]


In [70]:
list(mappings[LABEL].keys())

['AL',
 'AT',
 'BA',
 'BE',
 'BG',
 'CY',
 'DE-W',
 'DE-E',
 'DK',
 'ES',
 'FI',
 'FR',
 'GR',
 'HR',
 'HU',
 'IE',
 'IT',
 'LT',
 'LU',
 'LV',
 'ME',
 'MK',
 'MT',
 'NL',
 'PL',
 'PT',
 'RO',
 'RS',
 'SI',
 'SK',
 'SE',
 'CY-TCC',
 'TR',
 'RS-KM',
 'CH',
 'NO',
 'EE',
 'IS',
 'CZ',
 'GB']