# NN 2.0

## Defining data functions

Import needed libraries

In [2]:
# imports
import pandas as pd
import tensorflow as tf
import tensorflow.keras as ks
import numpy as np

Get data from file and remove columns with text

In [3]:
def get_swissvotes_data()->pd.DataFrame:
    import re
    
    dataset = pd.read_csv("../data/formatted/swissvotes_dataset_after_1900_utf8.csv", sep=';')
    
    regex = re.compile("pdev_.*")
    to_excl = list(filter(regex.match, dataset.columns))
    
    dataset.drop(columns=to_excl, inplace=True)
    dataset.drop(columns=["legisjahr"], inplace=True)
    dataset.drop(columns=["titel_kurz_d", "titel_kurz_f", "titel_off_d", "titel_off_f", "stichwort"], inplace=True)
    dataset.drop(columns=["swissvoteslink", "anzahl", "anneepolitique", "bkchrono_de", "bkchrono_fr"], inplace=True)
    dataset.drop(columns=["curiavista_de", "curiavista_fr", "urheber", "bkresults_de", "bkresults_fr"], inplace=True)
    dataset.drop(columns=["bfsmap_de", "bfsmap_fr", "nach_cockpit_d", "nach_cockpit_f", "nach_cockpit_e"], inplace=True)
    dataset = dataset[dataset["anr"] < 646] # we don't care about future votes
    
    return dataset
print(f"Defined {get_swissvotes_data}")

Defined <function get_swissvotes_data at 0x0000018694A9D280>


In [4]:
def get_rechtsform_onehot(data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    tensor = tf.one_hot(data["rechtsform"], 5).numpy();
    result = pd.DataFrame(tensor, columns=["ref_obl", "ref_fak", "initiative", "gegen_entw", "stichfr"], index=data.index)
    
    return result.astype(int)
print(f"Defined {get_rechtsform_onehot}")

Defined <function get_rechtsform_onehot at 0x00000186A1646F70>


In [5]:
def get_politikbereich_multihot(data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    polber = data[["d1e1", "d2e1", "d3e1"]]
    polber = polber.replace('.', 0)
    polber = polber.astype(int)
    
    # the names of the columns (they're a bit long)
    cols = ["Staatsordnung", "Aussenpolitik", "Sicherheitspolitik", "Wirtschaft"]
    cols += ["Landwirtschaft", "Öffentliche Finanzen", "Energie", "Verkehr und Infrastruktur"]
    cols += ["Umwelt und Lebensraum", "Sozialpolitik", "Bildung und Forschung", "Kultur, Religion, Medien"]
    
    result = pd.DataFrame(columns=cols, index = data.index)
    for i in range(len(result)):
        row = np.zeros(12)
        for p in polber.iloc[i]:
            if p != 0:
                row[p-1] = 1
        result.iloc[i] = row
    return result.astype(int)

print(f"Defined {get_politikbereich_multihot}")

Defined <function get_politikbereich_multihot at 0x00000186A171FF70>


In [6]:
def get_department_onehot(data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    dep_single = data["dep"].replace('.', 2) # voting at age 18 is the only vote with a '.' and it's dep of inner
    dep_single = dep_single.astype(int)
    tensor = tf.one_hot(dep_single, 8).numpy()
    result = pd.DataFrame(tensor, columns=["EDA", "EDI", "EJPD", "VBS", "EFD", "WBF", "UVEK", "BK"], index=data.index)
    
    return result.astype(int)

print(f"Defined {get_department_onehot}")

Defined <function get_department_onehot at 0x00000186A16DFA60>


In [7]:
def get_bundesrat_onehot(data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    tensor = tf.one_hot(data["br_pos"].replace('.', 3).astype(int), 3).numpy()
    result = pd.DataFrame(tensor, columns=["Für", "Dagegen", "Keine"], index=data.index)
    return result.astype(int)

print(f"Defined {get_bundesrat_onehot}")

Defined <function get_bundesrat_onehot at 0x00000186A196FEE0>


In [8]:
def get_legislatur(low:int, high:int, data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    leg = data["legislatur"]
    def my_map(x:int, x_min:int=leg.min(0), x_max:int=leg.max(0), y_min:int=low, y_max:int=high)->float:
        return (x-x_min)/(x_max-x_min)*(y_max-y_min)+y_min
    
    normalized = data[["legislatur"]].applymap(my_map)
    return normalized

print(f"Defined {get_legislatur}")

Defined <function get_legislatur at 0x00000186A1B15EE0>


In [9]:
def get_parlament_onehot(data:pd.DataFrame = get_swissvotes_data()["nr_pos"])->pd.DataFrame:
    tensor = tf.one_hot(data.astype(int), 3).numpy()
    result = pd.DataFrame(tensor, columns=["Für", "Dagegen", "Keine"], index=data.index)
    return result.astype(int)

print(f"Defined {get_parlament_onehot}")

Defined <function get_parlament_onehot at 0x00000186A1B91F70>


In [10]:
def get_parties(data:pd.DataFrame = get_swissvotes_data())->list:
    import re
    
    regex_incl = re.compile("p_.*")
    regex_excl = re.compile("p_others_.*")
    
    parties_pre = list(filter(regex_incl.match, data.columns))
    parties = [p for p in parties_pre if not regex_excl.match(p)]
    return parties

print(f"Defined {get_parties}")

Defined <function get_parties at 0x00000186A1B153A0>


In [11]:
def normalize_party_reco(data:pd.DataFrame = get_swissvotes_data(), names:list = get_parties())->pd.DataFrame:
    # deal with unwanted values first
    normalized = data[names].replace(".", 0)
    normalized.replace(np.nan, 0, inplace=True)
    normalized = normalized.astype(int)
    normalized.replace([3,4,5,66,9999], 0, inplace=True)
    
    result = pd.DataFrame(index=normalized.index)
    
    for p in names: # go through parties and create one hot encoding
        tensor = tf.one_hot(normalized[p], 3).numpy()
        temp = pd.DataFrame(tensor, columns=[p+"_neutral", p+"_ja", p+"_nein"], index=result.index)
        result = result.join(temp)

    return result.astype(int)
print(f"Defined {normalize_party_reco}")

Defined <function normalize_party_reco at 0x00000186A1E06EE0>


In [12]:
def get_vote_result(data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    result = data["annahme"].replace('.', 0)
    return result.astype(int)

print(f"Defined {get_vote_result}")

Defined <function get_vote_result at 0x00000186A211FAF0>


In [13]:
# creates a dataframe 
def get_canton_results(data:pd.DataFrame = get_swissvotes_data())->pd.DataFrame:
    import re
    regex = re.compile(".*_annahme")
    canton_names = list(filter(regex.match, data.columns))
    return data[canton_names].replace('.', 0).astype(int)

print(f"Defined {get_canton_results}")

Defined <function get_canton_results at 0x00000186A215FEE0>


## Training the net

In [14]:
# get data
def get_data():
    swissvotes = get_swissvotes_data()
    # the inputs used by the neural net are:
        # Rechtsform (one hot),
        # Politikbereich (multi hot),
        # Department (one hot),
        # Position of the Bundesrat (one hot),
        # legislatur (normalized from 1-10),
        # Position of Nationalrat (one hot),
        # Position of Ständerat (one hot),
        # Party recommendations (one hot)
    in_rchtfrm = get_rechtsform_onehot()
    in_poltber = get_politikbereich_multihot()
    in_deprtmt = get_department_onehot()
    in_burapos = get_bundesrat_onehot()
    in_narapos = get_parlament_onehot()
    in_strapos = get_parlament_onehot(swissvotes["sr_pos"])
    in_parties = normalize_party_reco()
    
    inputs = pd.concat([in_rchtfrm, in_poltber, in_deprtmt, in_burapos, in_narapos, in_strapos, in_parties], axis=1)
    
    # the outputs are:
        # result of the votes (binary),
        # result on a canton level (binary)
    out_result = get_vote_result()
    out_canton = get_canton_results()
    
    outputs = pd.concat([out_result, out_canton], axis=1)
    
    return swissvotes, inputs, outputs

print(f"Defined {get_data}")

Defined <function get_data at 0x00000186A215F820>


In [15]:
def create_model(input_size:int = len(get_data()[1].columns), hidden:list=[100, 50, 20],
                 output_size:int = len(get_data()[2].columns), activation:str="relu",
                 activation_output:str="sigmoid", 
                 optimizer=ks.optimizers.SGD(learning_rate=0.1), 
                 loss=ks.losses.BinaryCrossentropy())->ks.models.Sequential:
    model = ks.models.Sequential()
    
    model.add(ks.layers.Dense(units=input_size, activation=activation, name="Input"))
    
    for i in range(len(hidden)):
        model.add(ks.layers.Dense(units=hidden[i], activation=activation, name="Hidden_"+str(i)))
        model.add(ks.layers.Dropout(rate=.1, name="Dropout_"+str(i)))
        
    model.add(ks.layers.Dense(units=output_size, activation=activation_output, name="Output"))
    
    model.compile(optimizer=optimizer, loss=loss, metrics=[ks.metrics.BinaryAccuracy(), 
                                                           ks.metrics.FalseNegatives()])
    
    return model

print(f"Defined {create_model}")

Defined <function create_model at 0x00000186A365C8B0>


In [16]:
def train_model(model:ks.models.Sequential, inputs:pd.DataFrame=get_data()[1], 
                outputs:pd.DataFrame=get_data()[2], test_size:float=0.6, 
                batch_size:int=50, epochs:int=75, shuffle:bool=True)->tuple:
    from sklearn.model_selection import train_test_split as tss
    in_train, in_test, out_train, out_test = tss(inputs, outputs, test_size=test_size)
    
    history = model.fit(x=in_train, y=out_train, batch_size=batch_size, epochs=epochs, shuffle=shuffle)
    
    return history, in_test, out_test

print(f"Defined {train_model}")

Defined <function train_model at 0x00000186A215F790>


In [17]:
model = create_model(hidden=[10, 20], 
                     loss=ks.losses.MeanAbsoluteError(), activation_output="sigmoid")

history, in_test, out_test = train_model(model, epochs=125)

print(model.summary())

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125


Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78/125
Epoch 79/125
Epoch 80/125
Epoch 81/125
Epoch 82/125
Epoch 83/125
Epoch 84/125
Epoch 85/125
Epoch 86/125
Epoch 87/125
Epoch 88/125
Epoch 89/125
Epoch 90/125
Epoch 91/125
Epoch 92/125
Epoch 93/125
Epoch 94/125
Epoch 95/125
Epoch 96/125
Epoch 97/125
Epoch 98/125
Epoch 99/125
Epoch 100/125
Epoch 101/125
Epoch 102/125
Epoch 103/125
Epoch 104/125
Epoch 105/125
Epoch 106/125
Epoch 107/125
Epoch 108/125
Epoch 109/125
Epoch 110/125
Epoch 111/125
Epoch 112/125
Epoch 113/125
Epoch 114/125
Epoch 115/125
Epoch 116/125
Epoch 117/125
Epoch 118/125
Epoch 119/125
Epoch 120/125
Epoch 121/125


Epoch 122/125
Epoch 123/125
Epoch 124/125
Epoch 125/125
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (Dense)                (None, 172)               29756     
_________________________________________________________________
Hidden_0 (Dense)             (None, 10)                1730      
_________________________________________________________________
Dropout_0 (Dropout)          (None, 10)                0         
_________________________________________________________________
Hidden_1 (Dense)             (None, 20)                220       
_________________________________________________________________
Dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
Output (Dense)               (None, 27)                567       
Total params: 32,273
Trainable params: 32,273
Non-trainable params

In [18]:
model.evaluate(x=in_test, y=out_test)



[0.30789774656295776, 0.7117992639541626, 692.0]

In [22]:
get_data()[1]

Unnamed: 0,ref_obl,ref_fak,initiative,gegen_entw,stichfr,Staatsordnung,Aussenpolitik,Sicherheitspolitik,Wirtschaft,Landwirtschaft,...,p_endk_nein,p_fdk_neutral,p_fdk_ja,p_fdk_nein,p_edk_neutral,p_edk_ja,p_edk_nein,p_bpuk_neutral,p_bpuk_ja,p_bpuk_nein
0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,1,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,0,0,0,1,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,0,0,1,0,0,0,1,0,1,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
605,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
606,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
607,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
