## Load data

In [4]:
#load data
import pandas as pd

original_data_canton_FR = pd.read_excel(r'../dataset/raw/Dep_15_Resultats_T1_complet.xlsx', sheet_name='Cantons', header=2)
# original_data_Bvot_FR        = pd.read_csv('../dataset/inputs/XDataFR_Bvot.csv', sep=';')
# original_data_Bvot_targets_FR = pd.read_csv('../dataset/labels/yDataFR_Bvot.csv', sep=';')

data_canton_FR = original_data_canton_FR.copy()
# data_Bvot_FR   = original_data_Bvot_FR.copy()
# data_Bvot_targets_FR = original_data_Bvot_targets_FR.copy()

In [5]:
################################# fonction utile #############################
def saveData(data, loc):
    # save cher_data as excel
    writer = pd.ExcelWriter(loc)
    
    # write dataframe to excel
    data.to_excel(writer)

    # save the excel
    writer.save()

############################# Format des donnees brute ##############################

def getNbBinomes(data):
    return len([header for header in data.columns if "Binôme" in header])

def explodeLines(data):
    initdf = data[['Code du département', 'Libellé du département', 'Code du canton', 
            'Libellé du canton', 'Inscrits', 'Abstentions', '% Abs/Ins', 'Votants',
            '% Vot/Ins', 'Blancs', '% Blancs/Ins', '% Blancs/Vot', 'Nuls', '% Nuls/Ins',
            '% Nuls/Vot', 'Exprimés', '% Exp/Ins', '% Exp/Vot']]

    headers = ['N°Panneau', 'Nuance', 'Binôme', 'Sièges', 'Voix', '% Voix/Ins', '% Voix/Exp']
    
    df = pd.DataFrame()
    for i in range(getNbBinomes(data)):
        partidf = data[[h+'.'+str(i) if i!=0 else h for h in headers]]
        partidf = pd.concat([initdf, partidf], axis=1)
        partidf.columns = pd.Index(initdf.columns.values.tolist() + headers)
        df = pd.concat([df, partidf])
    
    # Remove useless rows   
    df = df.dropna(how='all', subset=headers)
    
    return df

In [6]:
#explode line 
data_canton_FR = explodeLines(data_canton_FR)

## Dictionnaire des duels

In [7]:
#################################### Dictionnaire des duels #####################################

def getNuanceOfElected(data, col_siege='Sièges', col_nuance='Nuance'):
    elected = data[data[col_siege]=='Elus']
    return list(elected[col_nuance]) if len(elected)!=0 else None

def filterBestNuances(data, col_nuance='Nuance', criteria=12.50):
    bestCandidat = data[data['% Voix/Ins']>= criteria]
    
    if bestCandidat.empty or len(bestCandidat)==1:
        bestCandidat = data.sort_values(by='Voix', ascending=False).iloc[0:2,:]

    return list(bestCandidat[col_nuance])

def getDuels(data, dep, col_dep='Code du département', col_canton='Code du canton', col_siege='Sièges', col_nuance='Nuance',count=None):
    '''
        ATTENTION : data doit etre EXPLODE !
    '''
    data = data[data[col_dep]==dep]
    duels = dict()
    for canton in data[col_canton].unique():
        data_canton = data[data[col_canton]==canton]
        # allow to know if there is a majority in the canton
        elected = getNuanceOfElected(data_canton, col_siege=col_siege, col_nuance=col_nuance) 
        
        if elected is not None:
            count+=1
            duels[str(canton)]= elected
        else:
            duels[str(canton)] = filterBestNuances(data_canton)
    return (duels, count) if count is not None else duels

def optimizeDuelDict(duels):
    optdic = dict()
    for dep, duelDepDict in duels.items():
        for canton, duelList in duelDepDict.items():
            key = ':'.join(duelList)
            if key in optdic.keys():
                optdic[key].append((dep, canton))
            else:
                optdic[key]= [(dep, canton)]
    return optdic

In [8]:
#dictionnaire des duels
duels = dict()
count=0
for dep in data_canton_FR['Code du département'].unique():
    duels[str(dep)], count= getDuels(data_canton_FR, dep, count=count)
    if duels[str(dep)]==[]:
        print('empty list for dep : ', dep)

#dictionnaire ooptimize
optDuels = optimizeDuelDict(duels)
win = [duel.split(':') for duel in list(optDuels.keys()) if len(duel.split(':'))<2]
count

149

In [80]:
duels['14']
count=0
for winner in win:
    count += len(optDuels[winner[0]])
count
optDuels[win[0][0]]

[('2', '20'), ('52', '8'), ('83', '5'), ('84', '14')]

## Preparation des donnees pour le reseau de neuronne

In [22]:
#################################### Data Processing ####################################

def prepareInputDataExploded(data):
    tmp = data[['NUMTOUR', 'CODDPT', 'CODSUBCOM', 'LIBSUBCOM', 'CODBURVOT', 'CODCAN',
            'LIBCAN', 'NBRINS', 'NBRVOT', 'NBREXP', 'CODNUA', 'NBRVOIX']].copy()
    correction = [str(i) for i in range(1,10)]
    #remove canton where there is a winner in the 1st turn
    for winner in win:
         for dep, can in optDuels[winner[0]]:
            if dep in correction:
                tmp = tmp.loc[~((tmp['CODDPT']=='0'+dep) & (tmp['CODCAN']==int(can)))]
            else:
                tmp = tmp.loc[~((tmp['CODDPT']==dep) & (tmp['CODCAN']==int(can)))]
    


    # Compute missing data
    tmp['NBRABS'] = tmp['NBRINS'] - tmp['NBRVOT']
    tmp['NBRBLCNUL'] = tmp['NBRVOT'] - tmp['NBREXP']
    tmp['%ABS/INS'] = tmp['NBRABS'] / tmp['NBRINS']
    tmp['%BLCNUL/VOT'] = tmp['NBRBLCNUL'] / tmp['NBRVOT']
    tmp['%EXP/VOT'] = tmp['NBREXP'] / tmp['NBRVOT']
    tmp['%VOIX/EXP'] = tmp['NBRVOIX'] / tmp['NBREXP']

    nuances = getAllNuances(data)
    statsFeatures = ['NBRINS', 'NBREXP', '%ABS/INS', '%BLCNUL/VOT', '%EXP/VOT']
    idFeatures = ['CODDPT', 'CODCAN', 'CODSUBCOM', 'CODBURVOT']

    exprimes = tmp[idFeatures + ['NBREXP']].drop_duplicates().sort_values(idFeatures)['NBREXP']
    stats = tmp[idFeatures + statsFeatures].drop_duplicates()[statsFeatures]
    ids = tmp[idFeatures].drop_duplicates()

    # Create [%Voix] and fill it
    voix = pd.DataFrame(0, index=data.index, columns=nuances)
    for parti in nuances:
        voix[parti][data['CODNUA']==parti] = tmp[tmp['CODNUA']==parti]['NBRVOIX']
    voix = pd.concat([tmp[idFeatures], voix], axis=1).groupby(idFeatures).sum()[nuances]
    voix.index = exprimes.index

    # Concat with computed stats and divide almost everything by Exprimés
    voix = voix.divide(exprimes, axis=0)
    X = pd.concat([stats, voix], axis=1)
    X.index = pd.MultiIndex.from_frame(ids)
    return X.sort_values(['CODDPT', 'CODCAN', 'CODSUBCOM', 'CODBURVOT'])

def getAllNuances(data, colNuance='CODNUA', fmt='exploded'):
    if fmt not in ['exploded', 'line']:
        raise ValueError("format parameter must be 'exploded' or 'line'")
    
    if fmt == 'exploded':
        nuances = data[colNuance].unique()
    
    if fmt == 'line':
        nuances = np.array([])
        nuances_tmp = data[colNuance].fillna(0)
        for c in nuances_tmp:
            nuances = np.append(nuances, nuances_tmp[c])
        nuances = np.unique(nuances[nuances!=0])
    
    return sorted(nuances)

# retourne un dataset 
def getTrainSets(X, y, duel, col_canton='CODCAN', col_dep='Code du département'):
    mask_dep    = pd.Series([False]*len(X))
    mask_canton = pd.Series([False]*len(X))
    for dep in X[col_dep].unique():
        for canton in X[X[col_dep]==dep][col_canton].unique(): #pour chaque canton du departement
            #on considere que les cantons qui ont comme partis les partis du duel
            if all(nuance in X.loc[(X[col_dep]==dep) & (X[col_canton]==canton)] for nuance in duel):
                mask_dep    |= X[col_dep]==dep
                mask_canton |= X[col_canton]==canton

    # filtre les departement dont aucun canton contient notre duel
    X= X[mask_dep]
    y = y[mask_dep]
    return (X[mask_canton], y[mask_canton])

def prepareLabelsExploded(data, oneHotEncode=False):
    nuances = getAllNuances(data)
    idFeatures = ['CODDPT', 'CODCAN', 'CODSUBCOM', 'CODBURVOT']

    exprimes = data[idFeatures+['NBREXP']].groupby(idFeatures).first()

    # Create [%Voix] and fill it
    voix = pd.DataFrame(0, index=data.index, columns=nuances)
    for parti in nuances:
        voix[parti][data['CODNUA']==parti] = data[data['CODNUA']==parti]['NBRVOIX']
    voix = pd.concat([data[idFeatures], voix], axis=1).groupby(idFeatures).sum().sort_values(idFeatures)[nuances]

    # Concat with computed stats and divide voix by exprimes
    y = voix.divide(exprimes['NBREXP'], axis=0)
    return y

In [23]:
dtypes = {
    'NUMTOUR' :    'int64',
    'CODDPT' :    'object',
    'CODSUBCOM' :  'int64',
    'LIBSUBCOM' : 'object',
    'CODBURVOT' : 'object',
    'CODCAN' :     'int64',
    'LIBCAN' :    'object',
    'NBRINS' :     'int64',
    'NBRVOT' :     'int64',
    'NBREXP' :     'int64',
    'NUMDEPCAND' : 'int64',
    'LIBLISEXT' : 'object',
    'CODNUA' :    'object',
    'NBRVOIX' :    'int64',
}

#load data
dataBvot = pd.read_csv('../dataset/raw/DP15_Bvot_T1T2.csv', delimiter=';', dtype=dtypes)
dataT1Bvot = dataBvot[dataBvot.NUMTOUR==1]
dataT2Bvot = dataBvot[dataBvot.NUMTOUR==2]

# dataT1Bvot.info()

In [24]:
## preparing data


print('Preparing input data... ', end='')
X = prepareInputDataExploded(dataT1Bvot)


print('Preparing labels... ', end='')
y = prepareLabelsExploded(dataT2Bvot)

print(f'shape X = {X.shape} shape y = {y.shape}')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  voix[parti][data['CODNUA']==parti] = tmp[tmp['CODNUA']==parti]['NBRVOIX']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  voix[parti][data['CODNUA']==parti] = tmp[tmp['CODNUA']==parti]['NBRVOIX']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  voix[parti][data['CODNUA']==parti] = tmp[tmp['CODNUA']==parti]['NBRVOIX']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

## build model

In [None]:
tf.keras.backend.clear_session()

model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(160,)))
model.add(tf.keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model.add(tf.keras.layers.Dense(20, activation='softmax'))

opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [25]:
history = model.fit(trainset, y, batch_size=32, validation_data=(testset, y), epochs=100, verbose=0)

# evaluate the model
_, train_acc = model.evaluate(trainset, y, verbose=0)
_, test_acc = model.evaluate(testset, y, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
# plot loss during training
pyplot.subplot(211)
pyplot.title('Loss')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
# plot accuracy during training
pyplot.subplot(212)
pyplot.title('Accuracy')
pyplot.plot(history.history['accuracy'], label='train')
pyplot.plot(history.history['val_accuracy'], label='test')
pyplot.legend()
pyplot.show()

NameError: name 'model' is not defined