# Imports

In [None]:
# to allow loading large files
import sys
import csv
def find_and_set_sys_maxsize():
    maxInt = sys.maxsize
    decrement = True
    while decrement:
        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt/10)
            decrement = True
find_and_set_sys_maxsize()

import warnings
def fxn():
    warnings.warn("deprecated", DeprecationWarning)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

    
    
import os
import pandas as pd
import numpy as np

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import nltk
nltk.download('punkt')
# from nltk.tokenize import word_tokenize

import pickle

import math

import datetime
def timestamp(): return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Data loading

In [5]:
granddebat_folder = '../GrandDébat/granddebat.fr/'
democrat_path = os.path.join(granddebat_folder, 'DEMOCRATIE_ET_CITOYENNETE.csv')
events_path = os.path.join(granddebat_folder, 'EVENTS.csv')
fiscalit_path = os.path.join(granddebat_folder, 'LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.csv')
ecolog_path = os.path.join(granddebat_folder, 'LA_TRANSITION_ECOLOGIQUE.csv')
organisat_path = os.path.join(granddebat_folder, 'ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.csv')

french_encoding = "utf-8"

In [9]:
file_path = democrat_path

In [10]:
db_raw = pd.read_csv(file_path, engine='python', encoding=french_encoding)

# Some statistics
* 350k rows overall
* 0.5 not NaNs
* 10 * 5 questions
* 10e6 answers
* 7k zipcodes

# Data preparation

## Prepare text dataframe

Let's leave only text data and normalize the columns names

In [5]:
questions_reverse_index = 37
db_questions = db_raw.loc[:, db_raw.columns[-questions_reverse_index:]]

# (QA4893whateverA3, QA4893whateverA4) -> (A3, A4)
question_list = db_raw.columns[-questions_reverse_index:]
question_list = [el.split()[0][-2:] for el in question_list]
db_questions.columns = question_list

db_questions.head()

Unnamed: 0,A3,A4,A5,Ew,Ex,Ey,Ez,E0,E1,E2,...,M1,M2,M3,M4,M5,Qx,Qy,Qz,Q0,Q1
0,Le citoyen,Non,,,,,,,,,...,,,,,,,,,,Afin d’éviter de creuser les inégalités ne plu...
1,Un instrument de démocratie locale à modernise...,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,Nous proposons le retour à la limitation de vi...
3,Voir l'intégralité de la proposition dans la d...,,,,,,,,,,...,,,,,,,,,,POUR UN NOUVEAU CONTRAT CITOYEN ...
4,"député, maire, moi même",Non,,"Budget participatif, possibilité d'interpeller...",Une bonne chose,,,Oui,,,...,,,,,,,,,,


## Prepare for Doc2Vec

doc2vec wants a list of documents  
every document is contained in a TaggedDocument(words="Le citoyen", tags=["0 A3"]) (par example)  
I form the tags in the "rowIndex columnName" format in case we need to match documents with specific answers  
(actually I haven't used this feature fully)  

In [6]:
question_indices = list(db_questions[db_questions.notna()].stack().index)

In [7]:
question_data = [db_questions.loc[idx] for idx in question_indices]

In [8]:
def index_tuple_to_str(index_tuple):
    return str(index_tuple[0]) + ' ' + str(index_tuple[1])
def str_to_index_tuple(tuple_string):
    return (int(tuple_string.split()[0]), tuple_string.split()[1])

In [9]:
tagged_data = [TaggedDocument(words=gensim.utils.simple_preprocess(question), tags=[index_tuple_to_str(question_indices[idx])]) 
               for idx, question in enumerate(question_data)]

# Training

## Init model

In [None]:
max_epochs = 100
vec_size = 16
alpha = 0.025
min_alpha = 0.00025

model = Doc2Vec(size = vec_size,
                alpha = alpha, 
                min_alpha = min_alpha,
                min_count = 1,
                dm = 1,
                workers=8
               )

In [None]:
model.build_vocab(tagged_data)

## Train model

In [None]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples = model.corpus_count,
                epochs = model.iter)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

In [None]:
model.save("questions_" + timestamp() + ".model")
print("Model Saved")

It takes just **infinite** amount of time to train (it runs on one thread so the cluster won't help)  
I guess we can pass not every answer as a document, or maybe accumulate them by regions in big texts

## Or load it

In [3]:
model = pickle.load(open("questions.model", "rb"))

Then we have the trained doc2vec model that can convert answers to vectors  
In order to save the vector components (there are 16 components) we can form the matrix that has not A3, A4... columns in it, but А3_0, А3_1, ... А3_15, А4_0, А4_1, ... 
So it will be the place for storing all the vectors  

In [None]:
def form_column_name_with_component(name, component):
    return name + "_" + str(component)

db_question_vectors = pd.DataFrame(index=db_questions.index)
for question in db_questions.columns:
    for idx in range(vec_size):
        db_question_vectors[form_column_name_with_component(question, idx)] = pd.Series(dtype='float')

In [23]:
for question in db_questions.columns:
    print(question)
    for el in db_questions[db_questions[question].isna() == False][question].iteritems():
        if index % 10000 == 0:
            print(index)
        index = el[0]
        data = el[1]
        vectorized_question = model.infer_vector(data)
        for component_index, component_value in enumerate(vectorized_question):
            db_question_vectors.loc[index, form_column_name_with_component(question, component_index)] = component_value

A3
0
10000
20000
30000
40000
50000
60000
A4
0
10000
20000
30000
40000
50000
60000
A5
10000
20000
50000
60000
Ew
10000
20000
30000
40000
50000
60000
Ex
10000
20000
30000
40000
50000
60000
Ey
10000
20000
30000
40000
50000
Ez
10000
20000
30000
60000
E0
10000
20000
30000
40000
50000
60000
E1
20000
60000
E2
10000
20000
30000
40000
50000
60000
E3
10000
20000
30000
50000
60000
E4
10000
20000
30000
50000
60000
E5
10000
20000
30000
50000
60000
Iw
10000
20000
30000
50000
60000
Ix
10000
20000
60000
Iy
10000
20000
30000
60000
Iz
10000
20000
30000
60000
I0
10000
20000
30000
40000
50000
60000
I1
10000
20000
40000
50000
60000
I3
10000
20000
30000
40000
60000
I4
10000
20000
30000
40000
60000
I5
10000
20000
40000
50000
60000
Mw
10000
20000
30000
40000
50000
60000
Mx
10000
20000
30000
40000
50000
60000
My
20000
60000
Mz
10000
30000
40000
50000
60000
M0
10000
30000
40000
60000
M1
10000
30000
40000
60000
M2
10000
30000
40000
50000
60000
M3
10000
30000
40000
50000
60000
M4
10000
30000
40000
50000
60000
M5


In [25]:
db_question_vectors.to_csv('db_question_vectors_' +  timestamp() + '.csv')

In [6]:
db_question_vectors = pd.read_csv('db_question_vectors.csv', engine='python', encoding=french_encoding)

In [12]:
from sklearn.model_selection import train_test_split

X = db_question_vectors
y = db_raw['authorZipCode'].apply(lambda x: int(str(x)[0:2]))
print(y.nunique())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

99


In [49]:
import xgboost as xgb

param = {}
param['nthread'] = 8
dtrain = xgb.DMatrix(X_train.iloc[:, :], label=y_train.iloc[:], missing=math.nan)
bst = xgb.train(param, dtrain, num_boost_round = 2)

[20:45:29] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 0 pruned nodes, max_depth=6
[20:45:29] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6


In [50]:
xgb_X_test = xgb.DMatrix(X_test)
xgb_y_test = xgb.DMatrix(y_test)

y_pred = bst.predict(xgb_X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 1.12%


In [43]:
import pickle

pickle.dump(model, open("xboost_" + timestamp() + ".dat", "wb"))

In [82]:
loaded_model = pickle.load(open("xboost_.dat", "rb"))

### Conclusion
So as we can see the XGboost model performed bad on Doc2vec data  
I think the problem is in the fact that doc2vec was not fully trained  
And we should anyway doubt training doc2vec in this way  
We can either aggregate answers by region or by a person  
Or we can use pre trained word2vec models