In [198]:
import numpy as np
import csv
import gc

In [199]:
def load_raw_data(filename):
    '''Read the survey CSV export and split every valid row into features and target.

    The first row read fixes the expected number of fields. A row is kept only
    when it has exactly that many fields and its first field is 60 characters
    long (the check the notebook uses to recognise a valid contribution ID).

    Parameters
    ----------
    filename : str
        Path of the comma-separated, UTF-8 encoded file to read.

    Returns
    -------
    X : list of numpy.ndarray
        One string array per kept row, made of columns 2 and 9 followed by
        every column from index 11 onwards.
    y : list
        Column 10 of every kept row, in the same order as ``X``.
    '''
    X = []
    y = []
    with open(filename, newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        fields = None
        for row in reader:
            if fields is None:
                # The first row defines how many fields a well-formed row has.
                fields = len(row)
            # csv.reader yields [] for a blank line; test emptiness first so
            # that row[0] is never evaluated on an empty row.
            if not row or len(row) != fields or len(row[0]) != 60:
                continue
            data_vector = np.array(row)
            X.append(np.append(data_vector[[2, 9]], data_vector[11:]))
            y.append(data_vector[10])
    # Explicit collection kept from the original to release parsing temporaries.
    gc.collect()
    return X, y

def column(matrix, i):
    '''Return the i-th element of every row of ``matrix`` as a list.'''
    values = []
    for row in matrix:
        values.append(row[i])
    return values

In [200]:
# Load the raw survey export from the csv file
X, y = load_raw_data("LA_TRANSITION_ECOLOGIQUE.csv")

# Preprocessing of the ZIP code: the labels are read as strings and cast to int.
# NOTE(review): an integer cast drops leading zeros (e.g. '01000' becomes 1000);
# confirm downstream ZIP lookups account for that.
y = np.array(y).astype('int')


In [201]:
import re
import nltk #import the natural language toolkit library
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem.snowball import FrenchStemmer #import the French stemming library
from nltk.corpus import stopwords #import stopwords from nltk corpus
from collections import Counter #allows for counting the number of occurences in a list

def get_tokens(raw,encoding='utf8'):
    '''Tokenize ``raw`` with ``nltk.word_tokenize`` and return the token list.

    ``encoding`` is accepted for signature compatibility but is not used.
    '''
    return nltk.word_tokenize(raw)

def get_nltk_text(raw,encoding='utf8'):
    '''Build an ``nltk.Text`` from ``raw`` after blanking out punctuation.

    Periods, commas, semicolons, parentheses, straight and curly apostrophes and
    double quotes are replaced by a space before tokenizing. The ``|`` characters
    sit inside the character class, so literal pipes are replaced as well.
    '''
    punctuation_free = re.sub(r'[.|,|\'|;|\"|(|)|’]',' ', raw)
    # NOTE(review): ``encoding`` is forwarded as the second positional argument of
    # nltk.Text, which appears to be the text's name rather than an encoding - confirm.
    return nltk.Text(nltk.word_tokenize(punctuation_free), encoding)

# Maps each supported accented lowercase letter to its unaccented counterpart.
_ACCENT_TABLE = str.maketrans({
    'à': 'a', 'â': 'a',
    'ç': 'c',
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e',
    'î': 'i', 'ï': 'i',
    'ô': 'o',
    'û': 'u', 'ù': 'u', 'ü': 'u',
})

def no_accents(raw,encoding='utf8'):
    '''Replace the accented lowercase letters à â ç è é ê ë î ï ô û ù ü by their base letter.

    Every other character is returned unchanged. The previous regex version
    listed ``|`` inside its character classes, where it is a literal, so every
    pipe in the input was rewritten to ``a``; a translation table avoids that.

    Parameters
    ----------
    raw : str
        Text to normalize. Only lowercase accented letters are handled, so
        callers are expected to lowercase first.
    encoding : str
        Unused; kept for signature compatibility.

    Returns
    -------
    str
        ``raw`` with the supported accented letters replaced.
    '''
    return raw.translate(_ACCENT_TABLE)

def get_stopswords(type="veronis"):
    '''Return a list of French stopwords.

    When ``type`` is "veronis" (the default) the hard-coded Veronis list below is
    returned; for any other value the default nltk French stopword list is
    returned instead. The list is returned as-is: entries keep their accents,
    apostrophes and capitalisation.
    '''
    if type=="veronis":
        # Veronis stopword list, embedded verbatim
        raw_stopword_list = ["Ap.", "Apr.", "GHz", "MHz", "USD", "a", "afin", "ah", "ai", "aie", "aient", "aies", "ait", "alors", "après", "as", "attendu", "au", "au-delà", "au-devant", "aucun", "aucune", "audit", "auprès", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autour", "autre", "autres", "autrui", "aux", "auxdites", "auxdits", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avons", "ayant", "ayez", "ayons", "b", "bah", "banco", "ben", "bien", "bé", "c", "c'", "c'est", "c'était", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cents", "cependant", "certain", "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "cf.", "cg", "cgr", "chacun", "chacune", "chaque", "chez", "ci", "cinq", "cinquante", "cinquante-cinq", "cinquante-deux", "cinquante-et-un", "cinquante-huit", "cinquante-neuf", "cinquante-quatre", "cinquante-sept", "cinquante-six", "cinquante-trois", "cl", "cm", "cm²", "comme", "contre", "d", "d'", "d'après", "d'un", "d'une", "dans", "de", "depuis", "derrière", "des", "desdites", "desdits", "desquelles", "desquels", "deux", "devant", "devers", "dg", "différentes", "différents", "divers", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dl", "dm", "donc", "dont", "douze", "du", "dudit", "duquel", "durant", "dès", "déjà", "e", "eh", "elle", "elles", "en", "en-dehors", "encore", "enfin", "entre", "envers", "es", "est", "et", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eûmes", "eût", "eûtes", "f", "fait", "fi", "flac", "fors", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gr", "h", "ha", "han", "hein", "hem", "heu", "hg", "hl", "hm", "hm³", "holà", 
"hop", "hormis", "hors", "huit", "hum", "hé", "i", "ici", "il", "ils", "j", "j'", "j'ai", "j'avais", "j'étais", "jamais", "je", "jusqu'", "jusqu'au", "jusqu'aux", "jusqu'à", "jusque", "k", "kg", "km", "km²", "l", "l'", "l'autre", "l'on", "l'un", "l'une", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lez", "lors", "lorsqu'", "lorsque", "lui", "lès", "m", "m'", "ma", "maint", "mainte", "maintes", "maints", "mais", "malgré", "me", "mes", "mg", "mgr", "mil", "mille", "milliards", "millions", "ml", "mm", "mm²", "moi", "moins", "mon", "moyennant", "mt", "m²", "m³", "même", "mêmes", "n", "n'avait", "n'y", "ne", "neuf", "ni", "non", "nonante", "nonobstant", "nos", "notre", "nous", "nul", "nulle", "nº", "néanmoins", "o", "octante", "oh", "on", "ont", "onze", "or", "ou", "outre", "où", "p", "par", "par-delà", "parbleu", "parce", "parmi", "pas", "passé", "pendant", "personne", "peu", "plus", "plus_d'un", "plus_d'une", "plusieurs", "pour", "pourquoi", "pourtant", "pourvu", "près", "puisqu'", "puisque", "q", "qu", "qu'", "qu'elle", "qu'elles", "qu'il", "qu'ils", "qu'on", "quand", "quant", "quarante", "quarante-cinq", "quarante-deux", "quarante-et-un", "quarante-huit", "quarante-neuf", "quarante-quatre", "quarante-sept", "quarante-six", "quarante-trois", "quatorze", "quatre", "quatre-vingt", "quatre-vingt-cinq", "quatre-vingt-deux", "quatre-vingt-dix", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "quatre-vingt-dix-sept", "quatre-vingt-douze", "quatre-vingt-huit", "quatre-vingt-neuf", "quatre-vingt-onze", "quatre-vingt-quatorze", "quatre-vingt-quatre", "quatre-vingt-quinze", "quatre-vingt-seize", "quatre-vingt-sept", "quatre-vingt-six", "quatre-vingt-treize", "quatre-vingt-trois", "quatre-vingt-un", "quatre-vingt-une", "quatre-vingts", "que", "quel", "quelle", "quelles", "quelqu'", "quelqu'un", "quelqu'une", "quelque", "quelques", "quelques-unes", "quelques-uns", "quels", "qui", "quiconque", "quinze", "quoi", "quoiqu'", "quoique", "r", 
"revoici", "revoilà", "rien", "s", "s'", "sa", "sans", "sauf", "se", "seize", "selon", "sept", "septante", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "si", "sinon", "six", "soi", "soient", "sois", "soit", "soixante", "soixante-cinq", "soixante-deux", "soixante-dix", "soixante-dix-huit", "soixante-dix-neuf", "soixante-dix-sept", "soixante-douze", "soixante-et-onze", "soixante-et-un", "soixante-et-une", "soixante-huit", "soixante-neuf", "soixante-quatorze", "soixante-quatre", "soixante-quinze", "soixante-seize", "soixante-sept", "soixante-six", "soixante-treize", "soixante-trois", "sommes", "son", "sont", "sous", "soyez", "soyons", "suis", "suite", "sur", "sus", "t", "t'", "ta", "tacatac", "tandis", "te", "tel", "telle", "telles", "tels", "tes", "toi", "ton", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "trente-cinq", "trente-deux", "trente-et-un", "trente-huit", "trente-neuf", "trente-quatre", "trente-sept", "trente-six", "trente-trois", "trois", "très", "tu", "u", "un", "une", "unes", "uns", "v", "vs", "vers", "via", "vingt", "vingt-cinq", "vingt-deux", "vingt-huit", "vingt-neuf", "vingt-quatre", "vingt-sept", "vingt-six", "vingt-trois", "vis-à-vis", "voici", "voilà", "vos", "votre", "vous", "w", "x", "y", "z", "zéro", "à", "ç'", "ça", "ès", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô"]
    else:
        # Fall back to the French stopword list shipped with the nltk corpus
        raw_stopword_list = stopwords.words('french')
    # No decoding step is applied: the selected list is returned unchanged.
    stopword_list = raw_stopword_list
    return stopword_list

# Cache of normalized stopword sets keyed by the tuple of raw stopwords, so the
# accent-stripping cost is paid once per distinct list instead of once per text.
_NORMALIZED_STOPWORDS = {}

def filter_stopwords(text,stopword_list):
    '''Lowercase and de-accent the words of ``text`` and drop the stopwords.

    Words are compared to the stopwords after both have been lowercased and
    stripped of accents with ``no_accents``. Previously only the words were
    de-accented, so accented stopwords such as "après" or "être" could never
    match their de-accented form and were left in the output.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stopword_list : iterable of str
        Stopwords to remove; may contain accents and capital letters.

    Returns
    -------
    list of str
        The sorted list of kept words: alphabetic, longer than one character,
        and not a stopword.
    '''
    cache_key = tuple(stopword_list)
    blocked = _NORMALIZED_STOPWORDS.get(cache_key)
    if blocked is None:
        # Keep the raw entries as well, so everything the list-based lookup
        # used to reject is still rejected.
        blocked = set(cache_key)
        blocked.update(no_accents(stopword.lower()) for stopword in cache_key)
        _NORMALIZED_STOPWORDS[cache_key] = blocked

    filtered_words = []
    for token in text:
        word = no_accents(token.lower())
        if word.isalpha() and len(word) > 1 and word not in blocked:
            filtered_words.append(word)
    filtered_words.sort()
    return filtered_words

def stem_words(words):
    '''Stem every word with the French Snowball stemmer and return the sorted stems.'''
    stemmer = FrenchStemmer()
    return sorted(stemmer.stem(word) for word in words)

def sort_dictionary(dictionary):
    '''Return the (key, value) pairs of ``dictionary`` ordered by decreasing value.

    Pairs with equal values keep the order in which the dictionary yields them.
    '''
    def value_of(pair):
        return pair[1]

    pairs = dictionary.items()
    return sorted(pairs, key=value_of, reverse=True)

def normalize_counts(counts):
    '''Return a dict mapping every word to its count divided by the sum of all counts.'''
    total = sum(counts.values())
    return {word: float(count) / total for word, count in counts.items()}


[nltk_data] Downloading package punkt to /Users/QQINO/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/QQINO/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [202]:
def preprocessing_text(word_vector):
    '''Turn every free-text answer into a bag of stemmed words.

    Parameters
    ----------
    word_vector : iterable of str
        Raw answers, one per participant.

    Returns
    -------
    list
        One entry per answer: a list of (stem, count) tuples sorted by
        decreasing count, as produced by ``sort_dictionary``.
    '''
    # The stopword list does not depend on the answer; build it once instead of
    # once per iteration.
    stopword_list = get_stopswords()
    vector = []
    for answer in word_vector:
        # \s matches spaces, tabs and newlines, so every whitespace run is
        # collapsed to a single space here.
        raw = re.sub(r'\s+', ' ', answer)
        text = get_nltk_text(raw)
        filtered_text = filter_stopwords(text, stopword_list)
        stemmed_text = stem_words(filtered_text)
        vector.append(sort_dictionary(Counter(stemmed_text)))
    gc.collect()
    return vector

def preprocessing_yesno(word_vector):
    '''Encode yes/no answers as integers: 1 for 'Oui', -1 for 'Non', 0 for anything else.'''
    return [
        1 if answer == 'Oui' else (-1 if answer == 'Non' else 0)
        for answer in word_vector
    ]

def preprocessing_categories(word_vector,category_vector):
    '''Encode every answer as the 1-based position of its label in ``category_vector``.

    An answer that equals none of the labels is encoded as 0. With the
    participant categories used in this notebook this gives:
     1 citoyen/citoyenne,
     2 élu, élue or institution
     3 for-profit organisation
     4 non-profit organisation
     0 no category
    '''
    codes = []
    for answer in word_vector:
        code = 0
        for position, label in enumerate(category_vector, start=1):
            if answer == label:
                # The first matching label wins.
                code = position
                break
        codes.append(code)
    return codes

In [209]:
def preprocessing_raw_data(X,y,array_yes_no,array_category,category_labels):
    '''Encode every column of ``X`` and append ``y`` as the last column.

    Parameters
    ----------
    X : sequence of rows
        Raw answers; all rows are expected to have the length of the first one.
    y : sequence
        Target value of every row.
    array_yes_no : collection of int
        Indices of the columns encoded with ``preprocessing_yesno``.
    array_category : collection of int
        Indices of the columns encoded with ``preprocessing_categories``.
    category_labels : sequence of list
        Label lists for the category columns, consumed in increasing column
        order. The sequence is only read, never modified, so the same object
        can be passed again on a re-run.

    Returns
    -------
    numpy.ndarray
        Array with one row per entry of ``X``: the encoded columns followed by
        the target. Every other column is encoded with ``preprocessing_text``.
    '''
    n_columns = len(X[0])
    n_rows = len(X)
    # An explicit object array is filled cell by cell: the text columns hold
    # variable-length lists, which np.array() refuses to stack on recent NumPy.
    questions_trans = np.empty((n_rows, n_columns), dtype=object)
    next_label = 0
    for i in range(n_columns):
        print("There are",n_columns," iterations. Count: ",i)
        values = column(X,i)
        if i in array_yes_no:
            encoded = preprocessing_yesno(values)
        elif i in array_category:
            # Indexing instead of pop(0) leaves the caller's list untouched.
            encoded = preprocessing_categories(values,category_labels[next_label])
            next_label += 1
        else:
            encoded = preprocessing_text(values)
        for row_index, value in enumerate(encoded):
            questions_trans[row_index, i] = value

    output = np.array(y).reshape((len(y),1))
    data = np.concatenate((questions_trans,output),axis=1)
    gc.collect()
    return data

In [225]:
# Treatment of the Transition_ecologique dataset
user_category = [
    'Citoyen / Citoyenne',
    'Élu / élue et Institution',
    'Organisation à but lucratif',
    'Organisation à but non lucratif',
]

mobilite_category = [
    'Oui',
    'Non',
    "Je n'utilise pas la voiture pour des déplacements quotidiens",
]

# One label list per category column, in increasing column order.
labels = [user_category, mobilite_category]
# Columns 4, 6 and 10 are encoded as yes/no answers; columns 1 and 12 as categories.
data = preprocessing_raw_data(X, y, [4, 6, 10], [1, 12], labels)


There are 18  iterations. Count:  0
There are 18  iterations. Count:  1
There are 18  iterations. Count:  2
There are 18  iterations. Count:  3
There are 18  iterations. Count:  4
There are 18  iterations. Count:  5
There are 18  iterations. Count:  6
There are 18  iterations. Count:  7
There are 18  iterations. Count:  8
There are 18  iterations. Count:  9
There are 18  iterations. Count:  10
There are 18  iterations. Count:  11
There are 18  iterations. Count:  12
There are 18  iterations. Count:  13
There are 18  iterations. Count:  14
There are 18  iterations. Count:  15
There are 18  iterations. Count:  16
There are 18  iterations. Count:  17


In [246]:
def write_data(word_vector,filename):
    '''Write the rows of ``word_vector`` to ``filename`` as ';'-separated, unquoted CSV.

    Nothing is returned. Because quoting is disabled and no escape character is
    set, the csv module raises an error if a field contains the delimiter.
    '''
    with open(filename, 'w', newline='',encoding='utf-8') as out_file:
        csv.writer(out_file, delimiter=';', quoting=csv.QUOTE_NONE).writerows(word_vector)

def read_data(filename,yes_no_array):
    '''Load the cleaned dataset written by ``write_data``.

    Every field except the last is decoded: fields whose index is in
    ``yes_no_array`` are converted with ``int``, the others are parsed with
    ``read_dictionary``. The last field of each row is kept as a string.
    The order of the columns is the following
    Title | Category of the participant | Questions | Output
    '''
    data = []
    with open(filename, newline='',encoding='utf-8') as f:
        for row in csv.reader(f, delimiter=';'):
            element = [
                int(cell) if position in yes_no_array else read_dictionary(cell)
                for position, cell in enumerate(row[:-1])
            ]
            element.append(row[-1])
            data.append(element)
        #  call the garbage collector
        gc.collect()
    return data

def read_word_count(filename):
    '''Read a ';'-separated file and return one merged dictionary per row.

    Every field of a row is parsed with ``read_dictionary`` and the results are
    merged into a single dictionary for that row.
    '''
    words = []
    with open(filename, newline='',encoding='utf-8') as f:
        for row in csv.reader(f, delimiter=';'):
            merged = {}
            for cell in row:
                merged.update(read_dictionary(cell))
            words.append(merged)
        #  call the garbage collector
        gc.collect()
    return words

def read_dictionary(dictionary_string):
    '''Parse the text form of a word-count mapping back into a dictionary.

    Spaces, brackets, parentheses, braces, single quotes and pipes are removed,
    the remainder is split on ',', '|' and ':' and consecutive pieces are read
    as (key, integer value) pairs. A trailing piece without a partner is ignored.
    '''
    stripped = re.sub(r'[ \[ | \] | \( | \) | \' | { | } ]','', dictionary_string)
    pieces = re.split(r'[,|:]', stripped)
    return {key: int(value) for key, value in zip(pieces[0::2], pieces[1::2])}
    
        


In [247]:
# Reload the cleaned dataset from its CSV cache. Columns 1, 4, 6, 10 and 12 hold
# integer codes; the other columns hold word-count dictionaries.
# If the cache does not exist yet (fresh checkout), it is created from the
# in-memory ``data`` first, so the notebook also runs top to bottom.
try:
    data = read_data('data.csv',[1,4,6,10,12])
except FileNotFoundError:
    write_data(data,'data.csv')
    data = read_data('data.csv',[1,4,6,10,12])

In [213]:
def find_entries_pattern(data,pattern):
    '''Return the entries of ``data`` whose last field matches ``pattern`` at its start.

    The last field is expected to be the postal code as a string; ``re.match``
    anchors the pattern at the beginning of that string.
    '''
    return [entry for entry in data if re.match(pattern, entry[-1]) is not None]

In [214]:
# Entries whose postal code starts with '511'
b = find_entries_pattern(data,'511')

In [215]:
def _density_or_none(numerator, denominator, scale=1):
    '''Return float(numerator) * scale / float(denominator), or None when it cannot be computed.

    None is returned when either field is empty or when the denominator is zero.
    '''
    if numerator == '' or denominator == '':
        return None
    divisor = float(denominator)
    if divisor == 0:
        return None
    return float(numerator) * scale / divisor


def _split_composite_codes(codes, separator):
    '''Return a set in which every code joined by ``separator`` is replaced by its parts.'''
    return {part for code in codes for part in code.split(separator)}


def _zip_codes_by_density(filename, delimiter, separator, density_threshold, density_of):
    '''Read a delimited file and split the codes of column 1 into two sets by density.

    ``density_of`` maps a row to its density or to None when the row must be
    skipped. Rows whose density is at least ``density_threshold`` feed the first
    returned set, the other rows with a density feed the second one. Codes
    listing several values joined by ``separator`` are split into single codes.
    '''
    dense_codes = set()
    sparse_codes = set()
    with open(filename, newline='',encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=delimiter)
        next(reader, None)  # skip the header row, tolerating an empty file
        for row in reader:
            if not row:
                continue  # blank line
            density = density_of(row)
            if density is None:
                continue
            if density >= density_threshold:
                dense_codes.add(row[1])
            else:
                sparse_codes.add(row[1])
    dense_codes = _split_composite_codes(dense_codes, separator)
    sparse_codes = _split_composite_codes(sparse_codes, separator)
    #  call the garbage collector
    gc.collect()
    return dense_codes, sparse_codes


def find_zip_codes_by_village(density_threshold,filename='correspondance-code-insee-code-postal.csv'):
    '''Classify the codes of a ';'-separated file as city or village codes.

    The density of a row is ``float(row[8]) * 1000 / float(row[7])``; rows where
    either field is empty, or where ``row[7]`` is zero, are skipped.
    NOTE(review): columns 7 and 8 are presumably area and population - confirm
    against the file header.

    Parameters
    ----------
    density_threshold : float
        Rows with a density greater than or equal to this value are cities.
    filename : str
        Path of the file to read; its first row is skipped as a header.

    Returns
    -------
    (set, set)
        City codes and village codes, taken from column 1. Codes joined by '/'
        are split into individual codes. A code can be in both sets.
    '''
    return _zip_codes_by_density(
        filename, ';', '/', density_threshold,
        lambda row: _density_or_none(row[8], row[7], 1000),
    )


def find_zip_codes_by_town(density_threshold,filename='city_information.tsv'):
    '''Classify the codes of a tab-separated file as city or village codes.

    The density of a row is ``float(row[3]) / float(row[4])``; rows where either
    field is empty, or where ``row[4]`` is zero, are skipped.
    NOTE(review): columns 3 and 4 are presumably population and area - confirm
    against the file header.

    Parameters
    ----------
    density_threshold : float
        Rows with a density greater than or equal to this value are cities.
    filename : str
        Path of the file to read; its first row is skipped as a header.

    Returns
    -------
    (set, set)
        City codes and village codes, taken from column 1. Codes joined by '-'
        are split into individual codes. A code can be in both sets.
    '''
    return _zip_codes_by_density(
        filename, '\t', '-', density_threshold,
        lambda row: _density_or_none(row[3], row[4]),
    )

def city_village_classifier(density_threshold,data):
    '''Replace the postal code of every entry by a city/village class label.

    The code sets come from ``find_zip_codes_by_town(density_threshold)``.
    Entries whose postal code is in the city set get class 1, entries whose code
    is only in the village set get class -1, and entries whose code is in
    neither set are dropped.

    Parameters
    ----------
    density_threshold : float
        Threshold forwarded to ``find_zip_codes_by_town``.
    data : sequence of rows
        Entries whose last element is the postal code.

    Returns
    -------
    numpy.ndarray
        Object array with one row per kept entry: the entry without its postal
        code, followed by the class label. The array has zero rows when no
        entry matches.
    '''
    city_zip_codes , village_zip_codes = find_zip_codes_by_town(density_threshold)
    rows = []
    for entry in data:
        raw_code = str(entry[-1])
        # Postal codes that went through an integer cast have lost their leading
        # zeros ('01000' -> '1000'); also try the code padded back to 5 digits.
        # NOTE(review): assumes 5-digit postal codes in the reference file - confirm.
        candidates = (raw_code, raw_code.zfill(5))
        if any(code in city_zip_codes for code in candidates):
            label = 1
        elif any(code in village_zip_codes for code in candidates):
            label = -1
        else:
            continue
        rows.append(list(entry[:-1]) + [label])

    # Fill an object array cell by cell so that rows mixing dictionaries and
    # integers are stored as-is, and an empty selection yields an empty array
    # instead of an IndexError.
    width = len(rows[0]) if rows else 0
    classified_data = np.empty((len(rows), width), dtype=object)
    for row_index, row in enumerate(rows):
        for column_index, value in enumerate(row):
            classified_data[row_index, column_index] = value
    #  call the garbage collector
    gc.collect()
    return classified_data


In [248]:
# Label every entry as city (1) or village (-1) with a density threshold of 20344.
# NOTE(review): the unit of the threshold depends on the columns of
# city_information.tsv - confirm.
classified_data = city_village_classifier(20344,data)

In [249]:
# Show the first classified entry; its last element is the class label.
print(classified_data[0])

[{'ecolog': 1, 'transit': 1} 1 {} {} 0 {} 0 {} {} {} 0 {} 0 {} {} {}
 {'ecol': 1, 'enseign': 1, 'select': 1, 'tri': 1}
 {'central': 1, 'geotherm': 1, 'multipli': 1} -1]


In [251]:
def word_count_by_question(data,yes_no_questions):
    '''Sum the word counts of every text column over all entries.

    Parameters
    ----------
    data : sequence of rows
        Entries whose last element is the class label; the other elements are
        either integer codes or word-count dictionaries.
    yes_no_questions : collection of int
        Indices of the integer-coded columns, which are skipped.

    Returns
    -------
    list of collections.Counter
        One Counter per remaining column, in column order. An empty ``data``
        gives an empty list.
    '''
    if len(data) == 0:
        return []
    word_count = []
    for i in range(len(data[0])-1):
        if i in yes_no_questions:
            continue
        totals = Counter()
        for entry in data:
            # update() costs O(len(entry[i])); ``totals += Counter(...)`` rescans
            # the whole accumulated counter on every entry.
            totals.update(entry[i])
        # Unary plus drops non-positive counts, as the in-place addition did
        # (counts are expected to be positive, so this is normally a no-op).
        word_count.append(+totals)
    #  call the garbage collector
    gc.collect()
    return word_count

def word_count_total(dictionary_array):
    '''Add up all the word-count mappings of ``dictionary_array`` into one Counter.'''
    return sum((Counter(question) for question in dictionary_array), Counter())

    

In [252]:
# e: one word Counter per text column (columns 1, 4, 6, 10 and 12 are skipped)
# f: the sum of all those Counters
e = word_count_by_question(classified_data,[1,4,6,10,12])
f = word_count_total(e)

In [253]:
# Persist the per-question counts: one CSV row per question, one "('word', count)"
# field per word.
write_data([sort_dictionary(i) for i in e],'word_count_by_question.csv')
# The totals are written as a single row in that same field format. Writing each
# (word, count) tuple as its own row puts the bare word and the bare number in
# separate fields, which read_word_count parses into empty dictionaries.
write_data([sort_dictionary(f)],'word_count_total.csv')

g = read_word_count('word_count_by_question.csv')
h = read_word_count('word_count_total.csv')

In [308]:
def get_most_used_words(data, yes_no_array, word_count_array, number_of_words):
    '''Reduce every text column to the counts of its most frequent words.

    Parameters
    ----------
    data : sequence of rows
        Entries whose last element is the class label; text columns hold
        word-count dictionaries.
    yes_no_array : collection of int
        Indices of the columns copied unchanged.
    word_count_array : sequence of dict
        Overall word counts, one mapping per text column in column order.
    number_of_words : int
        Maximum number of words kept per text column. A column whose
        vocabulary is smaller keeps all its words instead of raising an
        IndexError.

    Returns
    -------
    list of list
        One row per entry. Text columns become a list holding, for each kept
        word, the entry's count of that word (0 when absent); the other
        columns and the trailing label are copied as-is.
    '''
    limit = max(number_of_words, 0)
    most_used_words = [
        [word for word, _ in sort_dictionary(question)[:limit]]
        for question in word_count_array
    ]

    filtered_data = []
    for entry in data:
        entry_array = []
        text_column = 0  # position of the current column among the text columns
        for i in range(len(entry)-1):
            if i in yes_no_array:
                entry_array.append(entry[i])
            else:
                answer = entry[i]
                entry_array.append(
                    [answer.get(word, 0) for word in most_used_words[text_column]]
                )
                text_column += 1
        entry_array.append(entry[-1])
        filtered_data.append(entry_array)

    #  call the garbage collector
    gc.collect()

    return filtered_data

def get_set_features(data,column):
    '''Stack the values found at position ``column`` of every entry into one array.

    The column is read row by row instead of converting all of ``data`` to an
    array first: rows mixing integers and lists of different lengths cannot be
    converted as a whole on recent NumPy versions.

    Parameters
    ----------
    data : sequence of rows
        Entries to read from.
    column : int
        Position of the feature inside every entry.

    Returns
    -------
    numpy.ndarray
        One row per entry, holding ``entry[column]`` converted to an array.
    '''
    return np.array([np.asarray(entry[column]) for entry in data])

In [283]:
# Keep, for every text column, the counts of its 10 most frequent words.
filtered_data = get_most_used_words(classified_data,[1,4,6,10,12],e,10)

In [309]:
# Feature matrix built from column 2 of the filtered entries
get_set_features(filtered_data,2)

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 3],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])