In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy.sparse as sp

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ofercoq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

In [4]:
stemmer = FrenchStemmer()

In [5]:
path = "./"  # "/data/ofercoq/datasets/granddebat/"
file = "la-transition-ecologique.csv"
df = pd.read_csv(path+file, low_memory=False)
pd.set_option('display.max_columns', 27)

In [6]:
train_data = df[df.columns[[2, 11, 12, 16, 17, 18, 20, 22, 23, 24, 25, 26]]]
train_labels = df[df.columns[13]]

In [7]:
no_answer = train_labels.isnull()
train_data = train_data[~no_answer]
train_labels = train_labels[~no_answer]
train_data[train_data.isnull()] = ''

In [8]:
train_data_concatenated = train_data[train_data.columns[0]]
for i in range(train_data.shape[1]-1):
    train_data_concatenated += " "+train_data[train_data.columns[i+1]]
train_data_concatenated_array = np.array(train_data_concatenated)
train_data_concatenated_array.shape

(138562,)

In [9]:
#train_data_concatenated_array_stemmed = train_data_concatenated_array.copy()
#for i in range(train_data_concatenated_array.shape[0]):
#    train_data_concatenated_array_stemmed[i] = ""
#    for word in word_tokenize(train_data_concatenated_array[i]):
#        train_data_concatenated_array_stemmed[i] += stemmer.stem(word) + " "

In [10]:
def tokenize_map(text):
    tokens = nltk.word_tokenize(text)
    stems = list(map(stemmer.stem, tokens))
    return stems

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = tokens
    i = 0
    for item in tokens:
        stems[i] = stemmer.stem(item)
        i = i+1
    return stems

In [11]:
tf_transformer = TfidfVectorizer(tokenizer=tokenize_map, 
                                 max_df=0.9, min_df=500, stop_words=["'", "''", '!', '%', '#', '*',
                        '&', '(', ')', '+', '-', '.', '..', '...', '/', ':', ';', '=', '>', '?', '``', 'a'])
TF_IDF_matrix = tf_transformer.fit_transform(train_data_concatenated)
TF_IDF_feature_names = tf_transformer.get_feature_names()

In [12]:
# TF_IDF_feature_names

In [13]:
np.array(TF_IDF_feature_names)[np.nonzero(TF_IDF_matrix[1542])[1]]

array(['solidaire', 'mobilités', 'appui', 'avait', 'sites', 'privilégie',
       'change', 'prenant', 'preuve', 'tard', 'devoir', 'attend',
       'majorité', 'public', 'deja', 'véritable', 'circuits',
       'réellement', 'privés', 'initiatives', 'pense', 'couvert',
       'important', 'mette', 'sélectif', 'fenêtres', 'nationale', 'sera',
       'transition', 'aides', 'ça', 'évite', 'logement', 'été',
       'changement', 'courts', 'trop', 'déplacements', 'écologie', 'fais',
       'mon', 'tri', 'faire', 'isolation', 'elles', 'si', 'utilise',
       'voiture', 'marche', 'temps', 'forme', 'service', 'problème',
       'cela', 'monde', 'mise', 'locales', 'ont', 'ces', 'idées',
       'bonnes', 'état', 'déjà', 'alimentation', 'toute', 'sous',
       'passer', 'pourquoi', 'ce', 'toutes', 'je', 'qu', 'beaucoup', 'ai',
       'mettre', 'autres', 'que', 'on', 'est', 'tout', 'économique',
       'politique', 'qui', 'mais', 'par', 'sur', 'écologique', 'un', 'du',
       'une', 'pas', 'plus', '

In [14]:
TF_IDF_matrix[1542][np.nonzero(TF_IDF_matrix[1542])]

matrix([[0.13962582, 0.14714463, 0.15782209, 0.11884102, 0.12228986,
         0.11587404, 0.1236382 , 0.12588589, 0.12831158, 0.11875968,
         0.12900338, 0.14621128, 0.11824537, 0.18139695, 0.13414498,
         0.11057821, 0.09045538, 0.10221698, 0.11494557, 0.10907544,
         0.08526128, 0.16112795, 0.09230105, 0.14469918, 0.08073706,
         0.11437492, 0.1045019 , 0.09165232, 0.12023047, 0.06556716,
         0.08561181, 0.11028289, 0.08807074, 0.08617632, 0.07529419,
         0.17765961, 0.06607831, 0.06637952, 0.07283861, 0.07053236,
         0.10802259, 0.05616935, 0.12770939, 0.0663099 , 0.08607056,
         0.05571996, 0.07525806, 0.04641824, 0.08504549, 0.07186995,
         0.11528173, 0.17900767, 0.07802197, 0.06022736, 0.06978846,
         0.08047998, 0.08038063, 0.06844579, 0.05794964, 0.11198053,
         0.108003  , 0.06018817, 0.21835508, 0.09023761, 0.07850986,
         0.08584118, 0.08317245, 0.08131616, 0.04406873, 0.06822774,
         0.19460176, 0.04835351, 0

In [29]:
sp.save_npz('tfidf_matrix.npz', TF_IDF_matrix)
np.save('feature_names.npy', TF_IDF_feature_names)
np.save('train_labels.npy', np.array(train_labels))
# sparse_matrix = scipy.sparse.load_npz('/tmp/sparse_matrix.npz')