# Project DataType

Les étapes:

- lire le fichier data_for_project_datatype.txt
- extraire les données entrées/sorties (liste de documents et liste des classes)
- on construit le pipeline
    - on va tester plusieurs vectorizers : CountVectorizer (N-grams, binaire ou pas, etc.) et TfidfVectorizer. On utilise la granularité du caractère.
    - on utilise le Bayesian classifier: MultinomialNB.
- on vérifie à l'œil si ça marche ou pas
- on fait varier les hyperparamètres pour trouver le meilleur score (cross validation score).




In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# Readable type names, ordered by class label: the name at position k
# belongs to the label str(k) found in the data file.
_TYPE_NAMES = (
    "float",
    "int",
    "Code postal",
    "Coordonnées GPS",
    "Adresse",
    "SIRET",
    "Année",
    "Date",
    "SIREN",
    "Code NAF",
    "Autre",
    "Telephone",
)

# Lookup table: class label (string) -> readable type name.
ref = {str(label): name for label, name in enumerate(_TYPE_NAMES)}

# Parse the labelled corpus. Each line is expected to read "<label> <text>".
y_vector = []
raw_documents_vector = []

file_name = "./data/project_data_type/data_for_project_datatype.txt"
with open(file_name, "rt") as f:
    for line in f:
        # Split on the first space only: the text itself may contain spaces.
        y, separator, text = line.partition(" ")

        # Skip blank or malformed lines (no separator, or no label) instead
        # of crashing with an IndexError on the missing text field.
        if not separator or not y:
            continue

        text = text.strip()
        # The text length is prepended so that it becomes part of the
        # document handed to the vectorizer.
        # (Alternative tried earlier: use the bare text, without the prefix.)
        raw_document = str(len(text)) + " " + text

        # Keep the samples in two parallel lists.
        y_vector.append(y)
        raw_documents_vector.append(raw_document)

print(len(y_vector), "données")

# Build the classification pipeline: character n-gram features followed by
# a multinomial naive Bayes classifier.
NGRAM_RANGE = (1, 3)

# Binary (presence/absence) character n-grams, using the "char_wb" analyzer.
count_vectorizer = CountVectorizer(
    analyzer="char_wb",
    binary=True,
    input="content",
    ngram_range=NGRAM_RANGE,
    stop_words=None,
)

# TF-IDF alternative over plain character n-grams. It is built here but is
# not plugged into the pipeline below.
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char",
    input="content",
    ngram_range=NGRAM_RANGE,
    stop_words=None,
)

pipeline_steps = [
    ("vectorizer", count_vectorizer),
    ("classifier", MultinomialNB()),
]
pipeline = Pipeline(pipeline_steps)

def get_text_length(x):
    """Return the length of every item of *x* as a column vector.

    The result is a 2-D numpy array with one row per item of *x* and a
    single column holding ``len(item)``.
    """
    lengths = list(map(len, x))
    return np.asarray(lengths).reshape(-1, 1)

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

# pipeline = Pipeline([
#     ('features', FeatureUnion([
#         ('text', Pipeline([
#             ('vectorizer', count_vectorizer),
#         ])),
#         ('length', Pipeline([
#             ('count', FunctionTransformer(get_text_length, validate=False)),
#         ]))
#     ])),
#     ('clf', MultinomialNB() )])

# Split the corpus into a training sample and a held-out test sample.
# The seed is fixed so that the split, and therefore every score computed
# from it, is identical from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_documents_vector,
    y_vector,
    shuffle=True,
    test_size=0.2,
    train_size=None,
    random_state=42,
)

print("données entrainement:", len(X_train))
print("données de test     :", len(X_test))

# Fit the whole pipeline (vectorizer + classifier) on the training sample.
print("start training")
pipeline.fit(X_train, Y_train)
print("training done !")

import numpy as np

# Predict the class of every held-out document, then count hits and misses.
predicted = pipeline.predict(X_test)

# Element-wise comparison of the predictions with the expected labels.
is_correct = predicted == Y_test

print("correct ones", np.sum(is_correct))
print("false ones", np.sum(predicted != Y_test))
print("ratio:", np.mean(is_correct))

368778 données
données entrainement: 295022
données de test     : 73756
start training
training done !
correct ones 73605
false ones 151
ratio: 0.9979527089321547


In [16]:
# Score of the fitted pipeline on the held-out sample, then on the
# training sample (a large gap between the two would hint at overfitting).
test_score = pipeline.score(X_test, Y_test)
print("score on test:", test_score)
train_score = pipeline.score(X_train, Y_train)
print("score on train:", train_score)

score on test: 0.9979527089321547
score on train: 0.9985221441112866


In [13]:
# Print the first i_max + 1 entries of the fitted vocabulary, in the
# iteration order of the vocabulary dictionary.
i_max = 10
for _, word in zip(range(i_max + 1), count_vectorizer.vocabulary_):
    print(word)

 
8
 8
8 
 8 
2
0
1
3
9
 2


In [17]:
# Walk over the positions of the misclassified test documents.
# The loop body is a placeholder: uncomment the print to list each error.
misclassified = np.where(predicted != Y_test)[0]
for i in misclassified:
    pass
    # print(i, "|", X_test[i][0:100].strip(), "|", ref[str(Y_test[i])],"-> predicted: ", ref[str(predicted[i])])

In [8]:
# Classify a value typed by the user. The input gets the same
# "<length> <text>" encoding as the documents the pipeline was trained on.
user_text = input("Saisir: ")
data = str(len(user_text)) + " " + user_text
print(data)

predicted_label = pipeline.predict([data])[0]
print("classe:", ref[predicted_label])

9 566201120
classe: Code postal


## On teste une autre vectorisation

- On fixe la taille du vecteur d'entrée.
- On enlève les espaces du texte
- Le texte est directement mis en entrée


In [91]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np

# Lookup table: class label (string "0" to "11") -> readable type name.
# Labels are generated in order and paired with the names listed below.
ref = dict(
    zip(
        map(str, range(12)),
        (
            "float",
            "int",
            "Code postal",
            "Coordonnées GPS",
            "Adresse",
            "SIRET",
            "Année",
            "Date",
            "SIREN",
            "Code NAF",
            "Autre",
            "Telephone",
        ),
    )
)

# Read the prepared corpus. Every line is kept untouched as the raw
# document (label, text and line ending); the label is whatever precedes
# the first space.
y_vector = []
raw_documents_vector = []

file_name = "./data/project_data_type/data_ready.txt"
with open(file_name, "rt") as f:
    for line in f:
        y = line.partition(" ")[0]
        raw_document = line

        if y and raw_document:
            y_vector.append(y)
            raw_documents_vector.append(raw_document)

print(len(y_vector), "données")

358920 données


In [92]:
# Show the first raw document of the corpus.
first_document = raw_documents_vector[0]
print(first_document)

8 435315742



In [93]:
# Length, in characters, of the longest raw document of the corpus.
print(max(map(len, raw_documents_vector)))

41


In [103]:
# Hard-coded dimensions of the data.
# Width of every encoded document (maximum number of characters kept).
N_FEATURES = 50
# Number of documents in the corpus.
N_SAMPLES = len(raw_documents_vector)

# Alphabet used to turn characters into integers: the code of a character
# is its position in this string.
CHARS = "abcdefghijklmnopqrstuvwxyz0123456789 .,-/"

# Code stored for unused slots and for characters missing from CHARS.
DEFAULT_IF_ABSENT = 50

# Integer matrix with N_SAMPLES rows and N_FEATURES columns, pre-filled
# with the filler code.
X = np.full((N_SAMPLES, N_FEATURES), DEFAULT_IF_ABSENT, dtype=int)
# Output vector: one class slot per document, to be filled in afterwards.
y = [None] * N_SAMPLES

def vectorize(text, *, n_features=None, chars=None, default_if_absent=None):
    """Encode *text* as a fixed-length vector of character codes.

    The text is lower-cased and stripped of surrounding whitespace, then
    each character is replaced by its position in the alphabet. Characters
    missing from the alphabet, and the unused slots after the end of the
    text, hold the filler code. Text longer than the vector is truncated
    instead of raising an IndexError.

    Args:
        text: The string to encode.
        n_features: Length of the returned vector. Defaults to the
            module-level ``N_FEATURES``.
        chars: Alphabet string; the code of a character is its index in it.
            Defaults to the module-level ``CHARS``.
        default_if_absent: Filler code. Defaults to the module-level
            ``DEFAULT_IF_ABSENT``.

    Returns:
        A 1-D integer numpy array of length ``n_features``.
    """
    # The module-level constants are only read when no override is given.
    if n_features is None:
        n_features = N_FEATURES
    if chars is None:
        chars = CHARS
    if default_if_absent is None:
        default_if_absent = DEFAULT_IF_ABSENT

    text = text.lower().strip()
    # Start from a vector full of filler codes, so only known characters
    # need to be written.
    vec = np.full(n_features, default_if_absent, dtype=int)
    # Truncate to the vector width: extra characters have no slot.
    for i_feature, carac in enumerate(text[:n_features]):
        char_index = chars.find(carac)
        if char_index > -1:
            vec[i_feature] = char_index
    return vec

# Sanity check: encode a sample address and print the resulting vector.
sample_vector = vectorize("5 rue du temple")
print(sample_vector)

[31 36 17 20  4 36  3 20 36 19  4 12 15 11  4 50 50 50 50 50 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50]


In [110]:

# Loop over the documents of the corpus to fill X (features) and y (labels).
for i_sample, doc in enumerate(raw_documents_vector):
    # The class label is everything before the first space of the document;
    # the rest is the text to encode.
    doc_class, _, doc_text = doc.partition(" ")

    # Record the label. Without this assignment every entry of y stays None
    # and the conversion after the loop turns all labels into the single
    # string "None", so the classifier has only one class to learn.
    y[i_sample] = doc_class.strip()

    # Encode the text as a fixed-length row of character codes.
    X[i_sample, :] = vectorize(doc_text)

    # Progress / sanity output every 100000 documents.
    if i_sample % 100000 == 0:
        print("-"*50)
        print("doc", doc)
        print("y", y[i_sample])
        print("text", doc_text)
        print("X", X[i_sample])

y = np.array(y, dtype=str)

--------------------------------------------------
doc 8 435315742

y 8
text 435315742

X [30 29 31 29 27 31 33 30 28 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50]
--------------------------------------------------
doc 4 5 RUE DE LA PERRIERE

y 4
text 5 RUE DE LA PERRIERE

X [31 36 17 20  4 36  3  4 36 11  0 36 15  4 17 17  8  4 17  4 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50]
--------------------------------------------------
doc 3 1.07184 49.33819

y 3
text 1.07184 49.33819

X [27 37 26 33 27 34 30 36 30 35 37 29 29 34 27 35 50 50 50 50 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50]
--------------------------------------------------
doc 3 5.32313 43.36364

y 3
text 5.32313 43.36364

X [31 37 29 28 29 27 29 36 30 29 37 29 32 29 32 30 50 50 50 50 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 

In [112]:
# Split the full sample into one part for training and one part for testing.
# The seed is fixed so that the split, and therefore every score computed
# from it, is identical from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    shuffle=True,
    test_size=0.3,
    train_size=None,
    random_state=42,
)

# NOTE(review): MultinomialNB is documented for count-like features, while
# X holds character codes (positions in an alphabet) — confirm that feeding
# those codes directly is intended.
nb = MultinomialNB()
nb.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [123]:
# Evaluate on the held-out sample: sample count, number of hits, hit ratio.
predictions = nb.predict(X_test)

total = Y_test.shape[0]
total_good = (predictions == Y_test).sum()
print(total, total_good)
print("good ratio", total_good / total)

# Print the encoded rows of the misclassified test documents.
(error_rows,) = np.where(predictions != Y_test)
print(X_test[error_rows, :])

# output = nb.predict(np.transpose(vectorize("place super 49756 angers")[:, np.newaxis]))
# print(output)

107676 93372
good ratio 0.8671570266354619
[[28 26 27 ... 50 50 50]
 [28 29 35 ... 50 50 50]
 [28 26 27 ... 50 50 50]
 ...
 [28 26 27 ... 50 50 50]
 [28 26 27 ... 50 50 50]
 [28 26 27 ... 50 50 50]]
