# SVM en pratique

On compare l'algorithme SVM au NaiveBayes sur les jeux de données:

- détection de langue
- data type
- iris (voir dans le dépôt MachineLearning: https://github.com/Evanior/MachineLearning)


In [1]:
from sklearn.svm import NuSVC


In [2]:
# On importe les données pour le projet "Data Type"
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Human-readable name of each class label used in the "Data Type" data file.
ref = {
    "0": "float",
    "1": "int",
    "2": "Code postal",
    "3": "Coordonnées GPS",
    "4": "Adresse",
    "5": "SIRET",
    "6": "Année",
    "7": "Date",
    "8": "SIREN",
    "9": "Code NAF",
    "10": "Autre",
    "11": "Telephone",
}

# Parallel lists: y_vector[k] is the label of raw_documents_vector[k].
y_vector = []
raw_documents_vector = []

file_name = "../../all_data/data_for_project_datatype.txt"
# Explicit encoding: without it the decoding depends on the platform's
# locale default (the language-detection data below is also read as utf8).
with open(file_name, "rt", encoding="utf8") as f:
    for line in f:
        # Each line is "<label> <text>"; cut on the first space only so the
        # text keeps its own spaces.
        y, separator, text = line.partition(" ")
        if not separator:
            # No space on the line (e.g. a blank line): there is no text
            # part. Indexing the split result used to raise IndexError here.
            continue
        text = text.strip()
        # The length of the text is prepended to the document.
        raw_document = str(len(text)) + " " + text
        # raw_document always contains at least the length prefix, so only
        # the label can be empty.
        if y != "":
            y_vector.append(y)
            raw_documents_vector.append(raw_document)

N_SAMPLES = len(y_vector)
print(N_SAMPLES, "samples")

368778 samples


In [3]:
# Vectorizer for the "Data Type" documents: binary presence (not counts) of
# character n-grams of size 1 to 3, built with the "char_wb" analyzer.
NGRAM_RANGE = (1, 3)

count_vectorizer = CountVectorizer(
    analyzer="char_wb",
    ngram_range=NGRAM_RANGE,
    binary=True,
    input="content",
    stop_words=None,
)

In [5]:
# Hold out 30% of the samples for testing. The split is shuffled, so
# random_state is fixed to make the split (and the scores computed from it)
# reproducible from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_documents_vector,
    y_vector,
    shuffle=True,
    test_size=0.3,
    train_size=None,
    random_state=42,
)

In [7]:
# On compare LinearSVC au naive bayes
from sklearn.svm import NuSVC, SVC, LinearSVC
from datetime import datetime
import numpy as np

from sklearn.base import clone

# Linear SVM classifier, compared below against multinomial naive Bayes.
svc_classifier = LinearSVC()

# Pipelines to compare. Pipeline does not copy its steps, so each pipeline
# gets its own clone of the vectorizer instead of sharing one instance that
# the second fit would refit under the first pipeline.
pipelines = [
    Pipeline([("vectorizer", clone(count_vectorizer)), ("classifier", MultinomialNB())]),
    Pipeline([("vectorizer", clone(count_vectorizer)), ("classifier", svc_classifier)]),
]

for i, pipeline in enumerate(pipelines):
    start = datetime.now()
    print("pipeline", i)
    print("start training")
    pipeline.fit(X_train, Y_train)
    print("training done !")
    # total_seconds() is the whole elapsed time; the .seconds attribute is
    # only the seconds component of the timedelta (days are left out).
    print("duration", int((datetime.now() - start).total_seconds()), "secondes")

    # Predict on the held-out test samples.
    predicted = pipeline.predict(X_test)

    # Element-wise correctness mask, computed once and reused for all three
    # figures. Y_test is converted explicitly so the comparison is
    # array-to-array.
    is_correct = predicted == np.asarray(Y_test)
    print("correct ones", np.sum(is_correct))
    print("false ones", np.sum(~is_correct))
    print("ratio:", np.mean(is_correct))


pipeline 0
start training
training done !
duration 16 secondes
correct ones 110427
false ones 207
ratio: 0.9981289657790553
pipeline 1
start training
training done !
duration 43 secondes
correct ones 110620
false ones 14
ratio: 0.9998734566227381


## On teste avec la détection de langue

In [8]:
# on charge les données
from sklearn.feature_extraction.text import CountVectorizer

# Input documents.
raw_documents_vector = []
# Expected output (label) of each document, in the same order.
y_vector = []

with open("data.txt", "rt", encoding="utf8") as f:
    for line in f:
        if not line.strip():
            # A blank line would otherwise be stored as a sample whose label
            # is "\n" and whose document is empty.
            continue
        # The first character is the label; the document starts at index 2,
        # i.e. after one separator character.
        y = line[0:1]
        doc = line[2:]
        y_vector.append(y)
        raw_documents_vector.append(doc)

print("samples", len(y_vector))


samples 12915


In [10]:
# Hold out 30% of the samples for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_documents_vector,
    y_vector,
    test_size=0.3,
    train_size=None,
    random_state=42,
)


In [None]:
# Word-level features: binary presence of unigrams and bigrams.
NGRAM_RANGE = (1, 2)

vectorizer = CountVectorizer(
    input="content",
    analyzer="word",
    ngram_range=NGRAM_RANGE,
    stop_words=None,
    binary=True,
)

clf_nb = MultinomialNB()

# Only the naive Bayes pipeline is defined here.
pipelines = [
    Pipeline([("vectorizer", vectorizer), ("multinomial naive bayes", clf_nb)])
]

for i, pipeline in enumerate(pipelines):
    start = datetime.now()
    print("pipeline", i)
    print("start training")
    pipeline.fit(X_train, Y_train)
    # total_seconds() is the whole elapsed time; the .seconds attribute is
    # only the seconds component of the timedelta (days are left out).
    print("duration", int((datetime.now() - start).total_seconds()), "secondes")
    
    
    

pipeline 0
start training


## Autre jeu de données: les iris



