# SVM en pratique

On compare l'algorithme SVM au classifieur Naive Bayes sur les jeux de données suivants :

- détection de langue
- data type
- iris (voir dans le dépôt MachineLearning: https://github.com/Evanior/MachineLearning)


In [2]:
from sklearn.svm import NuSVC


In [3]:
# On importe les données pour le projet "Data Type"
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Human-readable name of every data-type class. The names are listed in
# order and each one is keyed by its position written as a string
# ("0", "1", ...). Presumably these keys match the label found at the start
# of each line of the data file -- confirm against the data set.
ref = {
    str(position): type_name
    for position, type_name in enumerate(
        [
            "float",
            "int",
            "Code postal",
            "Coordonnées GPS",
            "Adresse",
            "SIRET",
            "Année",
            "Date",
            "SIREN",
            "Code NAF",
            "Autre",
            "Telephone",
        ]
    )
}

# Load the "Data Type" samples. Each line of the file is "<label> <text>":
# the label is everything before the first space, the text is the rest.
y_vector = []
raw_documents_vector = []

file_name = "../../all_data/data_for_project_datatype.txt"
# The encoding is explicit so that decoding does not depend on the platform
# default (assumes the file is UTF-8, like data.txt read further down).
with open(file_name, "rt", encoding="utf8") as f:
    for line in f:
        # partition() never fails: a line without any space simply yields an
        # empty text, where indexing the result of split() raised IndexError.
        y, separator, text = line.partition(" ")
        text = text.strip()
        # Skip blank or malformed lines. The emptiness test has to be done on
        # the text itself: once the length prefix is prepended the document
        # can never be empty, so testing the document filtered nothing.
        if not y or not text:
            continue
        # The text length is prepended as an extra leading token (presumably
        # so that the vectorizer can use the length as a feature).
        raw_document = str(len(text)) + " " + text
        y_vector.append(y)
        raw_documents_vector.append(raw_document)

N_SAMPLES = len(y_vector)
print(N_SAMPLES, "samples")

368778 samples


In [4]:
# Build the vectorizer that both pipelines use as their first step.
# It works on character n-grams ("char_wb" analyzer) of length 1 to 3.
NGRAM_RANGE = (1, 3)

count_vectorizer = CountVectorizer(
    analyzer="char_wb",
    binary=True,
    input="content",
    ngram_range=NGRAM_RANGE,
    stop_words=None,
)

In [5]:
# Prepare the training and test sets: the samples are shuffled and 30% of
# them are held out for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_documents_vector, y_vector, test_size=0.3, train_size=None, shuffle=True
)

In [6]:
# Compare a linear SVM (LinearSVC) with multinomial naive Bayes.
from datetime import datetime

import numpy as np
from sklearn.base import clone
from sklearn.svm import NuSVC, SVC, LinearSVC

# Build the linear SVM classifier.
svc_classifier = LinearSVC()

# Pipelines to compare. Each one receives its own copy of the vectorizer:
# a Pipeline does not clone its steps, so passing the same instance twice
# would make both pipelines share (and refit) a single fitted vectorizer.
pipelines = [
    Pipeline([("vectorizer", clone(count_vectorizer)), ("classifier", MultinomialNB())]),
    Pipeline([("vectorizer", clone(count_vectorizer)), ("classifier", svc_classifier)]),
]

for i, pipeline in enumerate(pipelines):
    start = datetime.now()
    print("pipeline", i)
    print("start training")
    pipeline.fit(X_train, Y_train)
    print("training done !")
    # total_seconds() gives the whole elapsed time including fractions;
    # timedelta.seconds is only the truncated seconds component.
    elapsed = (datetime.now() - start).total_seconds()
    print("duration", round(elapsed, 2), "secondes")

    # Predict on the held-out test sample.
    predicted = pipeline.predict(X_test)

    # Report the results. The expected labels are converted to an array so
    # that the comparison is explicitly element-wise.
    is_correct = predicted == np.asarray(Y_test)
    print("correct ones", np.sum(is_correct))
    print("false ones", np.sum(~is_correct))
    print("ratio:", np.mean(is_correct))


pipeline 0
start training
training done !
duration 21 secondes
correct ones 110434
false ones 200
ratio: 0.9981922374676863
pipeline 1
start training
training done !
duration 39 secondes
correct ones 110616
false ones 18
ratio: 0.9998373013720918


## On teste avec la détection de langue

In [7]:
# Load the language-detection samples.
from sklearn.feature_extraction.text import CountVectorizer

with open("data.txt", "rt", encoding="utf8") as f:
    lines = f.readlines()

# Output: the label is the first character of each line.
y_vector = [line[0:1] for line in lines]
# Input: the document is the line from its third character onwards
# (the trailing newline, if any, is kept).
raw_documents_vector = [line[2:] for line in lines]

print("samples", len(y_vector))


samples 12915


In [8]:
# Split the language samples, holding out 30% of them for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_documents_vector,
    y_vector,
    train_size=None,
    test_size=0.3,
)


In [None]:
# Compare multinomial naive Bayes with a linear SVM on language detection,
# using word unigrams and bigrams as features.
from sklearn.base import clone

NGRAM_RANGE = (1, 2)

vectorizer = CountVectorizer(
    input="content",
    analyzer="word",
    ngram_range=NGRAM_RANGE,
    stop_words=None,
    binary=True
)

clf_nb = MultinomialNB()
clf_svm = LinearSVC()

# Each pipeline receives its own copy of the vectorizer: a Pipeline does not
# clone its steps, so passing the same instance twice would make both
# pipelines share (and refit) a single fitted vectorizer.
pipelines = [
    Pipeline([("vectorizer", clone(vectorizer)), ("multinomial naive bayes", clf_nb)]),
    Pipeline([("vectorizer", clone(vectorizer)), ("Linear SVM", clf_svm)]),
]

for i, pipeline in enumerate(pipelines):
    start = datetime.now()
    print("pipeline", i)
    print("start training")
    pipeline.fit(X_train, Y_train)
    # total_seconds() gives the whole elapsed time including fractions;
    # timedelta.seconds is only the truncated seconds component.
    elapsed = (datetime.now() - start).total_seconds()
    print("duration", round(elapsed, 2), "secondes")

    # Predict on the held-out test sample.
    predicted = pipeline.predict(X_test)

    # Report the results. The expected labels are converted to an array so
    # that the comparison is explicitly element-wise.
    is_correct = predicted == np.asarray(Y_test)
    print("correct ones", np.sum(is_correct))
    print("false ones", np.sum(~is_correct))
    print("ratio:", np.mean(is_correct))
    

pipeline 0
start training
duration 136 secondes
correct ones 3860
false ones 15
ratio: 0.9961290322580645
pipeline 1
start training
duration 141 secondes


## Autre jeu de données: les iris (les fleurs)

3 espèces et 4 dimensions.

In [None]:
import pandas as pd

# Load the iris data set from its CSV file into a DataFrame.
df = pd.read_csv("../../all_data/iris.csv")
# Last expression of the cell: displays the first rows.
df.head()

In [None]:
# List the distinct values of the "species" column.
df["species"].unique()

In [None]:
import seaborn as sn

In [None]:
# Pairwise plots of the columns, coloured by species, with histograms on the
# diagonal.
sn.pairplot(data=df, hue="species", diag_kind="hist")

In [None]:
# Keep only two dimensions (petal length and petal width) plus the species,
# and drop the "virginica" rows so that only the other species remain
# (versicolor and setosa).
data = df.loc[
    df["species"] != "virginica",
    ["petal_length", "petal_width", "species"],
]


In [None]:
# Pairwise plots of the reduced data set, coloured by species.
sn.pairplot(data, hue="species")

In [None]:
# Split the sample: the rows are shuffled and 30% of them are held out for
# testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[["petal_length", "petal_width"]], data["species"],
    shuffle=True, test_size=0.3, train_size=None)

# len() counts the rows, i.e. the samples. DataFrame.size is rows x columns,
# which reported twice the number of samples for these two-column frames.
print("train", len(X_train))
print("test", len(X_test))
data.head()

In [None]:
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

# Try naive Bayes, a linear SVM and k-nearest neighbours. Each classifier is
# wrapped in a single-step pipeline whose step name doubles as a label.
from sklearn.neighbors import KNeighborsClassifier
pipelines = [
    Pipeline([("Naive Bayes", MultinomialNB())]),
    Pipeline([("Linear SVC", LinearSVC())]),
    Pipeline([("KNN", KNeighborsClassifier(n_neighbors=5, algorithm="kd_tree"))]),
]

# Convert the pandas DataFrame to NumPy arrays.
X = data[["petal_length", "petal_width"]].values
Y = data["species"].values
# Encode the classes as integers: "setosa" -> 0, anything else -> 1
# (only "versicolor" is expected besides "setosa"). The integer labels are
# the ones passed to plot_decision_regions below.
Y_as_int = np.where(Y == "setosa", 0, 1)

for pipeline in pipelines:
    name, clf = pipeline.steps[0]
    print("-"*50)
    start = datetime.now()
    print("pipeline", name)
    print("start training")
    pipeline.fit(X_train, Y_train)
    # total_seconds() keeps the fractional part; timedelta.seconds is only
    # the truncated seconds component and would print 0 for a sub-second fit.
    elapsed = (datetime.now() - start).total_seconds()
    print("duration", round(elapsed, 4), "secondes")

    # Predict on the held-out test sample.
    predicted = pipeline.predict(X_test)

    # Report the results. The expected labels are converted to an array so
    # that the comparison is explicitly element-wise.
    is_correct = predicted == np.asarray(Y_test)
    print("correct ones", np.sum(is_correct))
    print("false ones", np.sum(~is_correct))
    print("ratio:", np.mean(is_correct))

    print(clf)
    fig = plt.figure()
    # Refit on the whole two-species data set with integer labels before
    # plotting; clf is the very object held by the pipeline, so fitting the
    # pipeline fits it.
    pipeline.fit(X, Y_as_int)
    plot_decision_regions(X=X, y=Y_as_int, clf=clf, legend=2)
    # Title each figure so the three plots can be told apart.
    plt.title(name)
