# 1- Mise en place de l'environnement de travail

### (a)	Installer Spark 


In [126]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
# unzip it
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
# install findspark 
!pip install -q findspark

In [127]:
!pip install pandavro

### (b) Définir la valeur de la variable d’environnement

In [125]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.12:3.0.1 pyspark-shell'
#os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[4] pyspark-shell"

import findspark
findspark.init("spark-3.0.1-bin-hadoop2.7")

### (c)	Créer un objet SparkContext

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F

conf = SparkConf().setAppName("Projet Big Data").set("spark.driver.memory", "10g") 
sc = SparkContext(conf=conf)

In [2]:
sc

### (d) Créer un objet de type SparkSession. 

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# 2- Données

### (a) Charger les données dans deux variables de type RDD 

In [5]:
altheism = sc.wholeTextFiles('./data/20_newsgroups/alt.atheism/*')
baseball = sc.wholeTextFiles('./data/20_newsgroups/rec.sport.baseball/*')

In [6]:
altheismPath = altheism.map(lambda a : a[0])
altheismText = altheism.map(lambda a : a[1])

baseballPath = baseball.map(lambda a : a[0])
baseballText = baseball.map(lambda a : a[1])

In [7]:
print(altheism.count())
print(baseball.count())

1000
1000


### (b)	Séparer le corps du message de l’entête.

In [9]:
import re

def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches

def findSeparator(elem):
    findSeparator = re.search('Lines: [0-9]+\n+',elem)
    if (findSeparator):
        return findSeparator.end()
    else:
        findSeparator = re.search('GMT\n', elem) # Dans quelques documents de baseball il n'y a pas de "Lines: ",  dans ce cas là la délimitation est en fonction de la date
    return findSeparator.end()

def extractHeadAndbody(elem):
    separator = findSeparator(elem)
    return (elem[0:separator].strip(), elem[separator:-1].strip())

In [10]:
altheismSeparate = altheismText.map(extractHeadAndbody)
baseballSeparate = baseballText.map(extractHeadAndbody)

### (e)	Extraire quelques champs de l’entête, par exemple l’organisation et la catégorie (champ “News-groups”).


In [11]:
def extractSomeFieldsHead(elem, category):
    head = elem[0]
    dictInformations = dict()

    newGroup = 'Newsgroups: '
    findNewGroup = head.find(newGroup)
    endNewGroup = head[findNewGroup:-1].find('\n')
    dictInformations['newsGroups'] = head[(findNewGroup + len(newGroup)):(findNewGroup + endNewGroup)].strip()

    lines = 'Lines: '
    findLines = head.find(lines)
    dictInformations['lines'] = head[(findLines + len(lines)):-1].strip()

    organization = 'Organization: '
    findOrganization = head.find(organization)
    endOrganization = head[findOrganization:-1].find('\n')
    dictInformations['organization'] = head[(findOrganization + len(organization)):(findOrganization + endOrganization)].strip()

    subject = 'Subject: '
    findSubject = head.find(subject)
    endSubject = head[findSubject:-1].find('\n')
    dictInformations['subject'] = head[(findSubject + len(subject)):(findSubject + endSubject)].strip()

    date = 'Date: '
    findDate = head.find(date)
    endDate = head[findDate:-1].find('\n')
    dictInformations['date'] = head[(findDate + len(date)):(findDate + endDate)].strip()

    dictInformations['category'] = category

    return (dictInformations, elem[1])

In [12]:
altheismSomeHeadBody = altheismSeparate.map(lambda elem: extractSomeFieldsHead(elem,'altheism'))
baseballSomeHeadBody = baseballSeparate.map(lambda elem: extractSomeFieldsHead(elem,'baseball'))

### (f)	Fusionner les deux RDD


In [13]:
altheismBaseball = altheismSomeHeadBody.union(baseballSomeHeadBody)

### (g)	Transformer le nouveau RDD obtenu pour que chaque élément soit de type pyspark.sql.Row


In [14]:
from pyspark.sql import Row

def elemToRowObject(elem):
    head = elem[0]
    body = elem[1]
    return Row(newsGroups = head['newsGroups'], category = head['category'], lines = head['lines'], organization=head['organization'], subject=head['subject'], date=head['date'], body=body)

In [15]:
rowAltheismBaseball = altheismBaseball.map(elemToRowObject)

### (h)	Créer un objet de type DataFrame à partir du RDD précédent


In [16]:
df = spark.createDataFrame(rowAltheismBaseball)
df.printSchema()
df.show()

root
 |-- newsGroups: string (nullable = true)
 |-- category: string (nullable = true)
 |-- lines: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- body: string (nullable = true)

+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+
|          newsGroups|category|lines|        organization|             subject|                date|                body|
+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+
|alt.atheism,alt.a...|altheism|   29|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 29 Mar 1993 ...|Archive-name: ath...|
|alt.atheism,alt.a...|altheism|   64|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 5 Apr 1993 1...|Archive-name: ath...|
|         alt.atheism|altheism|    9|Technical Univers...|   Re: Gospel Dating|Mon, 5 Apr 1993 1...|In article

### (i,j)	Sauvegarder la DataFrame au format Avro et Parquet


In [124]:
df.write.format("avro").save("./output/df.avro")
df.write.parquet("./output/df.parquet")

# 3- Analyse descriptive

### (a)	Vérifier qu’on a bien deux catégories différentes de documents

In [23]:
df.select('category').distinct().show() 

+--------+
|category|
+--------+
|altheism|
|baseball|
+--------+



* Seul deux catégories sont présentes

### (b)	Donner le nombre d’organisations différentes

In [22]:
df.select('organization').distinct().count() 

578

* En tout 578 organisations différentes

### (c) Suivant les champs extraits, donner d’autres statistiques descriptives

In [24]:
df.distinct().count() 

2000

* Nombre de documents : 2000

In [25]:
df.select('subject').distinct().count() 

762

* Le dataset contient 762 sujets différents (sur un total de 2000 documents)

# 4- Transformation du texte et clustering

### (a)	Découper les documents en listes de mots à l’aide de Tokenizer


In [27]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [28]:
tokenizer = Tokenizer(inputCol="body", outputCol="words")
wordsData = tokenizer.transform(df)

In [29]:
listWords = wordsData.select('words').collect()

In [30]:
wordsData.show()

+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|          newsGroups|category|lines|        organization|             subject|                date|                body|               words|
+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|alt.atheism,alt.a...|altheism|   29|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 29 Mar 1993 ...|Archive-name: ath...|[archive-name:, a...|
|alt.atheism,alt.a...|altheism|   64|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 5 Apr 1993 1...|Archive-name: ath...|[archive-name:, a...|
|         alt.atheism|altheism|    9|Technical Univers...|   Re: Gospel Dating|Mon, 5 Apr 1993 1...|In article <65974...|[in, article, <65...|
|alt.atheism,alt.p...|altheism|    2|Mantis Consultant...|Re: university vi...|Mon, 5 Apr 1993 1...|dmn@kepler.unh.ed...|[dmn@kepler.unh.e...|

In [31]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filteredWords")
wordsFiltered = remover.transform(wordsData)

In [32]:
wordsFiltered.show()

+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          newsGroups|category|lines|        organization|             subject|                date|                body|               words|       filteredWords|
+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|alt.atheism,alt.a...|altheism|   29|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 29 Mar 1993 ...|Archive-name: ath...|[archive-name:, a...|[archive-name:, a...|
|alt.atheism,alt.a...|altheism|   64|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 5 Apr 1993 1...|Archive-name: ath...|[archive-name:, a...|[archive-name:, a...|
|         alt.atheism|altheism|    9|Technical Univers...|   Re: Gospel Dating|Mon, 5 Apr 1993 1...|In article <65974...|[in, article, <65...|[article, <65974@...|
|alt.atheism,alt

### (b)	Créer une représentation vectorielle des documents à l’aide de HashingTF


In [33]:
hashingTF = HashingTF(inputCol="filteredWords", outputCol="features", numFeatures=20)
featurizedData = hashingTF.transform(wordsFiltered)

In [34]:
featurizedData.show()

+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          newsGroups|category|lines|        organization|             subject|                date|                body|               words|       filteredWords|            features|
+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|alt.atheism,alt.a...|altheism|   29|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 29 Mar 1993 ...|Archive-name: ath...|[archive-name:, a...|[archive-name:, a...|(20,[0,1,2,3,4,5,...|
|alt.atheism,alt.a...|altheism|   64|Mantis Consultant...|Alt.Atheism FAQ: ...|Mon, 5 Apr 1993 1...|Archive-name: ath...|[archive-name:, a...|[archive-name:, a...|(20,[0,1,2,3,4,5,...|
|         alt.atheism|altheism|    9|Technical Univers...|   Re: Gospel Dat

### (c) Utiliser l’algorithme KMeans avec un nombre du cluster égal à 2

In [36]:
trueCategory = featurizedData.select('category')

In [37]:
from pyspark.sql.functions import *
featuresData = featurizedData.select('features')
trueCategory = trueCategory.withColumn("category", when(col("category") == "altheism", 0).when(col("category") == "baseball",1))

In [38]:
df_trueCategory = trueCategory.select(F.col("category").alias("label"))
dataset = featuresData.join(df_trueCategory)

In [39]:
pandas_dataset_df = dataset.toPandas()

In [93]:
(X, X_test) = dataset.randomSplit([0.7, 0.3], seed = 100)

In [113]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(X)

### (d) Analyser les résultats et la qualité de la partition obtenue

In [115]:
predictions_kmeans = model.transform(X_test).withColumn("prediction",F.col("prediction").cast("double"))

* Accuracy

In [116]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_kmeans)

0.3350032426420757

L'accuracy obtenue est faible. Sachant qu'il existe 2 cluster, un classifieur naïf basé uniquement sur le hasard (sous l'hypothése i.i.d) arriverait à obtenir de meilleurs résultats.

* NMI

In [118]:
from sklearn.metrics.cluster import normalized_mutual_info_score
normalized_mutual_info_score(y_test, predictions_kmeans)

8.257279428633879e-07

La NMI est très proche de zéro, cela confirme le résulat précédent et indique un mauvais partionnement.

### (e) Comparer avec ce qu’on aurait obtenu avec l’implémentation de k-means du package scikit-learn.

In [68]:
from sklearn.model_selection import train_test_split
import numpy as np

series = pandas_dataset_df['features'].apply(lambda x : np.array(x.toArray())).to_numpy().reshape(-1,1)
features = np.apply_along_axis(lambda x : x[0], 1, series)
x_train, x_test, y_train, y_test = train_test_split(features, pandas_dataset_df['label'] ,test_size=0.3)



In [69]:
from sklearn.cluster import KMeans
sklearn_kmeans = KMeans(
    n_clusters=2, init='random',
    n_init=10, max_iter=300, 
    tol=1e-04, random_state=0
)
sklearn_kmeans_model = sklearn_kmeans.fit(x_train)

In [70]:
sklearn_kmeans_predictions = sklearn_kmeans_model.predict(x_test)

* Classification metrics

In [71]:
from sklearn.metrics import classification_report
print(classification_report(y_test, sklearn_kmeans_predictions))

              precision    recall  f1-score   support

           0       0.49      0.01      0.02    598837
           1       0.50      0.99      0.67    601163

    accuracy                           0.50   1200000
   macro avg       0.50      0.50      0.34   1200000
weighted avg       0.50      0.50      0.34   1200000



Les résulats obtenus refletent une classification similaire à celle qu'on obtiendrai en nous basant uniquement sur le hasard. Ces résultats sont cependant meilleurs que ceux résultant d'un kmeans implémenté par spark.

* NMI

In [111]:
normalized_mutual_info_score(y_test, sklearn_kmeans_predictions)

8.257279428633879e-07

La NMI est exactement la même que celle obtenue précédemment.

# 5- Implémentation de K-means unidimensionnel

### (a) Définir la fonction compute_centroids qui prend en argument deux RDD (points ,cluster_ids)

In [42]:
def compute_centroids(points,cluster_ids):
    #poinds et cluster_ids sont deux rdd
    tuple_cluster_points = cluster_ids.zip(points)
    sum_by_cluster_id = tuple_cluster_points.reduceByKey(lambda a,b: a+b)
    count_by_cluster_id = sc.parallelize(tuple_cluster_points.countByKey().items())
    tmp = sum_by_cluster_id.union(count_by_cluster_id)
    result = tmp.reduceByKey(lambda a,b: a/b)
    return result

### (b) Définir la fonction assign_clusters qui prend en argument deux RDDs (points,centroids) : 

In [43]:
def squared_distances(point,listMean):
    result = [0] * len(listMean)
    for i, value in enumerate(listMean):
        result[i] = (value - point) ** 2
    return result



In [44]:
import numpy as np
def assign_clusters(points,centroids):
    clusters = centroids.map(lambda a : a[0]).collect()
    means = centroids.map(lambda a : a[1]).collect() 
    distances = [squared_distances(point, means) for point in points.collect()]
    return sc.parallelize([np.argmin(distance) for distance in distances])

### (c) Implémenter l’étape d’initialisation et l’itération.

In [104]:
def initCentroids(data, numClusters):
    sample = sc.parallelize(data.sample(False, 0.1).limit(1))
    centroids = sample.map(lambda point : point[1][:-1])
    return centroids.zipWithIndex().map(lambda point : (point[1], point[0]))



def kmeans(data, numClusters, nb_iter=10):
    centroids = initCentroids(data, numClusters)
    iterations = 0
    while iterations != nb_iter:
        iterations += 1
        dataMinDistance = assign_clusters(data, centroids)
        newCentroids = compute_centroids(dataMinDistance)
        intraClusterDistances = squared_distances(dataMinDistance)

        centroids = sc.parallelize(newCentroids.collect())
    clusters = assign_clusters(data, centroids)
    return clusters

# 6- Spherical k-means et k-means multidimensionnel

### (a) Adapter l’implémentation précédente pour gérer le cas multidimensionnel


In [45]:
def compute_centroids_multi(points, cluster_ids):
    #poinds et cluster_ids sont deux rdd
    tuple_cluster_points = cluster_ids.zip(points)
    sum_by_cluster_id = tuple_cluster_points.reduceByKey(lambda a,b: np.array(a)+np.array(b))
    count_by_cluster_id = sc.parallelize(tuple_cluster_points.countByKey().items())
    tmp = sum_by_cluster_id.union(count_by_cluster_id)
    result = tmp.reduceByKey(lambda a,b: np.array(a)  / np.array(b))
    return result

In [46]:
def squared_distances_multi(point,listMean):
    result = [[0 for j in range(len(point))] for i in range(len(listMean))]
    for i, mean in enumerate(listMean):
        for j, coordPoint in enumerate(point):
            result[i] = (mean[j] - coordPoint) ** 2
    return result #liste des carrés des distances entre le point et les différentes moyennes des clusters, REVOIR SI C'EST ÇA ?

In [47]:
def assign_clusters_multi(points,centroids):
    clusters = centroids.map(lambda a : a[0]).collect()
    means = centroids.map(lambda a : a[1]).collect() #Quelle supposition a-t-on faite pour effectuer cette opération ? --> Que les valeurs sont ordonnées (cluster 0 puis cluster 1...)
    distances = [squared_distances(point, means) for point in points.collect()]
    return sc.parallelize([np.argmin(distance) for distance in distances])

In [120]:
def kmeans_multi(data, numClusters, nb_iter=10):
    centroids = initCentroids(data, numClusters)
    iterations = 0
    while iterations != nb_iter:
        iterations += 1
        dataMinDistance = assign_clusters_multi(data, centroids)
        newCentroids = compute_centroids_multi(dataMinDistance)
        intraClusterDistances = squared_distances_multi(dataMinDistance)

        centroids = sc.parallelize(newCentroids.collect())
    clusters = assign_clusters(data, centroids)
    return clusters

### (b)	Adapter l’implémentation précédente pour implémenter Spherical k-means


In [None]:
from scipy.spatial import distance
def cosine_distances_multi(point,listMean):
    result = [[0 for j in range(len(point))] for i in range(len(listMean))]
    for i, mean in enumerate(listMean):
        for j, coordPoint in enumerate(point):
            result[i] = distance.cosine(mean[j], coordPoint)
    return result 
def skmeans_assign_clusters_multi(points,centroids):
    clusters = centroids.map(lambda a : a[0]).collect()
    means = centroids.map(lambda a : a[1]).collect() #Quelle supposition a-t-on faite pour effectuer cette opération ? --> Que les valeurs sont ordonnées (cluster 0 puis cluster 1...)
    distances = [squared_distances(point, means) for point in points.collect()]
    return sc.parallelize([np.argmax(distance) for distance in distances])

def skmeans_multi(data, numClusters, nb_iter=10):
    centroids = initCentroids(data, numClusters)
    iterations = 0
    while iterations != nb_iter:
        iterations += 1
        dataMinDistance = skmeans_assign_clusters_multi(data, centroids)
        newCentroids = compute_centroids_multi(dataMinDistance)
        intraClusterDistances = cosine_distances_multi(dataMinDistance)

        centroids = sc.parallelize(newCentroids.collect())
    clusters = assign_clusters(data, centroids)
    return clusters

### (c) Analyser les résultats et comparer à l’implémentation de Spherical k-means du package Coclust

In [123]:
skmeans_implementation_labels = skmeans_multi(X,numClusters=2)
y_pred_skmeans = skmeans_implementation_labels.toPandas()

* Classification metrics

In [121]:
print(classification_report(y_train, skmeans_implementation_labels))

              precision    recall  f1-score   support

           0       0.50      0.41      0.45   1401163
           1       0.50      0.59      0.54   1398837

    accuracy                           0.50   2800000
   macro avg       0.50      0.50      0.50   2800000
weighted avg       0.50      0.50      0.50   2800000



Les résulats obtenus avec le spherical K-means sont plus équilibrés au niveau des metriques de recall et de précision. Cela a pour effet directe d'augmenter le f1-score. 
Ces résultats sont toutefois peu satisfaisants au regard des exigences généralement attendues dans le  pour ce type de traitement de données.

* NMI

In [122]:
normalized_mutual_info_score(y_train, skmeans_implementation_labels)

3.0664663067611274e-08

La NMI est quant à elle inférieure à celle obtenue en utilisant un kmeans classique. Ce qui suggérait un partionnement encore plus médiore.

#### Comparaison avec l’implémentation de Spherical k-means du package Coclust 

In [76]:
from coclust.clustering.spherical_kmeans import *

In [78]:
coclust_skmeans = SphericalKmeans(n_clusters=2, init=None, max_iter=20)

In [79]:
coclust_skmeans.fit(x_train)

 == New init == 
iteration: 0
2502695.2083473476
iteration: 1
2589355.9742346746
iteration: 2
2590001.15265664
iteration: 3
2590101.1862632902
iteration: 4
2590139.303510783
iteration: 5
2590145.6502014874
iteration: 6
2590146.5651609236
iteration: 7
2590146.8472422464
iteration: 8
2590147.3171468326
iteration: 9
2590148.0700637097
iteration: 10


* Classifcation metrics

In [86]:
from sklearn.metrics import classification_report
print(classification_report(y_train, coclust_skmeans.labels_))

              precision    recall  f1-score   support

           0       0.50      0.41      0.45   1401163
           1       0.50      0.59      0.54   1398837

    accuracy                           0.50   2800000
   macro avg       0.50      0.50      0.50   2800000
weighted avg       0.50      0.50      0.50   2800000



* NMI

In [119]:
normalized_mutual_info_score(y_train, coclust_skmeans.labels_)

3.0664663067611274e-08

Les perfomances obtenues sont très similaires, ce qui indique que notre travail d'implémentation de skmeans est correspond bien la formulation de l'algorithme.

# 7-  Autres Classifications

Dans cette partie, nous expérimentrons différentes approches de classification dans le but d'améliorer les performances obtenues précédemment.

## Clustering

### GMM

In [110]:
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(2).setSeed(442)
model_gmm = gmm.fit(X)
predictions_gmm = model_gmm.transform(X_test).withColumn("prediction",F.col("prediction").cast("double"))

In [114]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_gmm)

0.3328925536524998

## Bisecting k-means

In [117]:
from pyspark.ml.clustering import BisectingKMeans

bkm = BisectingKMeans().setK(2).setSeed(1)
model_bkm = bkm.fit(X)

# Make predictions
predictions_bkm = model_bkm.transform(X_test).withColumn("prediction",F.col("prediction").cast("double"))

In [118]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_bkm)

0.3396880201332832

### Classification supervisée

#### Random Forest

In [121]:
from pyspark.ml.classification import RandomForestClassifier

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# Train model.  This also runs the indexers.
model_rf = rf.fit(X)

# Make predictions.
predictions_rf = model_rf.transform(X_test)


In [122]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_rf)

0.4940638977620505

### MLP

In [146]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [10, 5, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model_mlp = trainer.fit(X)

# compute accuracy on the test set
result = model_mlp.transform(X_test)

In [122]:
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(str(evaluator.evaluate(predictionAndLabels)))

0.4940638977620505

### Gradient boosting

In [151]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)


# Train model.  This also runs the indexers.
model_gbt = gbt.fit(X)

# Make predictions.
predictions_gbt = model_gbt.transform(X_test)

In [152]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_gbt)

0.49182917809562376

## Bilan

* Les résulats obtenus par les différentes approches de classifications (supervisée ou non) sont relativements similaires.

* Aucune approche ne se démarque clairement pour classifier ce jeu de données.
* Un classifieur aléatoire sous l'hypothèse que les données sont i.i.d pourrait obtenir des performances similaires.