# CNAM UASB03 - CERTIFICATION ANALYSE DE DONNEES MASSIVES
## Projet d'analyse de sentiment sur les commentaires Airbnb en français

***
Notebook Scala de modélisation à partir des données vectorisées précedemment constituées.

4 modèles sont testées sur les 4 vectorisations précédentes :
- GradientBoostedTrees
- SVMWithSGD
- LogisticRegressionWithLBFGS
- NaiveBayes

sur les 4 vectorisations précédentes :
- HashingTF
- Word2Vec Corpus 1
- Word2Vec Corpus 2
- CountVectorizer

NB : sauf NaiveBayes qui n'est pas testé sur les Word2Vec



-  ####  <font color=blue>Gradient Boosting sur la vectorisation Hashing TF </font> 

In [1]:
//CONTEXTE DE TRAVAIL
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

In [2]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_HTF")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION DU MODELE
val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.setNumIterations(20) //number of passes over our training data
boostingStrategy.treeStrategy.setNumClasses(2) //We have two output classes: happy and sad
boostingStrategy.treeStrategy.setMaxDepth(5)

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val modele = GradientBoostedTrees.train(trainingData, boostingStrategy)
// Sauvegarde du modèle
modele.save(sc, "modele/GBT_HTF")

//EVALUATION DU MODELE
//pour le jeu de validation
var labelAndPredsValid = validationData.map { point =>
  val prediction = modele.predict(point.features)
  Tuple2(point.label, prediction)
}


[Stage 282:>                                                        (0 + 4) / 4]

echantillon_LIBSVM = MapPartitionsRDD[6] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[7] at randomSplit at <console>:37, MapPartitionsRDD[8] at randomSplit at <console>:37)
trainingData = MapPartitionsRDD[7] at randomSplit at <console>:37
validationData = MapPartitionsRDD[8] at randomSplit at <console>:37
boostingStrategy = BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@1aafffc7,org.apache.spark....


BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@1aafffc7,org.apache.spark.mllib.tree.loss.LogLoss$@38c697b2,20,0.1,0.001)

In [3]:
// CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_GBT_HTF = new BinaryClassificationMetrics(labelAndPredsValid)
val auPRC_GBT_HTF = 100 * (metrics_GBT_HTF.areaUnderPR() - metrics_GBT_HTF.areaUnderPR() % 0.0001) 
val auROC_GBT_HTF = 100 * (metrics_GBT_HTF.areaUnderROC() - metrics_GBT_HTF.areaUnderROC() % 0.0001) 

metrics_GBT_HTF = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@62b8daba
auPRC_GBT_HTF = 98.85000000000001
auROC_GBT_HTF = 91.45


91.45

-  ####  <font color=blue>Gradient Boosting sur la vectorisation Word2Vec Corpus 1 </font> 

In [4]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Word2vec")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION DU MODELE

val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.setNumIterations(20) //number of passes over our training data
boostingStrategy.treeStrategy.setNumClasses(2) //We have two output classes: happy and sad
boostingStrategy.treeStrategy.setMaxDepth(5)

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val modele = GradientBoostedTrees.train(trainingData, boostingStrategy)
// Sauvegarde du modèle
modele.save(sc, "modele/GBT_W2V")

//EVALUATION DU MODELE

//pour le jeu de validation
var labelAndPredsValid = validationData.map { point =>
  val prediction = modele.predict(point.features)
  Tuple2(point.label, prediction)
}


[Stage 582:>                                                        (0 + 2) / 2]

echantillon_LIBSVM = MapPartitionsRDD[594] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[595] at randomSplit at <console>:44, MapPartitionsRDD[596] at randomSplit at <console>:44)
trainingData = MapPartitionsRDD[595] at randomSplit at <console>:44
validationData = MapPartitionsRDD[596] at randomSplit at <console>:44
boostingStrategy = BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@5ae26811,org.apa...


BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@5ae26811,org.apache.spark.mllib.tree.loss.LogLoss$@38c697b2,20,0.1,0.001)

In [5]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_GBT_W2V = new BinaryClassificationMetrics(labelAndPredsValid)
val auPRC_GBT_W2V = 100 * (metrics_GBT_W2V.areaUnderPR() - metrics_GBT_W2V.areaUnderPR() % 0.0001) 
val auROC_GBT_W2V = 100 * (metrics_GBT_W2V.areaUnderROC() - metrics_GBT_W2V.areaUnderROC() % 0.0001) 

metrics_GBT_W2V = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@2cb445f9
auPRC_GBT_W2V = 97.72000000000001
auROC_GBT_W2V = 85.31


85.31

-  ####  <font color=blue>Gradient Boosting sur la vectorisation Word2Vec Corpus 2 </font> 

In [6]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Word2vecC2")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION DU MODELE

val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.setNumIterations(20) //number of passes over our training data
boostingStrategy.treeStrategy.setNumClasses(2) //We have two output classes: happy and sad
boostingStrategy.treeStrategy.setMaxDepth(5)

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val modele = GradientBoostedTrees.train(trainingData, boostingStrategy)
// Sauvegarde du modèle
modele.save(sc, "modele/GBT_W2VC2")

//EVALUATION DU MODELE

//pour le jeu de validation
var labelAndPredsValid = validationData.map { point =>
  val prediction = modele.predict(point.features)
  Tuple2(point.label, prediction)
}

[Stage 894:>                                                        (0 + 2) / 2]

echantillon_LIBSVM = MapPartitionsRDD[1182] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[1183] at randomSplit at <console>:44, MapPartitionsRDD[1184] at randomSplit at <console>:44)
trainingData = MapPartitionsRDD[1183] at randomSplit at <console>:44
validationData = MapPartitionsRDD[1184] at randomSplit at <console>:44
boostingStrategy = BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@c9d584c,org...


BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@c9d584c,org.apache.spark.mllib.tree.loss.LogLoss$@38c697b2,20,0.1,0.001)

In [7]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_GBT_W2V_C2 = new BinaryClassificationMetrics(labelAndPredsValid)
val auPRC_GBT_W2V_C2 = 100 * (metrics_GBT_W2V_C2.areaUnderPR() - metrics_GBT_W2V_C2.areaUnderPR() % 0.0001) 
val auROC_GBT_W2V_C2 = 100 * (metrics_GBT_W2V_C2.areaUnderROC() - metrics_GBT_W2V_C2.areaUnderROC() % 0.0001) 

metrics_GBT_W2V_C2 = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@339dcbd9
auPRC_GBT_W2V_C2 = 98.12
auROC_GBT_W2V_C2 = 89.42


89.42

-  ####  <font color=blue>Gradient Boosting sur la vectorisation CountVectorizer </font> 

In [8]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Countvectorizer")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION DU MODELE

val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.setNumIterations(20) //number of passes over our training data
boostingStrategy.treeStrategy.setNumClasses(2) //We have two output classes: happy and sad
boostingStrategy.treeStrategy.setMaxDepth(5)

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val modele = GradientBoostedTrees.train(trainingData, boostingStrategy)
// Sauvegarde du modèle
modele.save(sc, "modele/GBT_CV")

//EVALUATION DU MODELE
//pour le jeu de validation
var labelAndPredsValid = validationData.map { point =>
  val prediction = modele.predict(point.features)
  Tuple2(point.label, prediction)
}




echantillon_LIBSVM = MapPartitionsRDD[1770] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[1771] at randomSplit at <console>:44, MapPartitionsRDD[1772] at randomSplit at <console>:44)
trainingData = MapPartitionsRDD[1771] at randomSplit at <console>:44
validationData = MapPartitionsRDD[1772] at randomSplit at <console>:44
boostingStrategy = BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@30ffe51f,or...


BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@30ffe51f,org.apache.spark.mllib.tree.loss.LogLoss$@38c697b2,20,0.1,0.001)

In [9]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_GBT_CV = new BinaryClassificationMetrics(labelAndPredsValid)
val auPRC_GBT_CV = 100 * (metrics_GBT_CV.areaUnderPR() - metrics_GBT_CV.areaUnderPR() % 0.0001) 
val auROC_GBT_CV = 100 * (metrics_GBT_CV.areaUnderROC() - metrics_GBT_CV.areaUnderROC() % 0.0001) 

metrics_GBT_CV = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@ea06793
auPRC_GBT_CV = 98.91000000000001
auROC_GBT_CV = 92.08000000000001


92.08000000000001

-  ####  <font color=blue>SVM sur la vectorisation Hashing TF </font> 

In [10]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

In [11]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_HTF")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val numIterations = 100
val model = SVMWithSGD.train(trainingData, numIterations)
// Sauvegarde du modèle
model.save(sc, "modele/SVM_HTF")

//EVALUATION DU MODELE
// Clear the default threshold.
model.clearThreshold()
// Prédiction sur le jeu de validation
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}


echantillon_LIBSVM = MapPartitionsRDD[2358] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[2359] at randomSplit at <console>:47, MapPartitionsRDD[2360] at randomSplit at <console>:47)
trainingData = MapPartitionsRDD[2359] at randomSplit at <console>:47
validationData = MapPartitionsRDD[2360] at randomSplit at <console>:47
numIterations = 100
model = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures ...


org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 12000, numClasses = 2, threshold = None

In [12]:
// CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_SVM_HTF = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_SVM_HTF = 100 * (metrics_SVM_HTF.areaUnderPR() - metrics_SVM_HTF.areaUnderPR() % 0.0001) 
val auROC_SVM_HTF = 100 * (metrics_SVM_HTF.areaUnderROC() - metrics_SVM_HTF.areaUnderROC() % 0.0001) 

metrics_SVM_HTF = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@57645e77
auPRC_SVM_HTF = 99.36
auROC_SVM_HTF = 97.18


97.18

-  ####  <font color=blue>SVM sur la vectorisation Word2Vec Corpus 1 </font> 

In [13]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Word2vec")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val numIterations = 100
val model = SVMWithSGD.train(trainingData, numIterations)
// Sauvegarde du modèle
model.save(sc, "modele/SVM_W2V")
model.clearThreshold()
// Prédiction sur le jeu de validation
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}


[Stage 1384:>                                                       (0 + 2) / 2]

echantillon_LIBSVM = MapPartitionsRDD[2609] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[2610] at randomSplit at <console>:49, MapPartitionsRDD[2611] at randomSplit at <console>:49)
trainingData = MapPartitionsRDD[2610] at randomSplit at <console>:49
validationData = MapPartitionsRDD[2611] at randomSplit at <console>:49
numIterations = 100
model = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures ...


org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 100, numClasses = 2, threshold = None

In [14]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_SVM_W2V = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_SVM_W2V = 100 * (metrics_SVM_W2V.areaUnderPR() - metrics_SVM_W2V.areaUnderPR() % 0.0001) 
val auROC_SVM_W2V = 100 * (metrics_SVM_W2V.areaUnderROC() - metrics_SVM_W2V.areaUnderROC() % 0.0001) 

metrics_SVM_W2V = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@75c815e1
auPRC_SVM_W2V = 95.57
auROC_SVM_W2V = 82.77


82.77

-  ####  <font color=blue>SVM sur la vectorisation Word2Vec Corpus 2 </font> 

In [15]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Word2vecC2")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val numIterations = 100
val model = SVMWithSGD.train(trainingData, numIterations)
// Sauvegarde du modèle
model.save(sc, "modele/SVM_W2VC2")

//EVALUATION DU MODELE
// Clear the default threshold.
model.clearThreshold()
// Prédiction sur le jeu de validation
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}


echantillon_LIBSVM = MapPartitionsRDD[2860] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[2861] at randomSplit at <console>:49, MapPartitionsRDD[2862] at randomSplit at <console>:49)
trainingData = MapPartitionsRDD[2861] at randomSplit at <console>:49
validationData = MapPartitionsRDD[2862] at randomSplit at <console>:49
numIterations = 100
model = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures ...


org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 100, numClasses = 2, threshold = None

In [16]:
// CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_SVM_W2V_C2 = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_SVM_W2V_C2 = 100 * (metrics_SVM_W2V_C2.areaUnderPR() - metrics_SVM_W2V_C2.areaUnderPR() % 0.0001) 
val auROC_SVM_W2V_C2 = 100 * (metrics_SVM_W2V_C2.areaUnderROC() - metrics_SVM_W2V_C2.areaUnderROC() % 0.0001) 

metrics_SVM_W2V_C2 = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@12ffbddc
auPRC_SVM_W2V_C2 = 99.14
auROC_SVM_W2V_C2 = 96.28000000000002


96.28000000000002

-  ####  <font color=blue>SVM sur la vectorisation CountVectorizer </font> 

In [17]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Countvectorizer")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val numIterations = 100
val model = SVMWithSGD.train(trainingData, numIterations)

// Sauvegarde du modèle
model.save(sc, "modele/SVM_CV")

//EVALUATION DU MODELE
// Clear the default threshold.
model.clearThreshold()
// Prédiction sur le jeu de validation
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}

echantillon_LIBSVM = MapPartitionsRDD[3111] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[3112] at randomSplit at <console>:49, MapPartitionsRDD[3113] at randomSplit at <console>:49)
trainingData = MapPartitionsRDD[3112] at randomSplit at <console>:49
validationData = MapPartitionsRDD[3113] at randomSplit at <console>:49
numIterations = 100
model = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures ...


org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 3444, numClasses = 2, threshold = None

In [18]:
// CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_SVM_CV = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_SVM_CV = 100 * (metrics_SVM_CV.areaUnderPR() - metrics_SVM_CV.areaUnderPR() % 0.0001) 
val auROC_SVM_CV = 100 * (metrics_SVM_CV.areaUnderROC() - metrics_SVM_CV.areaUnderROC() % 0.0001) 

metrics_SVM_CV = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@2114ccb1
auPRC_SVM_CV = 99.5
auROC_SVM_CV = 97.87


97.87

 -  ####  <font color=blue>Régression logistique sur la vectorisation HashingTF </font> 

In [19]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

In [20]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_HTF")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION ET APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(2)
  .run(trainingData)

// Sauvegarde du modèle
model.save(sc, "modele/LGB_HTF")

//APPLICATION DU MODELE AU JEU DE VALIDATION
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}

echantillon_LIBSVM = MapPartitionsRDD[3362] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[3363] at randomSplit at <console>:54, MapPartitionsRDD[3364] at randomSplit at <console>:54)
trainingData = MapPartitionsRDD[3363] at randomSplit at <console>:54
validationData = MapPartitionsRDD[3364] at randomSplit at <console>:54
model = org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeat...


org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeatures = 12000, numClasses = 2, threshold = 0.5

In [21]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_LGB_HTF = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_LGB_HTF = 100 * (metrics_LGB_HTF.areaUnderPR() - metrics_LGB_HTF.areaUnderPR() % 0.0001) 
val auROC_LGB_HTF = 100 * (metrics_LGB_HTF.areaUnderROC() - metrics_LGB_HTF.areaUnderROC() % 0.0001) 

metrics_LGB_HTF = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@7547b107
auPRC_LGB_HTF = 97.46000000000001
auROC_LGB_HTF = 87.69


87.69

 -  ####  <font color=blue>Régression logistique sur la vectorisation Word2Vec Corpus 1 </font> 

In [22]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Word2vec")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION ET APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(2)
  .run(trainingData)

// Sauvegarde du modèle
model.save(sc, "modele/LGB_W2V")

//APPLICATION DU MODELE AU JEU DE VALIDATION
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}

[Stage 1873:>                                                       (0 + 2) / 2]

echantillon_LIBSVM = MapPartitionsRDD[3469] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[3470] at randomSplit at <console>:53, MapPartitionsRDD[3471] at randomSplit at <console>:53)
trainingData = MapPartitionsRDD[3470] at randomSplit at <console>:53
validationData = MapPartitionsRDD[3471] at randomSplit at <console>:53
model = org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeat...


org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeatures = 100, numClasses = 2, threshold = 0.5

In [23]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_LGB_W2V = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_LGB_W2V = 100 * (metrics_LGB_W2V.areaUnderPR() - metrics_LGB_W2V.areaUnderPR() % 0.0001) 
val auROC_LGB_W2V = 100 * (metrics_LGB_W2V.areaUnderROC() - metrics_LGB_W2V.areaUnderROC() % 0.0001) 

metrics_LGB_W2V = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@54e52df6
auPRC_LGB_W2V = 97.51
auROC_LGB_W2V = 87.79


87.79

 -  ####  <font color=blue>Régression logistique sur la vectorisation Word2Vec Corpus 2 </font> 

In [24]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Word2vecC2")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION ET APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(2)
  .run(trainingData)

// Sauvegarde du modèle
model.save(sc, "modele/LGB_W2VC2")

//APPLICATION DU MODELE AU JEU DE VALIDATION
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}


echantillon_LIBSVM = MapPartitionsRDD[3618] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[3619] at randomSplit at <console>:53, MapPartitionsRDD[3620] at randomSplit at <console>:53)
trainingData = MapPartitionsRDD[3619] at randomSplit at <console>:53
validationData = MapPartitionsRDD[3620] at randomSplit at <console>:53
model = org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeat...


org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeatures = 100, numClasses = 2, threshold = 0.5

In [25]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_LGB_W2V_C2 = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_LGB_W2V_C2 = 100 * (metrics_LGB_W2V_C2.areaUnderPR() - metrics_LGB_W2V_C2.areaUnderPR() % 0.0001) 
val auROC_LGB_W2V_C2 = 100 * (metrics_LGB_W2V_C2.areaUnderROC() - metrics_LGB_W2V_C2.areaUnderROC() % 0.0001) 

metrics_LGB_W2V_C2 = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@4ff30623
auPRC_LGB_W2V_C2 = 98.38
auROC_LGB_W2V_C2 = 91.95


91.95

 -  ####  <font color=blue>Régression logistique sur la vectorisation CountVectorizer </font> 

In [26]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Countvectorizer")

//PARTAGE DE L'ECHANTILLON EN JEUX D'APPRENTISSAGE ET DE VALIDATION
val splits = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))
val (trainingData, validationData) = (splits(0), splits(1))

//CONSTRUCTION ET APPLICATION DU MODELE AU JEU D'APPRENTISSAGE
val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(2)
  .run(trainingData)
// Sauvegarde du modèle
model.save(sc, "modele/LGB_CV")

//APPLICATION DU MODELE AU JEU DE VALIDATION
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}

echantillon_LIBSVM = MapPartitionsRDD[3752] at map at MLUtils.scala:84
splits = Array(MapPartitionsRDD[3753] at randomSplit at <console>:53, MapPartitionsRDD[3754] at randomSplit at <console>:53)
trainingData = MapPartitionsRDD[3753] at randomSplit at <console>:53
validationData = MapPartitionsRDD[3754] at randomSplit at <console>:53
model = org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeat...


org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeatures = 3444, numClasses = 2, threshold = 0.5

In [27]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_LGB_CV = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_LGB_CV = 100 * (metrics_LGB_CV.areaUnderPR() - metrics_LGB_CV.areaUnderPR() % 0.0001) 
val auROC_LGB_CV = 100 * (metrics_LGB_CV.areaUnderROC() - metrics_LGB_CV.areaUnderROC() % 0.0001) 

metrics_LGB_CV = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@49d5897d
auPRC_LGB_CV = 97.81
auROC_LGB_CV = 89.13000000000001


89.13000000000001

 -  ####  <font color=blue>Naïve Bayes sur la vectorisation HashingTF </font> 
 http://www.ijettcs.org/Volume6Issue5/IJETTCS-2017-08-31-7.pdf

In [28]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics


In [34]:

//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM :RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_HTF")

// Split data into training (70%) and test (30%).
val Array(trainingData, validationData) = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))

val model = NaiveBayes.train(trainingData, lambda = 1.0, modelType = "multinomial")

// Sauvegarde du modèle
model.save(sc, "modele/ANB_HTF")

//APPLICATION DU MODELE AU JEU DE VALIDATION
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}
//val predictionAndLabel = test.map(p => (modele.predict(p.features), p.label))

echantillon_LIBSVM = MapPartitionsRDD[3897] at map at MLUtils.scala:84
trainingData = MapPartitionsRDD[3898] at randomSplit at <console>:58
validationData = MapPartitionsRDD[3899] at randomSplit at <console>:58
model = org.apache.spark.mllib.classification.NaiveBayesModel@193002d5
predictionAndLabels = MapPartitionsRDD[3922] at map at <console>:66


MapPartitionsRDD[3922] at map at <console>:66

In [35]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_ANB_HTF = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_ANB_HTF = 100 * (metrics_ANB_HTF.areaUnderPR() - metrics_ANB_HTF.areaUnderPR() % 0.0001) 
val auROC_ANB_HTF = 100 * (metrics_ANB_HTF.areaUnderROC() - metrics_ANB_HTF.areaUnderROC() % 0.0001) 

metrics_ANB_HTF = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@27a6d794
auPRC_ANB_HTF = 98.55000000000001
auROC_ANB_HTF = 92.28


92.28

 -  ####  <font color=blue>Naïve Bayes sur la vectorisation CountVectorizer </font> 


In [36]:
//LECTURE D'UN FICHIER AU FORMAT LIBSVM
val echantillon_LIBSVM :RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "Data/Vecto_Countvectorizer")

// Split data into training (70%) and test (30%).
val Array(trainingData, validationData) = echantillon_LIBSVM.randomSplit(Array(0.7, 0.3))

val model = NaiveBayes.train(trainingData, lambda = 1.0, modelType = "multinomial")

// Sauvegarde du modèle
model.save(sc, "modele/ANB_CV")

//APPLICATION DU MODELE AU JEU DE VALIDATION
val predictionAndLabels = validationData.map { case LabeledPoint(label, features) =>
  val prediction = model.predict(features)
  (prediction, label)
}

echantillon_LIBSVM = MapPartitionsRDD[3959] at map at MLUtils.scala:84
trainingData = MapPartitionsRDD[3960] at randomSplit at <console>:57
validationData = MapPartitionsRDD[3961] at randomSplit at <console>:57
model = org.apache.spark.mllib.classification.NaiveBayesModel@71c12416
predictionAndLabels = MapPartitionsRDD[3984] at map at <console>:65


MapPartitionsRDD[3984] at map at <console>:65

In [37]:
//CALCUL DES INDICATEURS DE PERFORMANCE
val metrics_ANB_CV = new BinaryClassificationMetrics(predictionAndLabels)
val auPRC_ANB_CV = 100 * (metrics_ANB_CV.areaUnderPR() - metrics_ANB_CV.areaUnderPR() % 0.0001) 
val auROC_ANB_CV = 100 * (metrics_ANB_CV.areaUnderROC() - metrics_ANB_CV.areaUnderROC() % 0.0001) 

metrics_ANB_CV = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@489b7b6b
auPRC_ANB_CV = 98.70000000000002
auROC_ANB_CV = 93.15


93.15

 -  ####  <font color=blue>Synthèse des métriques </font> 

In [38]:
//Affichage des métriques
println("Modèle GRADIENT BOOSTING - Vectorisation HASHINGTF")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_GBT_HTF %")
println(s"Area under ROC = $auROC_GBT_HTF %")
println("Modèle GRADIENT BOOSTING - Vectorisation WORD2VEC sur corpus wikipédia")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_GBT_W2V %")
println(s"Area under ROC = $auROC_GBT_W2V %")
println("Modèle GRADIENT BOOSTING - Vectorisation WORD2VEC sur corpus commentaire")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_GBT_W2V_C2 %")
println(s"Area under ROC = $auROC_GBT_W2V_C2 %")
println("Modèle GRADIENT BOOSTING - Vectorisation COUNTVECTORIZER")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_GBT_CV %")
println(s"Area under ROC = $auROC_GBT_CV %")
println("Modèle SVM - vectorisation HASHINGTF")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_SVM_HTF %")
println(s"Area under ROC = $auROC_SVM_HTF %")
println("Modèle SVM - vectorisation WORD2VEC sur corpus wikipédia")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_SVM_W2V %")
println(s"Area under ROC = $auROC_SVM_W2V %")
println("Modèle SVM - vectorisation WORD2VEC sur corpus commentaire")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_SVM_W2V_C2 %")
println(s"Area under ROC = $auROC_SVM_W2V_C2 %")
println("Modèle SVM - vectorisation COUNTVECTORIZER")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_SVM_CV %")
println(s"Area under ROC = $auROC_SVM_CV %")
println("Modèle REGRESSION LOGISTIQUE - vectorisation HASHINGTF")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_LGB_HTF %")
println(s"Area under ROC = $auROC_LGB_HTF %")
println("Modèle REGRESSION LOGISTIQUE - vectorisation WORD2VEC sur corpus wikipédia")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_LGB_W2V %")
println(s"Area under ROC = $auROC_LGB_W2V %")
println("Modèle REGRESSION LOGISTIQUE - vectorisation WORD2VEC sur corpus commentaire")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_LGB_W2V_C2 %")
println(s"Area under ROC = $auROC_LGB_W2V_C2 %")
println("Modèle REGRESSION LOGISTIQUE - vectorisation COUNTVECTORIZER")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_LGB_CV %")
println(s"Area under ROC = $auROC_LGB_CV %")
println("Modèle NAIVE BAYES - vectorisation HASHINGTF")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_ANB_HTF %")
println(s"Area under ROC = $auROC_ANB_HTF %")
println("Modèle NAIVE BAYES - vectorisation COUNTVECTORIZER")
println("============================================================")
println(s"Area under precision-recall curve = $auPRC_ANB_CV %")
println(s"Area under ROC = $auROC_ANB_CV %")


Modèle GRADIENT BOOSTING - Vectorisation HASHINGTF
Area under precision-recall curve = 98.85000000000001 %
Area under ROC = 91.45 %
Modèle GRADIENT BOOSTING - Vectorisation WORD2VEC sur corpus wikipédia
Area under precision-recall curve = 97.72000000000001 %
Area under ROC = 85.31 %
Modèle GRADIENT BOOSTING - Vectorisation WORD2VEC sur corpus commentaire
Area under precision-recall curve = 98.12 %
Area under ROC = 89.42 %
Modèle GRADIENT BOOSTING - Vectorisation COUNTVECTORIZER
Area under precision-recall curve = 98.91000000000001 %
Area under ROC = 92.08000000000001 %
Modèle SVM - vectorisation HASHINGTF
Area under precision-recall curve = 99.36 %
Area under ROC = 97.18 %
Modèle SVM - vectorisation WORD2VEC sur corpus wikipédia
Area under precision-recall curve = 95.57 %
Area under ROC = 82.77 %
Modèle SVM - vectorisation WORD2VEC sur corpus commentaire
Area under precision-recall curve = 99.14 %
Area under ROC = 96.28000000000002 %
Modèle SVM - vectorisation COUNTVECTORIZER
Area unde