In [1]:
import sys.process._
%AddJar file:///home/user/Documents/RCP-216/myopennlp.jar

Starting download from file:///home/user/Documents/RCP-216/myopennlp.jar
Finished download of myopennlp.jar


In [2]:
//CONTEXTE DE TRAVAIL
import fllemmatizer.FLLemmatizer
import scala.collection.JavaConversions._
import scala.io.Source    

In [6]:

//LECTURE DU FICHIER D'ENTREE DE L'ECHANTILLON DE COMMENTAIRES EVALUES
val echantillon = spark.read.format("csv").option("header", "true").load("Data/echantillon_evalue.csv")

//CREATION DE LA TABLE ASSOCIEE A L'ECHANTILLON
echantillon.createOrReplaceTempView("echantillon")

//RE-STRUCTURATION DE L'ECHANTILLON D'APPRENTISSAGE POUR INTEGRER LA QUALITE DANS LE COMMENTAIRE
val commentaires_lus = spark.sql("select concat( qualite, '$ ', commentaire, ' ')  as msg from echantillon")

//CREATION DE LA TABLE DES COMMENTAIRES LUS
commentaires_lus.createOrReplaceTempView("commentaires_lus")

//LECTURE DU FICHIER D'ENTREE DES EVALUATIONS
val evaluationRDD = sc.textFile("Data/Evaluation_V4")

//LECTURE DU FICHIER DES EVALUATION PONDEREES
val evaluation_poids_brut = spark.read.option("delimiter", "#").csv("Data/Evaluation_V4").toDF("expression", "poids")

//REFORMATAGE DU CONTENU DE LA COLONNE "EXPRESSION" DU FICHIER DES EVALUATIONS PONDEREES
val evaluation_poids = evaluation_poids_brut.rdd.map(x => (x.getAs[String](0).replace("%","").replace(" ",""),x.getAs[String](1))).toDF("expression","poids")

//CREATION DE LA TABLE DES EVALUATIONS PONDEREES
evaluation_poids.createOrReplaceTempView("evaluation_poids")

//APPLICATION DU LEMMATISEUR FRANCAIS A CHAQUE MOT DE CHAQUE COMMENTAIRE DE L'ECHANTILLON, PUIS FILTRAGE
val evaluation_filtree :org.apache.spark.rdd.RDD[String] = evaluationRDD.mapPartitions(iter => {
                                        val lemmatizer = new FLLemmatizer("fr");
                                        iter.map{s => lemmatizer.lemmatize(s, true).filter(s =>s(1) == "NOUN"|| s(1) == "ADJ").map(s => s(2)).toString};
                                       })

//MISE AU FORMAT DATAFRAME APRES ELIMINATION DES CARACTERES INDESIRABLES
val evaluation_lemmatisee = evaluation_filtree.map(x => x.toString.replace("ArrayBuffer(","").replace(")","").replace(",","")).toDF("mot")
evaluation_lemmatisee.createOrReplaceTempView("evaluation_lemmatisee")
val mots_evaluation = spark.sql("select distinct mot from evaluation_lemmatisee where mot <> ''")
mots_evaluation.createOrReplaceTempView("Mots_evaluation")

//FILTRAGE DES COMMENTAIRES VIDES
val commentaires_filtres = spark.sql("select * from commentaires_lus where msg is not null")

//REFORMATAGE DES COMMENTAIRES LUS => TOUS LES CARACTERES SONT MIS EN LETTRES MINUSCULES
val commentaires = commentaires_filtres.rdd.map(x => x.getAs[String](0).toLowerCase).toDF("commentaire")

//DECOUPAGE DES COMMENTAIRES EN MOTS
val motsCommentaires = commentaires.select("commentaire").rdd.map(x => x.getAs[String](0)
                                                                        .replace("positfs$","")
                                                                        .replace("negatfs$","")
                                                                        .replace(",","")
                                                                        .replace(";",""))
                                                         .flatMap(line => line.split(" ")).toDS

//COMPTAGE DES MOTS DANS L'ENSEMBLE DES COMMENTAIRES
val motsOccurrences = motsCommentaires.groupByKey(identity).count().toDF("mot", "frequence")

//CREATION DE LA TABLE SQL DES MOTS
motsOccurrences.createOrReplaceTempView("Mots_echantillon")

//VISUALISATION DE LA TABLE DES MOTS
motsOccurrences.show(10)

//STATISTIQUES ELEMENTAIRES
motsOccurrences.select("frequence").describe().show

//LES 10 MOTS SIGNIFICATIFS LES PLUS COURANTS
spark.sql("select mot, frequence from Mots_echantillon where mot <> '' and length(mot) > 2 order by frequence desc").show(10)

//JOINTURE ENTRE ECHANTILLON ET EVALUATION
val comptage_mots = spark.sql("select a.mot, b.frequence from Mots_evaluation a inner join Mots_echantillon b on b.mot = a.mot where length(a.mot) > 3 and a.mot <> 'point' order by frequence desc")
comptage_mots.createOrReplaceTempView("comptage_mots")

//JOINTURE POUR RAJOUTER LA QUALITE
val comptage_mots_poids = spark.sql("select distinct a.frequence, a.mot, b.poids from comptage_mots a , evaluation_poids b where a.mot = b.expression order by a.frequence desc")
comptage_mots_poids.createOrReplaceTempView("comptage_mots_poids")

val comptage_mots_qualite = spark.sql("select cast(5*sqrt(frequence) as Int) as frequence, mot, case when poids > 0 then 1 else 0 end as qualite from comptage_mots_poids")

val comptage_mots_qualiteRDD = comptage_mots_qualite.rdd.map(r => r.toString).map(r => r.replace(","," "))

val nuage_mots = comptage_mots_qualiteRDD.toDF.rdd.map(x => x.getAs[String](0).replace("[","").replace("]","")).toDF

//SAUVEGARDE DE LA TABLE
nuage_mots.rdd.map(x => x.toString.replace("[","").replace("]","")).coalesce(1).saveAsTextFile("nuage_mots")

+---------+---------+
|      mot|frequence|
+---------+---------+
|     recu|        6|
|    chien|        5|
|      ces|      148|
| rehaussé|        1|
|  agencée|       11|
|correpond|        2|
|    those|        2|
|   voyage|      129|
|  sorties|       26|
| d'emblée|        3|
+---------+---------+
only showing top 10 rows

|summary|         frequence|
+-------+------------------+
|  count|             19353|
|   mean|26.163643879501887|
| stddev|322.48316214763656|
|    min|                 1|
|    max|             19042|
+-------+------------------+

+-----------+---------+                                                         
|        mot|frequence|
+-----------+---------+
|       très|    13886|
|        est|    10351|
|   positif$|     8640|
|       pour|     7157|
|       nous|     6912|
|       bien|     6734|
|        pas|     5249|
|        les|     5153|
|       dans|     4970|
|appartement|     4217|
+-----------+---------+
only showing top 10 rows

[INFO]: Load d

echantillon = [Id_comment: string, commentaire: string ... 1 more field]
commentaires_lus = [msg: string]
evaluationRDD = Data/Evaluation_V4 MapPartitionsRDD[39] at textFile at <console>:61
evaluation_poids_brut = [expression: string, poids: string]
evaluation_poids = [expression: string, poids: string]
evaluation_filtree = MapPartitionsRDD[53] at mapPartitions at <console>:73
evaluation_lemmatisee = [mot: string]
mots_evaluation = [mot: string]
commentaires_filtres = [msg: string]


lastException: Throwable = null
commentaires: org....


[msg: string]