In [1]:
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseVector, SparseVector}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.util.KMeansDataGenerator

In [2]:
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import org.apache.spark.mllib.linalg.{Vector => SparkVector}
def toBreeze(v:SparkVector) = BV(v.toArray)
def fromBreeze(bv:BV[Double]) = Vectors.dense(bv.toArray)
def add(v1:SparkVector, v2:SparkVector) = fromBreeze(toBreeze(v1) + toBreeze(v2))
def scalarMultiply(a:Double, v:SparkVector) = fromBreeze(a * toBreeze(v))

import breeze.linalg.{DenseVector=>BDV, SparseVector=>BSV, Vector=>BV}
import org.apache.spark.mllib.linalg.{Vector=>SparkVector}
toBreeze: (v: org.apache.spark.mllib.linalg.Vector)breeze.linalg.Vector[Double]
fromBreeze: (bv: breeze.linalg.Vector[Double])org.apache.spark.mllib.linalg.Vector
add: (v1: org.apache.spark.mllib.linalg.Vector, v2: org.apache.spark.mllib.linalg.Vector)org.apache.spark.mllib.linalg.Vector
scalarMultiply: (a: Double, v: org.apache.spark.mllib.linalg.Vector)org.apache.spark.mllib.linalg.Vector


In [3]:
// lire les stop words
import scala.io.Source
val stopWords = Source.fromFile("/home/user/Documents/UASB03/AirBnBStopwords-fr.txt").getLines.toSet

// transmettre les stop words aux noeuds de calcul
val bStopWords = sc.broadcast(stopWords)

// lire le Word2VecModel
val w2vModel = Word2VecModel.load(sc, "modele/Word2VecFR")

// obtenir une Map[String, Array[Float]] sérializable
//   mapValues seul ne retourne pas une map sérializable (SI-7005)
val vectors = w2vModel.getVectors.mapValues(vv => Vectors.dense(vv.map(_.toDouble))).map(identity)

// transmettre la map aux noeuds de calcul
val bVectors = sc.broadcast(vectors)



stopWords = Set(pres, afin, est, duquel, celui-là, dos, precisement, e, se, seras, combien, differents, tiens, le, tellement, aient, soixante, des, directe, nommés, desquels, s, hou, quatre, tout, ceux-ci, encore, chez, unes, aurons, peux, premier, relative, votre, auras, quant, bigre, x, onze, laquelle, ollé, merci, eusse, toute, du, fais, vous-mêmes, entre, sapristi, dix, avec, serez, était, cinquantième, moi, peuvent, ta, tien, parfois, cher, fut, vôtres, suis, nôtres, tel, remarquable, mon, clic, bat, ho, parmi, dans, dernier, dès, oh, différentes, hein, force, lès, soient, façon, neanmoins, clac, celles, allons, reste, hem, parce, treize, êtes, ayant, juste, nouveaux, sois, toujours, olé, tardive, ouste, vé, faites, eu,...


Set(pres, afin, est, duquel, celui-là, dos, precisement, e, se, seras, combien, differents, tiens, le, tellement, aient, soixante, des, directe, nommés, desquels, s, hou, quatre, tout, ceux-ci, encore, chez, unes, aurons, peux, premier, relative, votre, auras, quant, bigre, x, onze, laquelle, ollé, merci, eusse, toute, du, fais, vous-mêmes, entre, sapristi, dix, avec, serez, était, cinquantième, moi, peuvent, ta, tien, parfois, cher, fut, vôtres, suis, nôtres, tel, remarquable, mon, clic, bat, ho, parmi, dans, dernier, dès, oh, différentes, hein, force, lès, soient, façon, neanmoins, clac, celles, allons, reste, hem, parce, treize, êtes, ayant, juste, nouveaux, sois, toujours, olé, tardive, ouste, vé, faites, eu, relativement, comparables, pièce, quelque, dit, floc, pff, n, dits, vos, importe, eus, exactement, tu, cinq, notamment, delà, eûmes, voient, quels, celui, qui, suivante, celà, jusque, restrictif, celle, mine, seraient, cette, ça, toutefois, autre, mes, different, diverses, toi

In [4]:
// taille par défaut des vecteurs Word2Vec
val vectSize = 100

// lecture du fichier de tweets dans un RDD (item = ligne)
val sentences = sc.textFile("/home/user/Documents/UASB03/comment_fr_filtre.csv")

// calcul des représentations Word2Vec des tweets
val sent2vec = sentences.filter(sentence => sentence.length >= 1)
    .map(sentence => sentence.toLowerCase.split("\\W+"))
    .map(wordSeq => {
        var vSum = Vectors.zeros(vectSize)
        var vNb = 0
        wordSeq.foreach { word =>
            if(!(bStopWords.value)(word) & (word.length >= 2)) {
                bVectors.value.get(word).foreach { v =>
                    vSum = add(v, vSum)
                    vNb += 1
                }
            }
        }
        if (vNb != 0) {
            vSum = scalarMultiply(1.0 / vNb, vSum)
        }
        vSum
    }).filter(vec => Vectors.norm(vec, 1.0) > 0.0).persist()

vectSize = 100
sentences = /home/user/Documents/UASB03/comment_fr_filtre.csv MapPartitionsRDD[8] at textFile at <console>:58
sent2vec = MapPartitionsRDD[12] at filter at <console>:78


MapPartitionsRDD[12] at filter at <console>:78

In [5]:
sent2vec.count()

[Stage 3:>                                                          (0 + 2) / 2]

341575

In [6]:
sent2vec.take(2)

[[-0.10803462937474251,-0.14129161006874508,-0.006262904653946558,-0.08404038701620366,-0.05322896171775129,0.1742613274190161,-0.024228291172120307,-0.06793791893869638,-0.03040544000557727,-0.14900542309300766,0.2211357206106186,0.10082477786474757,-0.07490626650138034,0.0561821475211117,0.030181712988350123,0.1702256882821934,0.11139468455480205,0.035148729363249406,0.04694235697388649,0.16088787135150698,-0.10154435968272284,-0.10311198253960659,0.019933202924827732,0.0144196022560613,-0.020825731257597603,-0.010888704823123084,0.1343119005776114,0.17172166736175615,0.08000474216209517,0.11267615254554483,-0.14040807634592056,0.07251732092764641,-0.031084637675020427,-0.03833507539497481,0.13601658182839552,0.015134393134050898,0.06552158637593189,-0.04657607649763425,0.004045514182911979,-0.12908230142460927,0.09264796930882666,0.08443950046785176,-0.005131173309766584,-0.07981973265608151,-0.0939930412504408,0.031460326371921435,0.02567866216931078,0.08713952451944351,-0.09770151

In [7]:
val nbClusters = 20
val nbIterations = 200
val clustering = KMeans.train(sent2vec, nbClusters, nbIterations)

[Stage 225:>                                                        (0 + 2) / 2]

nbClusters = 20
nbIterations = 200
clustering = org.apache.spark.mllib.clustering.KMeansModel@474b5691


org.apache.spark.mllib.clustering.KMeansModel@474b5691

In [8]:
clustering.clusterCenters.foreach(clusterCenter => {
    w2vModel.findSynonyms(clusterCenter,5).foreach(synonym => print(" %s (%5.3f),"
            .format(synonym._1, synonym._2)))
    println()
})

 metro (0,687), o’hare (0,571), zubiarte (0,558), déraille (0,555), smartbike (0,554),
 isabelle (0,890), monique (0,884), sophie (0,883), hélène (0,882), catherine (0,881),
 barillerie (0,724), restaurant (0,714), quartier (0,709), promenade (0,707), herbouville (0,697),
 strive (0,946), tūmatauenga (0,943), honesty (0,942), often (0,939), took (0,938),
 envolé (0,660), plaisanter (0,641), souriante (0,641), nonchalant (0,637), rondement (0,633),
 parfait (0,999), tourment (0,645), adorable (0,642), bavard (0,639), rêvé (0,625),
 repense (0,841), déshabillée (0,833), bourly (0,832), culpabiliser (0,825), minuteur (0,813),
 recommande (0,979), réaffirme (0,764), l’immunité (0,763), effectivité (0,759), constitutionnellement (0,732),
 super (0,954), géant (0,666), saiyan (0,660), ninja (0,635), petz (0,608),
 tres (0,912), vientos (0,736), concierto (0,719), amor (0,710), palabras (0,708),
 balsam (0,889), mckillip (0,883), begley (0,879), tallulah (0,870), mantovani (0,866),
 ashikabi 