In [1]:
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseVector, SparseVector}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.util.KMeansDataGenerator

In [2]:
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import org.apache.spark.mllib.linalg.{Vector => SparkVector}
/** Copies the values of a Spark MLlib vector into a Breeze vector. */
def toBreeze(v: SparkVector): BV[Double] = {
    val values = v.toArray
    BV(values)
}
/** Builds a dense Spark MLlib vector from the values of a Breeze vector. */
def fromBreeze(bv: BV[Double]): SparkVector = {
    val values = bv.toArray
    Vectors.dense(values)
}
/** Element-wise sum of two Spark MLlib vectors, computed through Breeze. */
def add(v1: SparkVector, v2: SparkVector): SparkVector = {
    val sum = toBreeze(v1) + toBreeze(v2)
    fromBreeze(sum)
}
/** Multiplies every component of a Spark MLlib vector by the scalar `a`, through Breeze. */
def scalarMultiply(a: Double, v: SparkVector): SparkVector = {
    val scaled = a * toBreeze(v)
    fromBreeze(scaled)
}

import breeze.linalg.{DenseVector=>BDV, SparseVector=>BSV, Vector=>BV}
import org.apache.spark.mllib.linalg.{Vector=>SparkVector}
toBreeze: (v: org.apache.spark.mllib.linalg.Vector)breeze.linalg.Vector[Double]
fromBreeze: (bv: breeze.linalg.Vector[Double])org.apache.spark.mllib.linalg.Vector
add: (v1: org.apache.spark.mllib.linalg.Vector, v2: org.apache.spark.mllib.linalg.Vector)org.apache.spark.mllib.linalg.Vector
scalarMultiply: (a: Double, v: org.apache.spark.mllib.linalg.Vector)org.apache.spark.mllib.linalg.Vector


In [3]:
// read the stop words (one per line); the file handle is closed once the lines are consumed
import scala.io.Source
val stopWordsSource = Source.fromFile("/home/user/Documents/RCP-216/data/stop_words")
// toSet consumes the line iterator eagerly, so closing the source right after is safe
val stopWords = try stopWordsSource.getLines.toSet finally stopWordsSource.close()

// ship the stop words to the worker nodes
val bStopWords = sc.broadcast(stopWords)

// load the Word2VecModel
val w2vModel = Word2VecModel.load(sc, "/home/user/Documents/RCP-216/data/w2vModel")

// build a serializable Map[String, Vector] from the model's word -> Array[Float] map
//   mapValues alone does not return a serializable map (SI-7005), hence the map(identity)
val vectors = w2vModel.getVectors.mapValues(vv => Vectors.dense(vv.map(_.toDouble))).map(identity)

// ship the map to the worker nodes
val bVectors = sc.broadcast(vectors)



stopWords = Set(serious, latterly, down, side, moreover, please, ourselves, behind, for, find, further, mill, due, any, wherein, across, twenty, name, this, in, move, myself, have, your, off, once, are, is, his, why, too, among, everyone, show, empty, already, nobody, less, am, hence, system, than, four, fire, anyhow, three, whereby, himself, con, twelve, throughout, but, whether, below, co, mine, becomes, eleven, what, would, although, elsewhere, another, front, if, hereby, own, neither, bottom, up, etc, so, our, per, therein, must, beforehand, keep, do, all, him, had, somehow, re, onto, nor, every, herein, full, before, afterwards, somewhere, whither, else, namely, us, it, whereupon, two, thence, a, sometimes, became, thou...


Set(serious, latterly, down, side, moreover, please, ourselves, behind, for, find, further, mill, due, any, wherein, across, twenty, name, this, in, move, myself, have, your, off, once, are, is, his, why, too, among, everyone, show, empty, already, nobody, less, am, hence, system, than, four, fire, anyhow, three, whereby, himself, con, twelve, throughout, but, whether, below, co, mine, becomes, eleven, what, would, although, elsewhere, another, front, if, hereby, own, neither, bottom, up, etc, so, our, per, therein, must, beforehand, keep, do, all, him, had, somehow, re, onto, nor, every, herein, full, before, afterwards, somewhere, whither, else, namely, us, it, whereupon, two, thence, a, sometimes, became, though, within, as, because, well, meanwhile, has, she, yours, whose, yet, or, seems, describe, above, yourself, computer, herself, others, such, they, each, last, de, formerly, i, until, whatever, that, out, whenever, whereafter, amount, cannot, upon, to, become, sometime, least, 

In [4]:
// default size of the Word2Vec vectors
val vectSize = 100

// read the tweets file into an RDD (one item per line)
val sentences = sc.textFile("/home/user/Documents/RCP-216/data/tweets")

// compute the Word2Vec representation of each tweet: the mean of the vectors of its
// words that are not stop words, have at least 2 characters and are known to the model
val sent2vec = sentences.filter(sentence => sentence.length >= 1)
    .map(sentence => sentence.toLowerCase.split("\\W+"))
    .map { wordSeq =>
        // vectors of the words kept for this tweet (words missing from the model are skipped)
        val wordVectors = wordSeq.toSeq
            .filter(word => !bStopWords.value(word) && word.length >= 2)
            .flatMap(word => bVectors.value.get(word))
        val vSum = wordVectors.foldLeft(Vectors.zeros(vectSize))((acc, v) => add(v, acc))
        // a tweet without any known word keeps the zero vector, which the filter below drops
        if (wordVectors.isEmpty) vSum else scalarMultiply(1.0 / wordVectors.size, vSum)
    }.filter(vec => Vectors.norm(vec, 1.0) > 0.0).persist()

vectSize = 100
sentences = /home/user/Documents/RCP-216/data/tweets MapPartitionsRDD[8] at textFile at <console>:58
sent2vec = MapPartitionsRDD[12] at filter at <console>:78


MapPartitionsRDD[12] at filter at <console>:78

In [5]:
// count the tweet vectors that passed the non-zero norm filter (action on the persisted RDD)
sent2vec.count()

[Stage 3:>                                                          (0 + 2) / 2]

739

In [7]:
// fetch the first two tweet vectors to the driver for a quick visual check
sent2vec.take(2)

[[-0.027988523244857788,-0.07431866228580475,0.09145046770572662,-0.01157666090875864,0.03929232805967331,-0.05890924111008644,0.08137806504964828,0.12244899570941925,0.026949968189001083,0.10132735967636108,0.029005125164985657,0.021411007270216942,-0.004389403387904167,-0.041520968079566956,0.02486402727663517,-0.04098132997751236,0.05608462542295456,-0.045475587248802185,-0.13348358869552612,-0.04931315779685974,0.008751633577048779,0.0360114760696888,-0.0059855664148926735,0.02956448681652546,0.08999983221292496,-0.13633666932582855,-0.05564919114112854,-0.0076880375854671,-0.04098349064588547,0.03392435610294342,0.20209497213363647,0.050972163677215576,-0.04521965980529785,0.0523226223886013,0.09931804984807968,-0.002539577428251505,7.794810517225415E-5,-0.06215415894985199,-0.13437716662883759,0.08244974911212921,-0.09550885111093521,-0.04567639157176018,0.06954824179410934,0.02887226641178131,0.06212705373764038,-0.027039499953389168,-0.030935458838939667,0.08454225212335587,-0.

In [12]:
// K-means settings: number of clusters and maximum number of iterations
val nbClusters = 20
val nbIterations = 200
// cluster the tweet vectors; all other K-means parameters keep their defaults
val clustering = new KMeans()
    .setK(nbClusters)
    .setMaxIterations(nbIterations)
    .run(sent2vec)

[Stage 144:>                                                        (0 + 2) / 2]

nbClusters = 20
nbIterations = 200
clustering = org.apache.spark.mllib.clustering.KMeansModel@320b8539


org.apache.spark.mllib.clustering.KMeansModel@320b8539

In [13]:
// for each cluster centre, print on one line the 5 words returned by findSynonyms
// together with their score
for (center <- clustering.clusterCenters) {
    val line = w2vModel.findSynonyms(center, 5)
        .map { case (word, score) => " %s (%5.3f),".format(word, score) }
        .mkString
    println(line)
}

 weather (0,819), transmissions (0,635), updates (0,626), servicing (0,605), geostationary (0,591),
 thank (0,889), yes (0,843), sorry (0,787), tidak (0,744), vai (0,743),
 right (0,931), left (0,872), handed (0,701), frac (0,653), vec (0,614),
 day (0,854), week (0,742), days (0,721), year (0,719), month (0,691),
 night (1,000), saturday (0,732), morning (0,674), grinch (0,621), monday (0,606),
 oh (0,968), ch (0,676), cooh (0,633), ik (0,622), tib (0,592),
 joanie (0,802), peck (0,801), fisher (0,796), heather (0,793), bucks (0,792),
 teen (0,752), tonight (0,751), bullshit (0,745), bowie (0,731), ziggy (0,725),
 https (0,917), edu (0,793), php (0,791), howstuffworks (0,789), download (0,784),
 wish (0,839), want (0,805), feel (0,801), despise (0,790), presume (0,787),
 hackney (0,686), webcam (0,677), yazoo (0,668), listings (0,655), bemani (0,654),
 https (1,000), edu (0,864), geocities (0,862), adb (0,821), ibiblio (0,813),
 unrestricted (0,753), incentives (0,745), transfers (0,7