In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --driver-memory 3g pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "Sergey Grishaev clustering app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
spark

![kmeans_algo](pics/kmeans_algo.png)

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [5]:
schema = StructType([
    StructField("id", StringType()),
    StructField("comment_text", StringType()),
    StructField("toxic", IntegerType()),
    StructField("severe_toxic", IntegerType()),
    StructField("obscene", IntegerType()),
    StructField("threat", IntegerType()),
    StructField("insult", IntegerType()),
    StructField("identity_hate", IntegerType())
])

In [6]:
dataset = spark.read.csv("/lectures/lecture03/data/train.csv", schema=schema, header=True, multiLine=True, escape='"')

In [7]:
dataset = dataset.repartition(15).cache()

In [8]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [9]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

In [10]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [11]:
StopWordsRemover.loadDefaultStopWords("russian")

['и',
 'в',
 'во',
 'не',
 'что',
 'он',
 'на',
 'я',
 'с',
 'со',
 'как',
 'а',
 'то',
 'все',
 'она',
 'так',
 'его',
 'но',
 'да',
 'ты',
 'к',
 'у',
 'же',
 'вы',
 'за',
 'бы',
 'по',
 'только',
 'ее',
 'мне',
 'было',
 'вот',
 'от',
 'меня',
 'еще',
 'нет',
 'о',
 'из',
 'ему',
 'теперь',
 'когда',
 'даже',
 'ну',
 'вдруг',
 'ли',
 'если',
 'уже',
 'или',
 'ни',
 'быть',
 'был',
 'него',
 'до',
 'вас',
 'нибудь',
 'опять',
 'уж',
 'вам',
 'ведь',
 'там',
 'потом',
 'себя',
 'ничего',
 'ей',
 'может',
 'они',
 'тут',
 'где',
 'есть',
 'надо',
 'ней',
 'для',
 'мы',
 'тебя',
 'их',
 'чем',
 'была',
 'сам',
 'чтоб',
 'без',
 'будто',
 'чего',
 'раз',
 'тоже',
 'себе',
 'под',
 'будет',
 'ж',
 'тогда',
 'кто',
 'этот',
 'того',
 'потому',
 'этого',
 'какой',
 'совсем',
 'ним',
 'здесь',
 'этом',
 'один',
 'почти',
 'мой',
 'тем',
 'чтобы',
 'нее',
 'сейчас',
 'были',
 'куда',
 'зачем',
 'всех',
 'никогда',
 'можно',
 'при',
 'наконец',
 'два',
 'об',
 'другой',
 'хоть',
 'после',
 'на

In [12]:
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [13]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [14]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [15]:
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=20000)

In [16]:
from pyspark.ml import Pipeline

In [17]:
preprocessing = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer
])

In [18]:
preprocessing_model = preprocessing.fit(dataset)

In [23]:
preprocessed_dataset = preprocessing_model.transform(dataset).cache()

In [24]:
preprocessed_dataset.count()

159571

In [20]:
preprocessed_dataset.select(["word_vector"]).show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [25]:
from pyspark.ml.clustering import KMeans

In [26]:
kmeans = KMeans(featuresCol="word_vector", k=7, seed=5757)

In [27]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [28]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [29]:
clustering[clustering.columns[2:8] + ["prediction"]].take(20)

[Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=1, severe_toxic=0, obscene=0, threat=0, insult=0, identity_

### Silhouette score

https://en.wikipedia.org/wiki/Silhouette_(clustering)

In [30]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [31]:
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [32]:
evaluator.evaluate(clustering)

0.89991908171493

In [33]:
clustering.filter(clustering.prediction == 1)[["comment_text"]].show(5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [34]:
kmeans = KMeans(featuresCol="word_vector", k=2, seed=1234)

In [35]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [36]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [37]:
evaluator.evaluate(clustering)

0.9986016553486116

In [38]:
kmeans_model.clusterCenters()

[array([2.99315485e+00, 5.11825436e-01, 2.44501000e-01, ...,
        1.37905961e-04, 1.37905961e-04, 1.37905961e-04]),
 array([5.11500000e+02, 4.52380952e-01, 2.61904762e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00])]

In [39]:
import numpy as np

In [40]:
kmeans_model.clusterCenters()[1]

array([5.11500000e+02, 4.52380952e-01, 2.61904762e-01, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [41]:
np.argsort(kmeans_model.clusterCenters()[1])

array([ 9999, 13272, 13271, ...,  2349,  1108,     0])

In [42]:
preprocessing_model.stages[2].vocabulary

['',
 '"',
 'article',
 'page',
 'please',
 'like',
 'one',
 '-',
 'wikipedia',
 'talk',
 'think',
 'see',
 'also',
 'know',
 'may',
 'edit',
 'people',
 'use',
 'get',
 'even',
 'make',
 'articles',
 'good',
 'want',
 'time',
 'it.',
 'need',
 'new',
 'thank',
 'go',
 'first',
 'information',
 'many',
 'made',
 'find',
 'page.',
 'name',
 'really',
 'thanks',
 'say',
 'fuck',
 'much',
 'used',
 'since',
 'article.',
 'user',
 'add',
 'way',
 'take',
 'help',
 'sources',
 'look',
 'someone',
 'still',
 'read',
 'section',
 'pages',
 'going',
 'two',
 'deletion',
 'you.',
 'source',
 'edits',
 'without',
 'discussion',
 'well',
 'editing',
 'wikipedia.',
 'point',
 'deleted',
 'back',
 'might',
 'work',
 'something',
 'image',
 'another',
 'added',
 'never',
 'put',
 'link',
 'seems',
 'stop',
 ',',
 'blocked',
 'feel',
 '.',
 'list',
 'block',
 'right',
 'said',
 '(utc)',
 'using',
 'ask',
 'personal',
 'fact',
 'sure',
 'article,',
 'believe',
 'hope',
 'page,',
 'note',
 'actually',


In [43]:
for i in np.argsort(kmeans_model.clusterCenters()[1])[:40]:
    print(preprocessing_model.stages[2].vocabulary[i])

cared
o'reilly
sneak
(mostly
blew
difficult,
south,
do)
clairsentience
banners
marginal
critics,
allemande
datestamp
jealouslyfavonian
implicit
colloquial
objections.
commited
fight,
pounds
ritual
wp:rm
repetition
1973
one),
(meaning
ossetia
int
btw:
(album)
contra
tours
calculate
{
situated
posting.
youve
edit-war
arises


In [44]:
for i in np.argsort(kmeans_model.clusterCenters()[1])[-40:]:
    print(preprocessing_model.stages[2].vocabulary[i])

wireless
tx
love
one
keller
frm
(talk)
=
name:
helen
],
joke:
[
first
page
-
\
politcal
piggy
!!!
o
you?
fggt!
know
|
/
egg
bad
fack
wanker
sexual.
homo
all!!
fuck
rape
anal
smells
nikko
tacos



In [45]:
for i in np.argsort(kmeans_model.clusterCenters()[0])[:40]:
    print(preprocessing_model.stages[2].vocabulary[i])

nikko
sexual.
tacos
all!!
],
joke:
frm
keller
politcal
fack
piggy
(*)
tx
(list
payload
ana
ecology
discern
volcano
seriousness
ones)
comb
ti
devotion
proposals,
bengali
heating
optics
abstracts
fuckk
mixes
overall.
ethos
congregation
fish.
sufficient,
more...
anti-romanian
willful
metacritic


In [46]:
for i in np.argsort(kmeans_model.clusterCenters()[0])[-40:]:
    print(preprocessing_model.stages[2].vocabulary[i])

say
thanks
really
name
page.
find
made
many
information
first
go
thank
new
need
it.
time
want
good
articles
make
even
get
use
people
edit
may
know
also
see
think
talk
wikipedia
-
one
like
please
page
article
"



## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

![curse](pics/lda.png)

In [47]:
from pyspark.ml.clustering import LDA

In [48]:
lda = LDA(featuresCol="word_vector", seed=5757, k=7)

In [49]:
lda_model = lda.fit(preprocessed_dataset)

In [50]:
topics = lda_model.transform(preprocessed_dataset)

In [51]:
topics.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [52]:
lda_model.vocabSize()

20000

In [53]:
lda_model.describeTopics(maxTermsPerTopic=10).collect()

[Row(topic=0, termIndices=[1028, 854, 1201, 1291, 1867, 1389, 0, 432, 3565, 2816], termWeights=[0.019219863339050642, 0.018725111439243117, 0.012679219108185966, 0.00983833160540672, 0.009437891390795969, 0.00895307155202717, 0.008567983306776229, 0.0075119548419698276, 0.007457446016544144, 0.007432205338910132]),
 Row(topic=1, termIndices=[40, 0, 257, 175, 999, 1108, 211, 249, 359, 29], termWeights=[0.04652685568271289, 0.034283481266029384, 0.02974470035001793, 0.023989101798392188, 0.020459016500430277, 0.01575874959959341, 0.014971631085737463, 0.013272138959553006, 0.01280778430966381, 0.010512178289035171]),
 Row(topic=2, termIndices=[0, 1, 2, 8, 5, 6, 9, 3, 10, 16], termWeights=[0.06179727225345281, 0.0129727671293497, 0.00595669106029487, 0.005662956727473026, 0.005531537524141662, 0.00544254693200636, 0.005207971527863071, 0.005017822700234205, 0.004628387213555436, 0.004585262896629671]),
 Row(topic=3, termIndices=[0, 4, 59, 2, 3, 1, 121, 14, 69, 9], termWeights=[0.064322614

In [54]:
for i in [0, 4, 59, 2, 1, 121, 3, 14, 69, 9]:
    print(preprocessing_model.stages[-1].vocabulary[i])


please
deletion
article
"
speedy
page
may
deleted
talk


## Clustering is a good dimensionality reduction technique

In [55]:
topics

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, words: array<string>, words_filtered: array<string>, word_vector: vector, topicDistribution: vector]

In [56]:
from pyspark.sql import functions as f

In [57]:
target = f.when(
    (topics.toxic == 0) &
    (topics.severe_toxic == 0) &
    (topics.obscene == 0) &
    (topics.threat == 0) &
    (topics.insult == 0) &
    (topics.identity_hate == 0),
    0
).otherwise(1)

In [58]:
new_dataset = topics.withColumn("target", target)[["id", "target", "topicDistribution"]].cache()

In [59]:
new_dataset.take(5)

[Row(id='bff59f526e2fe3ee', target=0, topicDistribution=DenseVector([0.0111, 0.0114, 0.2666, 0.0118, 0.4275, 0.2604, 0.0112])),
 Row(id='01c0ae884d69319b', target=0, topicDistribution=DenseVector([0.016, 0.0164, 0.0234, 0.017, 0.0167, 0.8945, 0.016])),
 Row(id='e59524f461645a8e', target=0, topicDistribution=DenseVector([0.0005, 0.0005, 0.9533, 0.0442, 0.0005, 0.0007, 0.0005])),
 Row(id='236d6812f2499156', target=0, topicDistribution=DenseVector([0.002, 0.0021, 0.0029, 0.0021, 0.2113, 0.7776, 0.002])),
 Row(id='a66136914c57fd85', target=0, topicDistribution=DenseVector([0.0028, 0.0029, 0.9814, 0.003, 0.003, 0.0041, 0.0028]))]

In [60]:
from pyspark.ml.classification import LogisticRegression

In [61]:
lr = LogisticRegression(featuresCol="topicDistribution", labelCol="target")

In [62]:
train = new_dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [63]:
test = new_dataset.join(train, on="id", how="leftanti").coalesce(15).cache()

In [64]:
train.count()
test.count()

31337

In [65]:
lr_model = lr.fit(train)

In [66]:
predictions = lr_model.transform(test)

In [67]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [68]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="target", metricName='areaUnderROC')

In [69]:
evaluator.evaluate(predictions)

0.8443676064034972

In [None]:
spark.stop()