In [160]:
import glob
import os
import sys

# Configure the environment before pyspark/shell.py is executed at the end of this cell.
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the Spark-bundled Python sources at the front of the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
# The bundled py4j archive carries its version in the file name, so discover it
# instead of pinning one release; a pinned name that does not exist would be
# silently ignored by the import system.
for py4j_zip in sorted(glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

# Run Spark's interactive shell start-up script in this namespace; per its banner
# it leaves a SparkSession bound to `spark`. The file is closed deterministically,
# and compiling with the real path keeps tracebacks pointing at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


![kmeans](pics/kmeans.svg)

![kmeans_algo](pics/kmeans_algo.png)

In [162]:
# Display the SparkSession that the bootstrap cell made available as `spark`.
spark

In [163]:
from pyspark.sql.types import *

In [164]:
import pandas as pd

In [165]:
# Load the training CSV into pandas and replace every missing value with an
# empty string.
train_csv_path = "/data/home/pavel.klemenkov/lectures/lecture03/toxic_comment/train.csv"
df = pd.read_csv(train_csv_path).fillna("")

In [166]:
# Explicit Spark schema for the pandas frame: two string columns followed by
# six integer columns, in this exact order.
schema = StructType(
    [StructField(name, StringType()) for name in ("id", "comment_text")]
    + [
        StructField(name, IntegerType())
        for name in ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    ]
)

In [167]:
# Convert the pandas frame into a Spark DataFrame using the explicit schema above.
dataset = spark.createDataFrame(df, schema=schema)

In [168]:
# Show how many partitions the underlying RDD currently has.
dataset.rdd.getNumPartitions()

2

In [169]:
# Redistribute the data into 4 partitions and mark the result for caching.
# NOTE: `dataset` is rebound to a value derived from itself, so re-running this
# cell repartitions (and caches) the already repartitioned frame again.
dataset = dataset.repartition(4).cache()

In [170]:
# Display the DataFrame repr (column names and types).
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [171]:
from pyspark.ml.feature import *

In [172]:
# Tokenizer stage: reads `comment_text` and writes the token array to `words`.
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [173]:
# Default English stop-word list provided by StopWordsRemover.
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [174]:
# Stop-word removal stage chained on the tokenizer's output column; writes the
# filtered tokens to `words_filtered`.
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [175]:
# Count-vectorisation stage over the filtered tokens; writes `word_vector`.
# vocabSize=200 matches the 200-dimensional sparse vectors shown in the recorded
# output of the `take(5)` cell below.
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=200)

In [176]:
from pyspark.ml import Pipeline

In [177]:
# Preprocessing pipeline: tokenise, drop stop words, then count-vectorise.
preprocessing = Pipeline(stages=[tokenizer, swr, count_vectorizer])

In [178]:
# Fit the preprocessing pipeline on the full dataset.
preprocessing_model = preprocessing.fit(dataset)

In [179]:
# Apply the fitted pipeline; the result carries the original columns plus
# `words`, `words_filtered` and `word_vector`.
preprocessed_dataset = preprocessing_model.transform(dataset)

In [180]:
# Peek at the first five count vectors.
preprocessed_dataset.select(["word_vector"]).take(5)

[Row(word_vector=SparseVector(200, {0: 6.0, 1: 1.0, 70: 1.0, 78: 1.0, 101: 1.0, 179: 1.0})),
 Row(word_vector=SparseVector(200, {0: 2.0, 2: 1.0, 6: 1.0, 15: 1.0, 19: 2.0, 20: 1.0, 22: 1.0, 33: 1.0, 64: 1.0, 66: 1.0, 69: 1.0, 80: 1.0, 89: 1.0, 108: 1.0, 109: 1.0, 114: 1.0, 128: 1.0, 133: 1.0, 152: 1.0, 157: 1.0})),
 Row(word_vector=SparseVector(200, {0: 5.0, 1: 2.0, 59: 1.0, 69: 1.0, 87: 1.0, 100: 1.0, 194: 3.0, 198: 2.0})),
 Row(word_vector=SparseVector(200, {0: 3.0, 4: 1.0, 66: 1.0, 67: 1.0, 83: 1.0, 90: 1.0, 151: 1.0})),
 Row(word_vector=SparseVector(200, {0: 8.0, 1: 2.0, 2: 5.0, 3: 1.0, 4: 2.0, 9: 1.0, 10: 1.0, 11: 1.0, 14: 3.0, 21: 1.0, 31: 1.0, 46: 1.0, 55: 1.0, 59: 4.0, 63: 1.0, 65: 1.0, 67: 1.0, 69: 3.0, 100: 1.0, 106: 1.0, 108: 1.0, 121: 6.0, 160: 2.0, 168: 2.0, 171: 2.0}))]

In [181]:
# Display `dataset` again: it still has only its original columns, because
# the transformed result was assigned to `preprocessed_dataset`.
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [182]:
from pyspark.ml.clustering import KMeans

In [183]:
# K-means estimator over the count vectors: 6 clusters, fixed seed.
kmeans = KMeans(featuresCol="word_vector", k=6, seed=5757)

In [184]:
# Fit the 6-cluster K-means model on the preprocessed data.
kmeans_model = kmeans.fit(preprocessed_dataset)

In [52]:
# Assign every row to a cluster; the recorded output below shows the
# assignment in a `prediction` column.
clustering = kmeans_model.transform(preprocessed_dataset)

In [58]:
# Show columns 2..7 by position (toxic .. identity_hate in the schema) next to
# the assigned cluster for the first ten rows.
clustering.select(*clustering.columns[2:8], "prediction").take(10)

[Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=1, severe_toxic=1, obscene=1, threat=0, insult=1, identity_hate=1, prediction=0),
 Row(toxic=1, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=1, threat=0, insult=1, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0)]

In [59]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [60]:
# Clustering-quality evaluator over the count vectors. The metric is left at
# the library default (presumably silhouette; confirm for this Spark version).
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [61]:
# Score the 6-cluster assignment.
evaluator.evaluate(clustering)

0.5840461751006454

In [63]:
# Inspect the raw text of five comments that were assigned to cluster 1.
cluster_one = clustering.filter(clustering.prediction == 1)
cluster_one.select("comment_text").take(5)

[Row(comment_text='"Contents of the library (objects and functions to be used outside, situation\nlate August 2004)\n\nClasses:\nPage: A MediaWiki page\n    __init__               Page(Site, Title) - the page with title Title on wikimedia site Site\n    title                  The name of the page, in a form suitable for an interwiki link\n    urlname                The name of the page, in a form suitable for a URL\n    titleWithoutNamespace  The name of the page, with the namespace part removed\n    section                The section of the page (the part of the name after \'#\')\n    sectionFreeTitle       The name without the section part\n    aslink                 The name of the page in the form Title or lang:Title\n    site                   The wiki this page is in\n    encoding               The encoding of the page\n    isAutoTitle            If the title is a well known, auto-translatable title\n    autoFormat             Returns (dictName, value), where value can be a year,

In [64]:
# Second K-means run with two clusters. It is seeded like the k=6 run above so
# that re-running the notebook yields the same clustering; the recorded scores
# below came from an unseeded run and may differ slightly.
kmeans = KMeans(featuresCol="word_vector", k=2, seed=5757)

In [65]:
# Fit the 2-cluster model. NOTE: this rebinds `kmeans_model`, replacing the
# 6-cluster model fitted earlier.
kmeans_model = kmeans.fit(preprocessed_dataset)

In [66]:
# Re-assign rows using the 2-cluster model. NOTE: this rebinds `clustering`.
clustering = kmeans_model.transform(preprocessed_dataset)

In [67]:
# Score the 2-cluster assignment with the same ClusteringEvaluator as before.
evaluator.evaluate(clustering)

0.9994404484935098

In [76]:
# Show the cluster centres (one array per cluster, one weight per vocabulary term).
kmeans_model.clusterCenters()

[array([3.0109565 , 0.51180268, 0.24449041, 0.17984831, 0.17243325,
        0.16500564, 0.15431867, 0.14628306, 0.14062304, 0.14034725,
        0.11886674, 0.11762567, 0.10255735, 0.10247587, 0.09564999,
        0.09480381, 0.0941394 , 0.09082362, 0.08216748, 0.08130876,
        0.07923405, 0.07139275, 0.07050896, 0.06879153, 0.06503071,
        0.06314404, 0.06249216, 0.06222264, 0.06039865, 0.05926413,
        0.05809828, 0.05765325, 0.05692616, 0.0554093 , 0.05474489,
        0.0546446 , 0.05430613, 0.05344114, 0.05312147, 0.05280807,
        0.05084618, 0.05274539, 0.05250094, 0.05196816, 0.0514103 ,
        0.05116585, 0.05008148, 0.04925411, 0.04914128, 0.04869625,
        0.04811959, 0.0476871 , 0.04764949, 0.04752413, 0.04740504,
        0.04605115, 0.04578789, 0.04533659, 0.04526138, 0.0452175 ,
        0.0446095 , 0.04420835, 0.04403284, 0.04393882, 0.04333709,
        0.04328695, 0.0428858 , 0.04227153, 0.04097405, 0.04091764,
        0.04075467, 0.04058543, 0.04028457, 0.04

In [77]:
import numpy as np

In [82]:
# Largest per-term weight in the centre of cluster 0.
np.max(kmeans_model.clusterCenters()[0])

3.0109564999373197

In [86]:
# Vocabulary indices of cluster 0's centre ordered by descending weight
# (negating the centre makes argsort's ascending order descending).
np.argsort(-kmeans_model.clusterCenters()[0])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  41,  42,  43,  44,  45,  40,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  87,  86,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 125, 124, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 137, 136, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
       157, 158, 159, 153, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 175, 174, 176, 177, 178, 179, 18

In [87]:
# Vocabulary indices of cluster 1's centre ordered by descending weight.
np.argsort(-kmeans_model.clusterCenters()[1])

array([  0,  40, 153,   3,   7,   1,  18,  13,   6,   5, 159,   2,  36,
        55,  47,  12,  15,  16, 174, 140,  20,  48, 194,  58,  26,  29,
        23,  10,  22, 142,  96,  53,  56,  49,  50, 164, 124,  86,  88,
        91,  97, 107, 115, 128,  99,  17,  42,  41,  11,  27,  19,  24,
        85,   8,  30, 156,  14, 104,  61,  60,  44,  34, 117,  39, 134,
       109, 135,  81, 138,  35,  54, 113, 143, 106,  31,  33,  82, 166,
       133,  93,  45,  79,  63,  62,  80,  67,  68, 186, 192,  66, 177,
         4,  51,  43,  46,  73,  75,  78,  65, 136, 193, 141, 190, 130,
       129, 196, 151, 146, 148, 152, 155, 157, 158, 181, 180, 161, 162,
         9, 165, 173, 171, 188, 167, 122, 199,  98,  71,  70,  74,  32,
        69, 110, 111,  92, 101,  77,  64,  25,  59, 118,  57, 119,  37,
        38,  52, 114,  72, 170, 172,  76, 169, 168, 175, 184, 178, 179,
       182, 183, 185, 187, 189, 191, 195, 197, 176,  83, 121,  87, 125,
       126, 127,  21, 120, 131, 132, 116, 137, 112, 139, 108, 10

In [91]:
# Vocabulary learned by the third pipeline stage (the fitted CountVectorizer);
# list positions are the term indices used in `word_vector`.
preprocessing_model.stages[2].vocabulary

['',
 '"',
 'article',
 'page',
 'please',
 'like',
 'one',
 '-',
 'wikipedia',
 'talk',
 'think',
 'see',
 'also',
 'know',
 'may',
 'edit',
 'people',
 'use',
 'get',
 'even',
 'make',
 'articles',
 'good',
 'want',
 'time',
 'it.',
 'need',
 'new',
 'thank',
 'go',
 'first',
 'information',
 'many',
 'made',
 'find',
 'page.',
 'name',
 'really',
 'thanks',
 'say',
 'fuck',
 'much',
 'used',
 'since',
 'article.',
 'user',
 'add',
 'way',
 'take',
 'help',
 'sources',
 'look',
 'someone',
 'still',
 'read',
 'section',
 'pages',
 'going',
 'two',
 'deletion',
 'you.',
 'source',
 'edits',
 'without',
 'discussion',
 'well',
 'editing',
 'wikipedia.',
 'point',
 'deleted',
 'back',
 'might',
 'work',
 'something',
 'image',
 'another',
 'added',
 'never',
 'put',
 'link',
 'seems',
 'stop',
 ',',
 'blocked',
 'feel',
 '.',
 'list',
 'block',
 'right',
 'said',
 '(utc)',
 'using',
 'ask',
 'personal',
 'fact',
 'sure',
 'article,',
 'believe',
 'hope',
 'page,',
 'note',
 'actually',


In [96]:
# Print the ten vocabulary terms that carry the largest weight in the centre
# of cluster 1, heaviest first.
top_term_ids = np.argsort(-kmeans_model.clusterCenters()[1])[:10]
vocabulary = preprocessing_model.stages[2].vocabulary
for term_id in top_term_ids:
    print(vocabulary[term_id])


fuck
|
page
-
"
get
know
one
like


In [97]:
# Print the ten vocabulary terms that carry the largest weight in the centre
# of cluster 0, heaviest first.
top_term_ids = np.argsort(-kmeans_model.clusterCenters()[0])[:10]
vocabulary = preprocessing_model.stages[2].vocabulary
for term_id in top_term_ids:
    print(vocabulary[term_id])


"
article
page
please
like
one
-
wikipedia
talk


## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

In [118]:
from pyspark.ml.clustering import LDA

In [119]:
# The recorded LDA outputs below report lda_model.vocabSize() == 10000 and look
# up vocabulary indices far above 200, whereas the CountVectorizer fitted for
# the K-means section was limited to vocabSize=200. Re-fit the preprocessing
# with a 10,000-term vocabulary here so that a top-to-bottom run reproduces the
# LDA section instead of failing on out-of-range vocabulary indices.
# NOTE: this rebinds `count_vectorizer`, `preprocessing`, `preprocessing_model`
# and `preprocessed_dataset` for the rest of the notebook.
LDA_VOCAB_SIZE = 10000
count_vectorizer = CountVectorizer(
    inputCol=swr.getOutputCol(),
    outputCol="word_vector",
    vocabSize=LDA_VOCAB_SIZE,
)
preprocessing = Pipeline(stages=[tokenizer, swr, count_vectorizer])
preprocessing_model = preprocessing.fit(dataset)
preprocessed_dataset = preprocessing_model.transform(dataset)

# LDA estimator over the count vectors: 6 topics, fixed seed.
lda = LDA(featuresCol="word_vector", seed=5757, k=6)

In [120]:
# Fit the LDA topic model on the preprocessed data.
lda_model = lda.fit(preprocessed_dataset)

In [121]:
# Score every row with the fitted LDA model; the result gains a
# `topicDistribution` vector column (see the DataFrame repr further below).
topics = lda_model.transform(preprocessed_dataset)

In [122]:
# Peek at the first five scored rows (all columns, so the output is large).
topics.take(5)

[Row(id='26e1b63617df36b1', comment_text='"\n\n charlie wilson \n\ni didnt notice the music genres that were reverted. However my intention was to revert his alias that you deleted.  His alias a.k.a is actually ""Uncle Charlie"" and needs to be put back and shouldn\'t  have been removed."', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, words=['"', '', '', 'charlie', 'wilson', '', '', 'i', 'didnt', 'notice', 'the', 'music', 'genres', 'that', 'were', 'reverted.', 'however', 'my', 'intention', 'was', 'to', 'revert', 'his', 'alias', 'that', 'you', 'deleted.', '', 'his', 'alias', 'a.k.a', 'is', 'actually', '""uncle', 'charlie""', 'and', 'needs', 'to', 'be', 'put', 'back', 'and', "shouldn't", '', 'have', 'been', 'removed."'], words_filtered=['"', '', '', 'charlie', 'wilson', '', '', 'didnt', 'notice', 'music', 'genres', 'reverted.', 'however', 'intention', 'revert', 'alias', 'deleted.', '', 'alias', 'a.k.a', 'actually', '""uncle', 'charlie""', 'needs', 'put', 'back

In [124]:
# Size of the vocabulary the LDA model was fitted on.
lda_model.vocabSize()

10000

In [127]:
# For each topic, list its listed term indices and their weights.
lda_model.describeTopics().collect()

[Row(topic=0, termIndices=[214, 211, 5, 463, 582, 334, 131, 751, 700, 1355], termWeights=[0.043883763487320766, 0.021283578796324473, 0.020836523084179093, 0.019396636625816983, 0.019349235472007363, 0.018801192071946627, 0.015804631416439487, 0.014914671745475647, 0.01359265539904886, 0.013339323640011343]),
 Row(topic=1, termIndices=[0, 2, 1, 3, 4, 10, 7, 14, 8, 11], termWeights=[0.03964717773128192, 0.01759147883131059, 0.01027451466047655, 0.008121462633726536, 0.00781864276000534, 0.005940282263819784, 0.005706721848796293, 0.005682362549471249, 0.005398311182518471, 0.00521945071304873]),
 Row(topic=2, termIndices=[40, 0, 379, 257, 249, 29, 359, 567, 474, 153], termWeights=[0.0686108578826063, 0.026297949145799134, 0.022451048433182883, 0.01996391934306116, 0.019469845828923173, 0.019217280267994158, 0.01706127934740466, 0.01622127296048629, 0.01471626906998474, 0.013436059956307917]),
 Row(topic=3, termIndices=[0, 1, 4, 3, 9, 5, 8, 2, 15, 6], termWeights=[0.17251287835326581, 0.

In [130]:
# Translate the term indices listed for topic 0 in the describeTopics() output
# into vocabulary words. The indices are copied by hand from that output.
topic_term_ids = [214, 211, 5, 463, 582, 334, 131, 751, 700, 1355]
lda_vocabulary = preprocessing_model.stages[-1].vocabulary
for term_id in topic_term_ids:
    print(lda_vocabulary[term_id])

nigger
fucking
like
moron
sucks
redirect
hi
dick
jews
fucksex


In [131]:
# Translate the term indices listed for topic 1 in the describeTopics() output
# into vocabulary words. The indices are copied by hand from that output.
topic_term_ids = [0, 2, 1, 3, 4, 10, 7, 14, 8, 11]
lda_vocabulary = preprocessing_model.stages[-1].vocabulary
for term_id in topic_term_ids:
    print(lda_vocabulary[term_id])


article
"
page
please
think
-
may
wikipedia
see


In [132]:
# Translate the term indices listed for topic 2 in the describeTopics() output
# into vocabulary words. The indices are copied by hand from that output.
topic_term_ids = [40, 0, 379, 257, 249, 29, 359, 567, 474, 153]
lda_vocabulary = preprocessing_model.stages[-1].vocabulary
for term_id in topic_term_ids:
    print(lda_vocabulary[term_id])

fuck

fat
shit
suck
go
gay
jew
ass
|


In [133]:
# Translate a hand-copied list of term indices into vocabulary words.
# NOTE(review): the describeTopics() row this list came from is cut off in the
# recorded output; it matches none of topics 0-3, so presumably topic 4 or 5.
topic_term_ids = [0, 105, 1107, 175, 13, 18, 36, 57, 8, 1]
lda_vocabulary = preprocessing_model.stages[-1].vocabulary
for term_id in topic_term_ids:
    print(lda_vocabulary[term_id])


•
tacos
u
know
get
name
going
wikipedia
"


## Clustering is a good dimensionality reduction technique

In [137]:
# Display the scored DataFrame's columns, including `topicDistribution`.
topics

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, words: array<string>, words_filtered: array<string>, word_vector: vector, topicDistribution: vector]

In [139]:
from pyspark.sql import functions as f

In [140]:
# Binary target expression: 0 when all six label columns are 0, otherwise 1.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
no_label_set = topics[label_columns[0]] == 0
for label_name in label_columns[1:]:
    no_label_set = no_label_set & (topics[label_name] == 0)
target = f.when(no_label_set, 0).otherwise(1)

In [144]:
# Attach the binary target, keep only the id, the target and the LDA topic
# distribution, and mark the reduced frame for caching.
labelled_topics = topics.withColumn("target", target)
new_dataset = labelled_topics.select("id", "target", "topicDistribution").cache()

In [145]:
new_dataset.take(5)

[Row(id='6fdb7b6734f8bf40', target=0, topicDistribution=DenseVector([0.0052, 0.9694, 0.0053, 0.0082, 0.0056, 0.0063])),
 Row(id='39b742437bd11ec9', target=0, topicDistribution=DenseVector([0.0025, 0.0031, 0.0025, 0.0039, 0.0027, 0.9853])),
 Row(id='9bbb8e1922fe1efb', target=0, topicDistribution=DenseVector([0.0665, 0.0836, 0.0674, 0.1029, 0.0708, 0.6088])),
 Row(id='54f9e59924682c6e', target=0, topicDistribution=DenseVector([0.009, 0.0113, 0.0091, 0.9501, 0.0096, 0.0108])),
 Row(id='62e38775721eb79e', target=1, topicDistribution=DenseVector([0.0127, 0.0159, 0.4886, 0.0196, 0.0135, 0.4497]))]

In [146]:
from pyspark.ml.classification import LogisticRegression

In [147]:
# Logistic regression that uses the topic distribution as its feature vector.
lr = LogisticRegression(featuresCol="topicDistribution", labelCol="target")

In [150]:
# Training split: sample with fraction 0.8 from each target class (seeded), cached.
train = new_dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [151]:
# Test split: rows of `new_dataset` whose `id` does not appear in `train`
# (left anti join), cached. Assumes `id` values are unique; TODO confirm.
test = new_dataset.join(train, on="id", how="leftanti").cache()

In [153]:
# Fit the classifier on the training split.
lr_model = lr.fit(train)

In [154]:
# Score the held-out rows with the fitted classifier.
predictions = lr_model.transform(test)

In [155]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [156]:
# Binary-classification evaluator for the `target` label.
# NOTE: this rebinds `evaluator`, which previously held the ClusteringEvaluator.
# The metric is left at the library default (presumably areaUnderROC; confirm
# for this Spark version).
evaluator = BinaryClassificationEvaluator(labelCol="target")

In [157]:
# Score the held-out predictions with the binary-classification evaluator.
evaluator.evaluate(predictions)

0.8381544862513701

## Last time, using CountVectorizer with a 20k-word vocabulary, we got 0.8275751487175559