In [5]:
import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

In [6]:
number_cores = 8
memory_gb = 8
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [7]:
from pyspark.ml.linalg import Vectors
data = [(Vectors.dense([0.0, 0.0]), 2.0),
        (Vectors.dense([1.0, 1.0]), 2.0),
        (Vectors.dense([9.0, 8.0]), 2.0),
        (Vectors.dense([8.0, 9.0]), 2.0)]

In [8]:
from pyspark.sql import SQLContext
sqlContext= SQLContext(sc)
df = sqlContext.createDataFrame(data, ["features", "weighCol"])

In [9]:
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2)
kmeans.setSeed(1)
kmeans.setWeightCol("weighCol")
kmeans.setMaxIter(10)
kmeans.getMaxIter()
# 10

10

In [10]:
kmeans.clear(kmeans.maxIter)
model = kmeans.fit(df)
model.getDistanceMeasure()

'euclidean'

In [11]:
model.setPredictionCol("newPrediction")
model.predict(df.head().features)

0

In [12]:
centers = model.clusterCenters()
len(centers)

2

In [13]:
transformed = model.transform(df).select("features", "newPrediction")
rows = transformed.collect()
rows

[Row(features=DenseVector([0.0, 0.0]), newPrediction=0),
 Row(features=DenseVector([1.0, 1.0]), newPrediction=0),
 Row(features=DenseVector([9.0, 8.0]), newPrediction=1),
 Row(features=DenseVector([8.0, 9.0]), newPrediction=1)]

## Run the model for Bisecting KMeans based on the API

In [14]:
# dataset: [ ([value1, value2, ...], weight), ( [...], weight2 ), ... ]

data = [(Vectors.dense([0.0, 0.0]), 2.0), 
        (Vectors.dense([1.0, 1.0]), 2.0),
        (Vectors.dense([9.0, 8.0]), 2.0), 
        (Vectors.dense([8.0, 9.0]), 2.0)]

In [15]:
df = sqlContext.createDataFrame(data, ["features", "weighCol"])

In [16]:
from pyspark.ml.clustering import BisectingKMeans

# k is the number of clusters
bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)

In [17]:
bkm.setMaxIter(10)
bkm.getMaxIter()

10

In [18]:
bkm.clear(bkm.maxIter)

In [19]:
bkm.setSeed(1)

BisectingKMeans_9ae7028a23dd

In [20]:
bkm.setWeightCol("weighCol")

BisectingKMeans_9ae7028a23dd

In [21]:
bkm.getSeed()

1

In [22]:
bkm.clear(bkm.seed)

In [23]:
model = bkm.fit(df)

In [24]:
model.getMaxIter()

20

In [25]:
model.setPredictionCol("newPrediction")
model.predict(df.head().features)

0

In [26]:
centers = model.clusterCenters()

In [27]:
len(centers)

2

### Reading and interpreting output

In [28]:
model.computeCost(df)

2.0

In [29]:
model.hasSummary

True

In [30]:
summary = model.summary

In [31]:
summary.k

2

In [32]:
summary.clusterSizes

[2, 2]

In [33]:
summary.trainingCost

4.000000000000114

In [34]:
transformed = model.transform(df).select("features", "newPrediction")

In [35]:
rows = transformed.collect()

In [36]:
rows[0].newPrediction == rows[1].newPrediction

True

In [37]:
rows[2].newPrediction == rows[3].newPrediction

True

### Saving Data:

In [38]:
temp_path = './data/clustering'

In [39]:
bkm_path = temp_path + "/bkm"

In [40]:
bkm.save(bkm_path)

Py4JJavaError: An error occurred while calling o257.save.
: java.io.IOException: Path ./data/clustering/bkm already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
bkm2 = BisectingKMeans.load(bkm_path)

In [None]:
bkm2.getK()

In [None]:
bkm2.getDistanceMeasure()

In [None]:
model_path = temp_path + "/bkm_model"

In [None]:
model.save(model_path)

In [None]:
model2 = BisectingKMeansModel.load(model_path)

In [None]:
#model2.hasSummary

In [None]:
#model.clusterCenters()[0] == model2.clusterCenters()[0]

In [None]:
#model.clusterCenters()[1] == model2.clusterCenters()[1]

In [None]:
#model.transform(df).take(1) == model2.transform(df).take(1)

## Reddit Crypto Data

- Download the [Reddit Crypto data](https://www.cs.wcupa.edu/lngo/data2/reddit_crypto.zip)
- Unzip the crypto data.
- Review [the metadata](https://www.kaggle.com/pavellexyr/reddit-cryptocurrency-data-for-august-2021)
- Perform a K-mean clustering on the text of posts and comments. Address the followings:
  - What data point to remove, what not to remove?
  - How to clean up text data?
  - How to make the resulting cluster data more meaningful?

In [41]:
spark = pyspark.sql.SparkSession(sc)
df_posts = spark.read.csv("./data/reddit_crypto/crypto-aug-2021-posts.csv", 
                          header=True, 
                          inferSchema=True, 
                          multiLine=True, 
                          escape='"').cache()

In [42]:
df_posts.count()

250569

In [43]:
df_posts.take(10)

[Row(type='post', id='pfi1nw', subreddit.id='9e4pv', subreddit.name='cryptomoonshots', subreddit.nsfw=False, created_utc=1630454394, permalink='https://old.reddit.com/r/CryptoMoonShots/comments/pfi1nw/next_generation_safe_token_with_cake_rewards_meth/', domain='self.cryptomoonshots', url=None, selftext="⚜️ 𝓟𝓻𝓸𝓶𝓮𝓽𝓱𝓮𝓾𝓼 ⚜️\n\n&amp;#x200B;\n\n&amp;#x200B;\n\nThursday 8pm UTC #Prometheus will launch. The NFT titans will be combined to the most (in)famous people in the world. The first one who will be Zeus in the battle against others is Elon Musk - otherwise called: Elon Zuskus.\n\n&amp;#x200B;\n\nLet's see what he brings to the battlefield!\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n⚡️ [https://t.me/Prometheus\\_BSC](https://t.me/Prometheus_BSC)\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n——— 🔥: 𝐈𝐧𝐟𝐨 :🔥 ———\n\n&amp;#x200B;\n\n🍰 Never been done before Cross-Chain NFT project with $CAKE rewards!\n\n🖼 NFT Giveaways to the community! (Custom and existing)\n\n♠️ N

In [56]:
filter_list = ['[removed]']
df_clean = df_posts.filter( (df_posts.selftext != '[removed]') & (df_posts.selftext != '[deleted]'))
df_clean.count()

73691

In [64]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 6.7 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 10.7 MB/s eta 0:00:01
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp36-cp36m-macosx_10_9_x86_64.whl (294 kB)
[K     |████████████████████████████████| 294 kB 19.8 MB/s eta 0:00:01
[?25hCollecting joblib
  Downloading joblib-1.1.1-py2.py3-none-any.whl (309 kB)
[K     |████████████████████████████████| 309 kB 22.2 MB/s eta 0:00:01
[?25hCollecting click
  Downloading click-8.0.4-py3-none-any.whl (97 kB)
[K     |██�███████████████████████████▎ | 92 kB 37.8 MB/s eta 0:00:01�█████████████████████████████| 97 kB 13.4 MB/s 
[?25hCollecting importlib-metadata
  Using cached importlib_metadata-4.8.3-py3-none-any.whl (17 kB)
Collecting zipp>=0.5
  Using cached zipp-3.6.0-py3-none-any.whl (5.3 kB)
Collecting impo

In [69]:
import string

import nltk
from nltk.stem import SnowballStemmer

translator = str.maketrans('', '', string.punctuation)

def filter_txt(s):
    translator = str.maketrans('','',string.punctuation)
    decode_s = s.encode('ascii','ignore').decode('ascii').replace('\n',' ').translate(translator).lower()
    word_list = decode_s.split(' ')
    sn = SnowballStemmer('english')
    for w in word_list:
        if (w.isalpha()) or ('http' not in w):
            stem_w = sn.stem(w)
            if stem_w not in res:
                res[stem_w] = 1
            else:
                res[stem_w] += 1
    return list


printable = set(string.printable)
tokenized_data = df_clean.select('id','selftext').rdd.map(lambda p: 
                                                          (p[0], 
                                                           p[1].encode('ascii','ignore').decode('ascii').replace('\n', ' ').translate(translator).lower()))

tokenized_data.take(5)

[('pfi1nw',
  '    ampx200b  ampx200b  thursday 8pm utc prometheus will launch the nft titans will be combined to the most infamous people in the world the first one who will be zeus in the battle against others is elon musk  otherwise called elon zuskus  ampx200b  lets see what he brings to the battlefield  ampx200b  ampx200b   httpstmeprometheusbschttpstmeprometheusbsc  ampx200b  ampx200b  ampx200b  ampx200b  ampx200b        ampx200b   never been done before crosschain nft project with cake rewards   nft giveaways to the community custom and existing   nft game with custom prometheus nfts in development   real tech development spxcharts and crosschain technology   devs and marketing of previousexisting huge projects  doxxed devs  ampx200b  the community has a voice in bascially ever move the team makes  ampx200b  ampx200b  ampx200b        ampx200b             amp     amp          ampx200b  ampx200b  ampx200b        ampx200b             big funds available prior to launch  ampx200b  a

In [70]:
e = "val car rental valrentalcomhttpsvalrentalcom is a car rental familyowned company in cancun and puerto vallarta recently launched our very own token called valrentalclub val on the binance smart chain bep20 with the plan of making it the backend of our rewards system  how will it work  by launching val we are giving real value to our rewards system val car rental will be adding to the liquidity 1 of its earnings this will represent a constant increase in the tokens value  what other advantage do i have by holding val  valrentalclub is a lottery token 9 of all purchases go into our lottery wallet for a prize of drums    2way flights for 2 to cancun   6 nights at a 5 star allinclusive hotel in the riviera maya   car rental for 7 days   2 day passes to xcaretcenotes   spending money  when will val car rental start adding to liquidity  val car rental will be adding to the liquidity as soon as the rewards system starts working or as soon as the valrentalclub liquidity wallet starts entering the lottery giving our clients the chance to win the trip to the riviera maya  what are the tokenomics for val   1000000000 total supply   9 lottery   3 marketing   3 is lp lockedhttpsdxsaleappappv29dxlockviewid1ampadd0x56d0ce3d1c32d4be796f9f494ab61a6741aa321famptypelplockampchainbsc   2 to holders   15 development   15 vr investments  why that tax distribution  9 lottery tax consider a purchase of the token as a lottery ticket the difference is that as long as you have it in your possession you will enter all lotteries and when you want to sell it you can still get money back not like every other lottery ticket that only works once  3 marketing tax everyone knows that every company needs marketing and so do tokens especially with all those fake tokens out there we need people to know that theres a real and legit token in the market  3 liquidity pool tax the more money goes into our liquidity pool the more our ground level price increases ensuring holders will be benefited  2 reflection tax we really wanted to benefit all our holders not only by entering all lotteries but giving back to them  15 development tax we are working very hard to get this project rolling this includes the creation of our rewards system telegram and chat supports servers upgrades and much more which means hundreds of manhours  15 val rental investment tax being a real company we have created our token not only to benefit our customers and investors but to grow we will be using this tax to expand the more we expandgrow the more money will go back into our token  is there an antiwhale program  yes there is we want to prevent big dumps in our token and thats why we implemented the following anti whale program   anti whale only 10000000 val tokens considered for the lotteries per wallet and no reflection for whale wallets with over 15  what this means is that if they have a wallet with over 10000000 val tokens only 10000000 of those tokens will be considered for the lottery and if they have over 15000000 val tokens they will not receive reflection tokens  join our community   httpstmevaltokenhttpstmevaltoken    httpstwittercomvaltokenhttpstwittercomvaltoken    valrentalclubhttpsvalrentalclub    valrentalcomhttpsvalrentalcom"
filter_txt(e)

NameError: name 'res' is not defined

In [71]:
all_words = tokenized_data.flatMap(lambda p: p[1].split(' ')) \
                        .map(lambda w: (w, 1)) \
                        .reduceByKey(lambda x, y: x + y) \
                        .map(lambda p: p[0]).collect()
all_words = sorted(all_words)

In [72]:
all_words

['',
 '\t',
 '\t\t',
 '\t\t\t\t\t',
 '\t\t\t\t\t\t',
 '\t\t10000',
 '\t\t100m',
 '\t\t1m',
 '\t\t25000',
 '\t\t50000',
 '\t\tapplication',
 '\t\tastromouse',
 '\t\tbeta',
 '\t\tcex',
 '\t\tcheese',
 '\t\tcoding',
 '\t\tcontract',
 '\t\tcrosschain',
 '\t\tdesign',
 '\t\tdeveloper',
 '\t\tdex',
 '\t\tfarming',
 '\t\tfurther',
 '\t\tgrant',
 '\t\thuge',
 '\t\tinfluencer',
 '\t\tinitial',
 '\t\tlaunch',
 '\t\tlaunching',
 '\t\tmarketing',
 '\t\tmarketplace',
 '\t\tnft',
 '\t\tpaid',
 '\t\tpayment',
 '\t\tplanning',
 '\t\tpublic',
 '\t\tsoft',
 '\t\tsubmission',
 '\t\ttopsecret',
 '\t\twebsite',
 '\t\tweekly',
 '\t\twill',
 '\t\twoof',
 '\t\twoofkwoof',
 '\t1',
 '\t10',
 '\t100',
 '\t10000',
 '\t15',
 '\t15k',
 '\t17',
 '\t2',
 '\t20',
 '\t3',
 '\t320',
 '\t4',
 '\t40',
 '\t45',
 '\t5',
 '\t5655',
 '\t6',
 '\t65',
 '\t7',
 '\t8',
 '\t9',
 '\tamonggoat',
 '\tannoucement',
 '\tanother',
 '\tapple',
 '\tapply',
 '\taustraliahttpswwwatogovaugeneralgentaxtreatmentofcryptocurrenciesinaustraliaspe

In [44]:
from pyspark.ml.linalg import Vectors

df_data = df_posts.rdd.map(lambda x: (Vectors.dense([x["score"]]), 1.0))

In [45]:
df_data.take(10)

[(DenseVector([3.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([3.0]), 1.0),
 (DenseVector([11.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0)]

In [46]:
df_kdata = spark.createDataFrame(df_data, ["features", "weighCol"])

In [47]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=7)
kmeans.setSeed(1)
kmeans.setWeightCol("weighCol")
kmeans.setMaxIter(50)

model = kmeans.fit(df_kdata)
model.setPredictionCol("newPrediction")
model.predict(df_kdata.head().features)
transformed = model.transform(df_kdata).select("features", "newPrediction").cache()
transformed.take(100)

[Row(features=DenseVector([3.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([3.0]), newPrediction=0),
 Row(features=DenseVector([11.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([30.0]), newPrediction=0),
 Row(features=DenseVector([5.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([864.0]), newPrediction=5),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([9.0]), newPrediction=0),
 Row(fea

In [48]:
df_tmp = df_posts.rdd.map(lambda x: (x["subreddit.name"],x["score"]))
df_tmp.take(100)

[('cryptomoonshots', 3),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 3),
 ('cryptomoonshots', 11),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 30),
 ('cryptomoonshots', 5),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 864),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 9),
 ('cryptomoonshots', 3),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 727),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 710),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 3026),
 ('cryptomoonshots', 10),
 ('cryptomoonshots', 835),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 0),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomoonshots', 1),
 ('cryptomo

In [49]:
from pyspark.ml.linalg import Vectors

df_data = df_posts.rdd.map( lambda x: (Vectors.dense(x['score']),1.0 ) )

In [50]:
df_data.take(10)

[(DenseVector([3.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([3.0]), 1.0),
 (DenseVector([11.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([1.0]), 1.0)]