In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import format_number, mean, min, max, corr, stddev
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format, asc, desc)
from pyspark.sql.functions import explode, col, element_at, size, split
from pyspark.sql.functions import udf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#https://www.data4v.com/tutorial-hierarchical-clustering-in-spark-with-bisecting-k-means/

In [2]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.clustering import KMeans, BisectingKMeans


In [3]:
# Build (or reuse) the Spark session for this notebook: app name 'bisectingK',
# master 'local[*]', Arrow-backed pandas conversion, UTC session time zone,
# 8g of driver memory, console progress bars and eager DataFrame display.
spark = (
    SparkSession.builder
    .appName('bisectingK')
    .master('local[*]')
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    .config('spark.sql.session.timeZone', 'UTC')
    .config('spark.driver.memory','8g')
    .config('spark.ui.showConsoleProgress', True)
    .config('spark.sql.repl.eagerEval.enabled', True)
    .getOrCreate()
)

In [4]:
# Dummy table created by the AL_read_papers notebook.
# Original data is described as the first 30000 papers from 2010 with 50 assigned keywords.
# NOTE(review): the row count printed in the next cell is 3000, not 30000 -- confirm which figure is right.
papers_path = "../data/processed/papers_2010/"
paps = spark.read.json(papers_path)

In [5]:
from pyspark.sql.functions import monotonically_increasing_id

# Attach a generated "id" column to every paper so that later abstract-level
# analysis can be attributed back to the paper it came from.
papersWIDs = paps.withColumn("id", monotonically_increasing_id())

# Removing NaNs: drop every row that has a missing value.
papersWIDs_woNA = papersWIDs.na.drop()

# Keep only rows where the "None" field equals 0
# (presumably papers that received at least one keyword -- TODO confirm).
papersWIDs_woNA_woNone = papersWIDs_woNA.where("None == 0")

# Row counts after each stage: raw, without NaNs, without "None" papers.
for stage_frame in (papersWIDs, papersWIDs_woNA, papersWIDs_woNA_woNone):
    print(stage_frame.count())

3000
2996
2310


In [109]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [276]:
# Columns that are excluded from the clustering input.
ignore = ["title", "id", "num_refs"]
inputColumns = [name for name in papersWIDs_woNA.columns if name not in ignore]

# Pull the selected columns into a pandas DataFrame for scikit-learn.
# NOTE(review): this starts from papersWIDs_woNA, so the "None == 0" filter
# computed earlier is not applied here -- confirm that is intended.
inputData = papersWIDs_woNA.select(inputColumns).toPandas()
# Scaling is currently disabled:
#inputData = MinMaxScaler().fit_transform(inputData)

In [277]:
# Coarse DBSCAN grid search: for every (eps, min_samples) combination record
# how many points are labelled as noise and how many clusters were found.
# Results are gathered in a list and turned into a DataFrame once, instead of
# re-concatenating a growing DataFrame on every iteration.
dbscan_records = []
for eps in [0.1, 0.2, 0.4, 0.8, 1.0, 2.0, 4.0, 8.0, 10, 20, 40, 80]:
    print(".", end="")  # one progress dot per eps value
    for min_samples in range(2, 40):
        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(inputData)
        labels = clustering.labels_
        # DBSCAN marks noise points with the label -1.
        n_outliers = int((labels < 0).sum())
        dbscan_records.append({
            "eps": eps,
            "min_samples": min_samples,
            "outliers": n_outliers,
            # Only discount the noise label when it actually occurs; an
            # unconditional "- 1" undercounts clusters when there is no noise.
            "clusters": len(set(labels)) - (1 if n_outliers else 0),
        })

dfDBScan = pd.DataFrame(dbscan_records)
dfDBScan

............

Unnamed: 0,eps,min_samples,outliers,clusters
0,0.1,2,2462,127
0,0.1,3,2592,62
0,0.1,4,2670,36
0,0.1,5,2702,28
0,0.1,6,2732,22
...,...,...,...,...
0,80.0,35,19,1
0,80.0,36,20,1
0,80.0,37,20,1
0,80.0,38,23,1


In [279]:
# Fine DBSCAN grid search for small eps values (0.01 steps below 0.1); the
# results are appended to the existing dfDBScan table.
# NOTE: re-running this cell appends the same rows again, because it extends
# dfDBScan in place.
fine_records = []
for eps in np.arange(0.01, 0.1, 0.01):
    print(".", end="")  # one progress dot per eps value
    for min_samples in range(2, 40):
        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(inputData)
        labels = clustering.labels_
        # DBSCAN marks noise points with the label -1.
        n_outliers = int((labels < 0).sum())
        fine_records.append({
            "eps": float(eps),
            "min_samples": min_samples,
            "outliers": n_outliers,
            # Only discount the noise label when it actually occurs; an
            # unconditional "- 1" undercounts clusters when there is no noise.
            "clusters": len(set(labels)) - (1 if n_outliers else 0),
        })

# Single concatenation instead of one per iteration.
dfDBScan = pd.concat([dfDBScan, pd.DataFrame(fine_records)], ignore_index=True)

.........

In [211]:
# import seaborn as sns

# sns.scatterplot(x="clusters", y="outliers", size="min_samples", hue = "eps", data = dfDBScan)

In [290]:
# Single DBSCAN run with hand-picked parameters; the cell displays a summary
# dict with the parameters, the number of noise points and the cluster count.
eps, min_samples = 10.5, 3
clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(inputData)
labels = clustering.labels_
# DBSCAN marks noise points with the label -1.
n_outliers = int((labels < 0).sum())
{
    "eps": eps,
    "min_samples": min_samples,
    "outliers": n_outliers,
    # Only discount the noise label when it actually occurs; an unconditional
    # "- 1" undercounts clusters when the run produces no noise.
    "clusters": len(set(labels)) - (1 if n_outliers else 0),
}

{'eps': 10.5, 'min_samples': 3, 'outliers': 52, 'clusters': 11}

In [291]:
# Number of points per DBSCAN label (-1 is noise), largest group first.
(
    pd.DataFrame({"L": labels})
    .groupby("L")["L"]
    .count()
    .sort_values(ascending=False)
)

L
 0     2871
-1       52
 4       22
 3       15
 2        7
 1        6
 7        5
 8        5
 5        4
 6        3
 9        3
 10       3
Name: L, dtype: int64

In [302]:
# Grid-search runs that produced 5 to 10 clusters with fewer than 400 noise
# points, counted per cluster number.
candidate_runs = dfDBScan.query("5<=clusters & clusters<=10 & outliers<400")
candidate_runs.groupby("clusters")["clusters"].count()

clusters
5    3
6    1
7    1
8    2
Name: clusters, dtype: int64