In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the pre-built article subset; the cells below read its 'Title' and
# 'Links' columns.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df_subset = pd.read_pickle("../data/df_subset.pkl")

In [6]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections.

    Both inputs are converted to sets (duplicates are ignored) and the score
    is ``|A ∩ B| / |A ∪ B|``, a float in ``[0.0, 1.0]``.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The two collections to compare.

    Returns
    -------
    float
        The Jaccard similarity. Two empty collections are treated as
        identical and score ``1.0`` (instead of dividing by zero).

    Raises
    ------
    TypeError
        If either input is not an iterable of hashable items (e.g. ``None``
        or ``NaN`` for a missing value).
    """
    try:
        s1 = set(list1)
        s2 = set(list2)
    except TypeError as exc:
        # Fail loudly with context: a print() would be lost on a Spark worker,
        # and falling through would only surface as an unrelated
        # UnboundLocalError on the return line.
        raise TypeError(
            "jaccard_similarity expects two iterables of hashable items, got "
            f"{type(list1).__name__} and {type(list2).__name__}"
        ) from exc

    union = s1 | s2
    if not union:
        # Both collections are empty: define J(∅, ∅) = 1 so that every
        # collection is maximally similar to itself.
        return 1.0
    return len(s1 & s2) / len(union)

In [7]:
# Keep only the title and outgoing-link columns, ordered alphabetically by
# title, with a fresh 0..n-1 index. `reset_index(drop=True)` discards the old
# index directly, so this also works when the source index has a name (the
# previous reset-then-drop-'index' form raised KeyError in that case).
df_links = (
    df_subset[['Title', 'Links']]
    .sort_values(by=['Title'])
    .reset_index(drop=True)
)

## Spark

In [8]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *

# Local Spark master using all available cores; adjust the memory limits
# below to match the machine this notebook runs on.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set('spark.executor.memory', '12g')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '2G')
)

# Session (created once, reused on re-run) and its underlying context.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Spark 2.x workaround: force the JVM default locale to en-US.
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

In [9]:
# Hand the pandas frame (Title, Links) over to Spark as a Spark DataFrame.
spark_df = spark.createDataFrame(df_links)

In [10]:
# Pair every article with every article (itself included): the right-hand
# copy has its columns renamed to Title2 / Links2 so the names do not clash.
#
# Sort on BOTH titles. The collected scores are later reshaped into an
# (n, n) matrix, which is only correct if rows arrive in row-major
# (Title, Title2) order; sorting on Title alone leaves the order of Title2
# within each Title group unspecified.
# NOTE(review): this still assumes titles are unique — confirm.
spark_df_links_crosses = (
    spark_df
    .crossJoin(
        spark_df
        .withColumnRenamed('Title', 'Title2')
        .withColumnRenamed('Links', 'Links2')
    )
    .sort(asc('Title'), asc('Title2'))
    .persist()
)
spark_df_links_crosses.show()

+-----------------+--------------------+--------------------+--------------------+
|            Title|               Links|              Title2|              Links2|
+-----------------+--------------------+--------------------+--------------------+
|2,5-Dimethylfuran|[2,5-Dimethylfura...|   2,5-Dimethylfuran|[2,5-Dimethylfura...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|       2-Methylfuran|[2-Methylfuran (d...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|2007–08 world foo...|[2000s commoditie...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|ASEAN Wildlife En...|[ASEAN, ASEAN Cen...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|   Accuracy in Media|[2012 Benghazi at...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|Active fire prote...|[Access control, ...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|Adrian Berry, 4th...|[Adrian Douglas B...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|Aerated static pi...|[Aeration, Anaero...|
|2,5-Dimethylfuran|[2,5-Dimethylfura...|Aerobic methane p...|[Anaerobic respir...|
|2,5

In [11]:
# Score each (article, article) pair on the workers, then pull the flat list
# of similarities back to the driver in row order.
jaccard = (
    spark_df_links_crosses.rdd
    .map(lambda pair: jaccard_similarity(pair.Links, pair.Links2))
    .collect()
)

In [12]:
# Fold the flat score list into a square matrix with one row and one column
# per article in df_links.
jaccard = np.array(jaccard).reshape((len(df_links),) * 2)

In [13]:
# Sanity check: the matrix should be square, (len(df_links), len(df_links)).
jaccard.shape

(937, 937)

In [14]:
# Attach each article's row of the similarity matrix as a list in a new
# 'Jaccard' column (one list of len(df_links) scores per article).
df_links['Jaccard'] = jaccard.tolist()

In [15]:
# Persist titles, links and similarity rows.
# NOTE(review): CSV stores the list-valued columns as their string form, so a
# reader has to parse them back into lists — confirm downstream handles this.
df_links.to_csv("../data/df_links_sim.csv")