In [1]:
import pandas as pd
import numpy as np

In [2]:
df_subset = pd.read_pickle("../data/df_subset.pkl")

In [3]:
def jaccard_similarity(list1, list2):
    try:
        s1 = set(list1)
        s2 = set(list2)
    except:
        print(list1)
        print(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

In [4]:
df_links = df_subset[['Title', 'Links']]

## Straightforward

In [None]:
jaccard_sim = np.empty((len(df_links), len(df_links)))
for i, row1 in enumerate(df_links.itertuples()):
    for j, row2 in enumerate(df_links.itertuples()):
        jaccard_sim[i, j] = jaccard_similarity(row1.Links, row2.Links)

## Spark

In [5]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *

conf = pyspark.SparkConf().setMaster("local[*]").setAll([
                                   ('spark.executor.memory', '12g'),  # find
                                   ('spark.driver.memory','4g'), # your
                                   ('spark.driver.maxResultSize', '2G') # setup
                                  ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext

# FIX for Spark 2.x
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

In [6]:
spark_df = spark.createDataFrame(df_links)

In [7]:
spark_df_links_crosses = spark_df.crossJoin(spark_df\
                                            .withColumnRenamed('Title', 'Title2')
                                            .withColumnRenamed('Links', 'Links2'))\
                                 .persist()
spark_df_links_crosses.show()

+--------------------+--------------------+--------------------+--------------------+
|               Title|               Links|              Title2|              Links2|
+--------------------+--------------------+--------------------+--------------------+
|Climate change de...|[2010 Russian wil...|Climate change de...|[2010 Russian wil...|
|Climate change de...|[2010 Russian wil...|Khabibullo Abduss...|[Astrophysics, Bi...|
|Climate change de...|[2010 Russian wil...|   Accuracy in Media|[2012 Benghazi at...|
|Climate change de...|[2010 Russian wil...|     Robert Aderholt|[105th United Sta...|
|Climate change de...|[2010 Russian wil...|          Jerry Agar|[CFRB, Chicago, D...|
|Climate change de...|[2010 Russian wil...|Alexis de Tocquev...|[501(c)(3), ACT! ...|
|Climate change de...|[2010 Russian wil...|Alternative for G...|[2013 Bavaria sta...|
|Climate change de...|[2010 Russian wil...|American Farm Bur...|[AFL–CIO, Agribus...|
|Climate change de...|[2010 Russian wil...|American Pe

In [8]:
jaccard = spark_df_links_crosses.rdd.map(lambda x: jaccard_similarity(x.Links, x.Links2)).collect()

In [11]:
jaccard = np.array(jaccard).reshape(len(df_links), len(df_links))

In [12]:
jaccard.shape

(937, 937)

In [14]:
df_links['Jaccard'] = jaccard.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
df_links.to_csv("../data/df_jaccard_sim.csv")