# Load libraries and data

In [1]:
import json

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
df_subset = pd.read_pickle("../data/df_subset.pkl")

In [3]:
df_subset.head()

Unnamed: 0,Title,Path,Links,Text,Category,Text_processed
0,Climate change denial,"[Global_warming, Climate change, Climate chang...","[2010 Russian wildfires, 2015 United Nations C...","Climate change denial, or global warming denia...",Climate change denial,"climate change denial, or global warming denia..."
1,Khabibullo Abdussamatov,"[Global_warming, Climate change, Climate chang...","[Astrophysics, Bibcode, Cambridge University P...",Habibullo Ismailovich Abdussamatov (Russian: Х...,Climate change denial,habibullo ismailovich abdussamatov (russian: х...
2,Accuracy in Media,"[Global_warming, Climate change, Climate chang...","[2012 Benghazi attack, Advocacy journalism, Ag...",Accuracy in Media (AIM) is an American non-pro...,Climate change denial,accuracy in media (aim) is an american non-pro...
3,Robert Aderholt,"[Global_warming, Climate change, Climate chang...","[105th United States Congress, 106th United St...","Robert Brown Aderholt (born July 22, 1965) is ...",Climate change denial,"robert brown aderholt (born july 22, 1965) is ..."
4,Jerry Agar,"[Global_warming, Climate change, Climate chang...","[CFRB, Chicago, Disc jockey, G. Gordon Liddy, ...",Jerry Agar is a conservative talk radio person...,Climate change denial,jerry agar is a conservative talk radio person...


In [4]:
def jaccard_similarity(list1, list2):
    try:
        s1 = set(list1)
        s2 = set(list2)
    except:
        print(list1)
        print(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

In [5]:
df_path = df_subset[['Title', 'Path']].sort_values(by=['Title']).reset_index().drop(columns='index')

In [6]:
df_path.head()

Unnamed: 0,Title,Path
0,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c..."
1,2-Methylfuran,"[Global_warming, Climate change, Politics of c..."
2,2007–08 world food price crisis,"[Global_warming, Climate change, Politics of c..."
3,ASEAN Wildlife Enforcement Network,"[Global_warming, Climate change, Politics of c..."
4,Accuracy in Media,"[Global_warming, Climate change, Climate chang..."


In [7]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *

conf = pyspark.SparkConf().setMaster("local[*]").setAll([
                                   ('spark.executor.memory', '12g'),  # find
                                   ('spark.driver.memory','4g'), # your
                                   ('spark.driver.maxResultSize', '2G') # setup
                                  ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext

# FIX for Spark 2.x
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

In [8]:
spark_df = spark.createDataFrame(df_path)

In [9]:
spark_df_path_crosses = spark_df.crossJoin(spark_df\
                                            .withColumnRenamed('Title', 'Title2')
                                            .withColumnRenamed('Path', 'Path2'))\
                                 .sort(asc('Title'))\
                                 .persist()
spark_df_path_crosses.show()

+-----------------+--------------------+--------------------+--------------------+
|            Title|                Path|              Title2|               Path2|
+-----------------+--------------------+--------------------+--------------------+
|2,5-Dimethylfuran|[Global_warming, ...|   2,5-Dimethylfuran|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|       2-Methylfuran|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|2007–08 world foo...|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|ASEAN Wildlife En...|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|   Accuracy in Media|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|Active fire prote...|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|Adrian Berry, 4th...|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|Aerated static pi...|[Global_warming, ...|
|2,5-Dimethylfuran|[Global_warming, ...|Aerobic methane p...|[Global_warming, ...|
|2,5

In [10]:
jaccard = spark_df_path_crosses.rdd.map(lambda x: jaccard_similarity(x.Path, x.Path2)).collect()

In [11]:
jaccard = np.array(jaccard).reshape(len(df_path), len(df_path))
jaccard.shape

(937, 937)

In [12]:
df_path['Jaccard'] = jaccard.tolist()

In [13]:
df_path.head()

Unnamed: 0,Title,Path,Jaccard
0,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
1,2-Methylfuran,"[Global_warming, Climate change, Politics of c...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
2,2007–08 world food price crisis,"[Global_warming, Climate change, Politics of c...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
3,ASEAN Wildlife Enforcement Network,"[Global_warming, Climate change, Politics of c...","[0.7142857142857143, 0.7142857142857143, 0.714..."
4,Accuracy in Media,"[Global_warming, Climate change, Climate chang...","[0.14285714285714285, 0.14285714285714285, 0.1..."


In [14]:
df_path.to_csv("../data/df_category_sim.csv")

# Test

In [89]:
df_test = spark_df_path_crosses.toPandas()

In [91]:
df_test

Unnamed: 0,Title,Path,Title2,Path2
0,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c...","2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c..."
1,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c...",2-Methylfuran,"[Global_warming, Climate change, Politics of c..."
2,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c...",2007–08 world food price crisis,"[Global_warming, Climate change, Politics of c..."
3,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c...",ASEAN Wildlife Enforcement Network,"[Global_warming, Climate change, Politics of c..."
4,"2,5-Dimethylfuran","[Global_warming, Climate change, Politics of c...",Accuracy in Media,"[Global_warming, Climate change, Climate chang..."
...,...,...,...,...
877964,Zion Lights,"[Global_warming, Climate change, Climate chang...",Yulia Latynina,"[Global_warming, Climate change, Climate chang..."
877965,Zion Lights,"[Global_warming, Climate change, Climate chang...",ZEGG (community),"[Global_warming, Climate change, Politics of c..."
877966,Zion Lights,"[Global_warming, Climate change, Climate chang...",Zeolite,"[Global_warming, Climate change, Politics of c..."
877967,Zion Lights,"[Global_warming, Climate change, Climate chang...",Zero waste,"[Global_warming, Climate change, Politics of c..."


In [92]:
jaccard_test = df_test.apply(lambda x: jaccard_similarity(x[1], x[3]), axis=1)

In [93]:
df_test['Jaccard'] = jaccard_test

In [94]:
df_test[df_test['Title'] == 'Climate change denial'].iloc[0]['Jaccard']

110954    0.142857
110955    0.142857
110956    0.142857
110957    0.142857
110958    1.000000
            ...   
161827    1.000000
161828    0.142857
161829    0.142857
161830    0.153846
161831    0.333333
Name: Jaccard, Length: 937, dtype: float64

In [43]:
test = df_test.groupby('Title')[['Path', 'Jaccard']].agg(list).reset_index()

In [59]:
test[test['Title'] == 'Climate change denial'].iloc[0]['Jaccard']

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0