<a href="https://colab.research.google.com/github/CoryTee/JaccardDistanceTweets/blob/master/notebooks/jaccard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
import pandas as pd
import numpy as np

def jaccard_dist(setA, setB):
  """
  Computes the Jaccard Distance of two sample sets (A and B) which measures 
  dissimilarity between them. It is defined as the difference of the sizes of 
  the union and the intersection of two sets divided by the size of the union 
  of the sets.
  
  JD(a, b) = 1 - (|a_intersect_b|/|a_union_b|)
  
  How to interpret the result:
    -Small if the tweets are similar
    -Large if the tweets are not similar
    -0 if the tweets have the same words (not counting duplicates or ordering)
    -1 if they are completely different (i.e. no overlapping words)
    
  http://en.wikipedia.org/wiki/Jaccard_index
  """
  
  a_union_b = len(setA.union(setB))
  a_intersect_b = len(setA.intersection(setB))
  
  return  1.0 - (a_intersect_b / a_union_b)
 
  
# Testing values and code
setA = set('the long march'.split())
setB = set('ides of march'.split())
test_jaccard_dist = 0.8 

j_dist = jaccard_dist(setA, setB)

if  j_dist == test_jaccard_dist:
  print("It works")
else:
  print("You broke something")
  print(str(j_dist))
  
  
  

It works


In [91]:
import pandas as pd
import numpy as np

# Tweet data location in Google Colab
data_url = 'file://localhost/content/Tweets.json'
tweets = pd.read_json(data_url, lines=True, orient='records')
num_rows = tweets.shape[0]
half_rows = int(num_rows/2)

# Drop all columns except for the tweet text to save memory
tweets = tweets[['text']]

# Split tweet text data using spaces and save to new 'list' column before set
# conversion then apply the set_from_list function to the new list 'column'
tweets['list'] = tweets['text'].str.split()

# Convert the lists of words for tweets into sets
to_set = lambda x: set(x)
tweets['set'] = tweets['list'].apply(to_set)
  
                   
# Remove list column to free up memory
tweets = tweets.drop(['list'], axis=1)  

# Empty 2d NxN numpy array to hold Jaccard Distances
jaccard_values = np.zeros((num_rows, num_rows))

# Calculate Jaccard Distances between each tweeet
# Stop after the middle row, remaining values will be copied since they have
# already been calculated
for i in range(0, half_rows):
  for j in range(0, num_rows):
    if i != j:
      setA = tweets.loc[[i],['set']]['set'][i]
      setB = tweets.loc[[j],['set']]['set'][j]
      
      jaccard_values[i, j] = jaccard_dist(setA, setB)
    else:
      jaccard_values[i, i] = 0

      
# Copy calculated values to the rest of the rows  
for i in range(half_rows, num_rows):
  for j in range(0, num_rows):
    if i != j:
      jaccard_values[i, j] = jaccard_values[j, i]
    else:
      jaccard_values[i, i] = 0
      
        
      
print(jaccard_values)

[[0.         0.17391304 0.17391304 ... 0.97368421 0.97368421 0.96875   ]
 [0.17391304 0.         0.         ... 0.97368421 0.97368421 0.96875   ]
 [0.17391304 0.         0.         ... 0.97368421 0.97368421 0.96875   ]
 ...
 [0.97368421 0.97368421 0.97368421 ... 0.         0.         0.        ]
 [0.97368421 0.97368421 0.97368421 ... 0.         0.         0.        ]
 [0.96875    0.96875    0.96875    ... 0.         0.         0.        ]]
