# Trend Comparison

This notebook analyzes similarity in trends between pairs of topics.

In [9]:
import sys
import os
import pyspark
import math
from operator import add

## Load Dataset

In [17]:
# data = load_data()
dataRDD = sc.parallelize([['topic1', [0, 0], 1995, 1, 1],
                          ['topic1', [0, 0], 1995, 1, 3],
                          ['topic1', [0, 0], 1996, 1, 3],
                          ['topic2', [0, 0], 1995, 1, 3]
                         ])

for line in dataRDD.collect():
    print 'Topic: {}. Tag: {},{}. Year: {}. Page count: {}. Volume count: {}'.format(
        line[0], line[1][0], line[1][1], line[2], line[3], line[4])

Topic: topic1. Tag: 0,0. Year: 1995. Page count: 1. Volume count: 1
Topic: topic1. Tag: 0,0. Year: 1995. Page count: 1. Volume count: 3
Topic: topic1. Tag: 0,0. Year: 1996. Page count: 1. Volume count: 3
Topic: topic2. Tag: 0,0. Year: 1995. Page count: 1. Volume count: 3


## Volume Frequency over Years

In [23]:
def volume_freq(dataRDD):
    topic_yr_to_nvol = (dataRDD
                        .map(lambda x: ((x[0], x[2]), x[4]))
                        .reduceByKey(add)
                       )
    return topic_yr_to_nvol

volFreqRDD = volume_freq(dataRDD)
print volFreqRDD.collect()

[(('topic1', 1995), 4), (('topic2', 1995), 3), (('topic1', 1996), 3)]


In [33]:
def volume_freq_trend(dataRDD):
    topic_to_yr_vol = (dataRDD
                       .map(lambda x: (x[0][0], {x[0][1]: x[1]}))
                       .groupByKey()                       
                      )
    return topic_to_yr_vol

volTrendRDD = volume_freq_trend(volFreqRDD)
print volTrendRDD.collect()

[('topic1', <pyspark.resultiterable.ResultIterable object at 0xb0ecf36c>), ('topic2', <pyspark.resultiterable.ResultIterable object at 0xb0ecf38c>)]


## Similarity in Volume Frequency between Topic Pairs

In [None]:
def distance_l1(x, y):
    """
    Input: x, y - dictionary
    Output: L1-distance between x, y
    """
    dist = 0
    keys = set(x).union(set(y))
    for k in keys:
        v1 = 0 if k not in x else x[k]
        v2 = 0 if k not in y else y[k]
        dist += abs(v1 - v2)
    return dist * 1.0 / len(keys)

def distance_l2(x, y):
    """
    Input: x, y - dictionary
    Output: L2-distance between x, y
    """
    dist = 0
    keys = set(x).union(set(y))
    for k in keys:
        v1 = 0 if k not in x else x[k]
        v2 = 0 if k not in y else y[k]
        dist += (v1 - v2) * (v1 - v2)
    return math.sqrt(dist * 1.0 / len(keys))

def dot_product(x, y):
    res = 0
    for k in x:
        if k in y:
            res += x[k] * y[k]
    return res
        

def cosine_similarity(x, y):
    """
    Input: x, y - dictionary
    Output: Cosine similarity between x, y
    """
    norm_x = math.sqrt(dot_product(x, x))
    norm_y = math.sqrt(dot_product(y, y))
    return dot_product(x, y) * 1.0 / norm_x / norm_y

In [None]:
def pair_distance_l1(dataRDD):
    pass

## Visualization