# Load data

In [13]:
# Load the Amazon edge list as a Spark DataFrame.
# The file is tab-delimited with a header row; column types are inferred from the data.
# The chain is wrapped in parentheses so each call can sit on its own line without
# backslash continuations.
amazon = (
    sqlContext.read.format('csv')
    .option("inferSchema", True)
    .option("header", True)
    .option("delimiter", "\t")
    .load("com-amazon.ungraph.txt")
)

In [116]:
# Preview the first five rows as a pandas DataFrame to check the parsed columns.
amazon.limit(5).toPandas()

Unnamed: 0,source,target
0,1,88160
1,1,118052
2,1,161555
3,1,244916
4,1,346495


# Generate Vertex and Edge table

In [37]:
# The edge table starts out as the raw edge list itself (columns 'source' and 'target').
e = amazon

In [16]:
# Vertex table: every distinct node id that appears on either end of an edge.
# union() is positional and keeps the first frame's column name, so the single
# resulting column is still called 'source'.
v = (
    amazon.select('source')
    .union(amazon.select('target'))
    .distinct()
)

## Rename columns to the names GraphFrame expects (`id`, `src`, `dst`)

In [39]:
# Edge endpoints are renamed to 'src' and 'dst', one column per statement.
e = e.withColumnRenamed('source', 'src')
e = e.withColumnRenamed('target', 'dst')

In [None]:
# The vertex table's single column is still named 'source'; rename it to 'id',
# the vertex key column used by the graph built from this table.
# NOTE(review): this cell has no execution count (In [None]) in the saved notebook,
# yet the graph construction depends on it; make sure it runs on a fresh kernel.
v = v.withColumnRenamed('source','id')

## Use GraphFrame to manipulate data 

In [40]:
# Import only the class that is used so it is clear where the name comes from.
from graphframes import GraphFrame

# Build the graph from the vertex table (key column 'id') and the edge table
# (endpoint columns 'src' and 'dst').
g = GraphFrame(v, e)

## Create In Degree Table 

In [102]:
# One row per vertex with its in-degree, sorted from highest to lowest.
InDegreeTable = (
    g.inDegrees
    .orderBy("inDegree", ascending=False)
)

In [104]:
# Export the in-degree table as CSV (with a header row) so that Gephi can read it.
# Notes:
#   * Spark writes a directory called 'InDegreeTable.csv'; coalesce(1) collapses the
#     data to one partition so that the directory holds a single part file.
#   * The built-in 'csv' source is used, matching the reader that loaded the data.
#   * mode('overwrite') replaces a previous export, so re-running the notebook does
#     not fail because the output path already exists.
(
    InDegreeTable.coalesce(1)
    .write.format('csv')
    .mode('overwrite')
    .save('InDegreeTable.csv', header='true')
)

### Find the most frequent co-occurring product

In [63]:
# Keep only the top row of the in-degree ranking: the vertex with the highest in-degree.
most_popular_1 = (
    g.inDegrees
    .orderBy("inDegree", ascending=False)
    .limit(1)
)

In [101]:
# Print the single top-ranked (id, inDegree) row.
most_popular_1.show()

+------+--------+
|    id|inDegree|
+------+--------+
|548091|     571|
+------+--------+



### Find co-occurring products of the most frequent product

In [76]:
# Read the id of the top-ranked vertex from the ranking computed above, so the
# value follows the data and is compared using the column's own type.
most_popular_id = most_popular_1.first()['id']

# One-hop neighbourhood: every edge that touches that vertex on either end.
new_data_1 = e.filter((e.src == most_popular_id) | (e.dst == most_popular_id))

In [91]:
# Bring the one-hop edges to the driver once, then split them into the source ids
# (list1) and the destination ids (list2). Collecting a single time avoids running
# the same Spark job twice.
one_hop_rows = new_data_1.collect()
list1 = [int(row.src) for row in one_hop_rows]
list2 = [int(row.dst) for row in one_hop_rows]

In [93]:
# An edge is kept when either endpoint is one of the ids gathered from the
# one-hop edges (list1 holds their sources, list2 their destinations).
src_in_one_hop = e.src.isin(list1) | e.src.isin(list2)
dst_in_one_hop = e.dst.isin(list1) | e.dst.isin(list2)
new_data_2 = e.filter(src_in_one_hop | dst_in_one_hop)

In [90]:
# Export the edges as CSV with a header row.
# Spark writes a directory called 'new_data_2.csv'; coalesce(1) collapses the data to
# one partition so that the directory holds a single part file. The built-in 'csv'
# source matches the reader that loaded the data, and mode('overwrite') lets the
# notebook be re-run without failing on an existing output path.
(
    new_data_2.coalesce(1)
    .write.format('csv')
    .mode('overwrite')
    .save('new_data_2.csv', header='true')
)

### Find co-occurring products of those co-occurring products (second hop)

In [96]:
# Bring the new_data_2 edges to the driver once, then split them into the source
# ids (list3) and the destination ids (list4). Collecting a single time avoids
# running the same Spark job twice.
two_hop_rows = new_data_2.collect()
list3 = [int(row.src) for row in two_hop_rows]
list4 = [int(row.dst) for row in two_hop_rows]

In [98]:
# An edge is kept when either endpoint is one of the ids gathered from the
# new_data_2 edges (list3 holds their sources, list4 their destinations).
# Each endpoint is tested against BOTH lists, mirroring the filter that built
# new_data_2 from list1/list2.
src_in_two_hop = e.src.isin(list3) | e.src.isin(list4)
dst_in_two_hop = e.dst.isin(list3) | e.dst.isin(list4)
new_data_3 = e.filter(src_in_two_hop | dst_in_two_hop)

In [99]:
# Restore the original 'source' / 'target' column names, one column per statement.
# NOTE(review): presumably these are the headers wanted in the exported file; confirm.
new_data_3 = new_data_3.withColumnRenamed('src', 'source')
new_data_3 = new_data_3.withColumnRenamed('dst', 'target')

In [100]:
# Export the edges as CSV with a header row.
# Spark writes a directory called 'new_data_3.csv'; coalesce(1) collapses the data to
# one partition so that the directory holds a single part file. The built-in 'csv'
# source matches the reader that loaded the data, and mode('overwrite') lets the
# notebook be re-run without failing on an existing output path.
(
    new_data_3.coalesce(1)
    .write.format('csv')
    .mode('overwrite')
    .save('new_data_3.csv', header='true')
)