In [5]:
import os

# Hand the bundled GraphFrames assembly jar to PySpark through the submit
# arguments. The variable is assigned before the SparkSession below is built.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars lib/graphframes-release-0-5-0-assembly-0.5.0-spark2.1.jar pyspark-shell'

from pyspark.sql import SparkSession
from lib.graphframes import *
from pyspark.sql.functions import col, size, lit, collect_list

# Obtain the session for this notebook under the application name "RelaTree".
spark = (
    SparkSession.builder
    .appName("RelaTree")
    .getOrCreate()
)

# group_members_data = spark.read.csv("data/group_members.csv", header="true").drop("update_time", "is_admin")
# group_channel_data = spark.read.csv("data/group_channel.csv", header="true")

In [6]:
# Vertex DataFrame: one (id, name, age) row per person
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
v = spark.createDataFrame(vertex_rows, schema=["id", "name", "age"])

# Edge DataFrame: one (src, dst, relationship) row per directed edge
edge_rows = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
]
e = spark.createDataFrame(edge_rows, schema=["src", "dst", "relationship"])

# Create a GraphFrame from the two tables and print vertices, then edges
g = GraphFrame(v, e)
for frame in (g.vertices, g.edges):
    frame.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [6]:
# Persist the graph's vertex and edge tables as parquet.
# mode='overwrite' replaces whatever a previous run left behind; with the
# default save mode the write raises when the target path already exists,
# so this cell could only ever be executed once per checkout.
g.vertices.write.parquet('store/gv.parquet', mode='overwrite')
g.edges.write.parquet('store/ge.parquet', mode='overwrite')

# Read both tables back from disk and rebuild the graph from the stored copies.
agv = spark.read.parquet('store/gv.parquet')
age = spark.read.parquet('store/ge.parquet')

# Show the reloaded graph so it can be compared with the in-memory one above.
ag = GraphFrame(agv, age)
ag.vertices.show()
ag.edges.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  c|Charlie| 30|
|  e| Esther| 32|
|  a|  Alice| 34|
|  d|  David| 29|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
|  b|    Bob| 36|
+---+-------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [18]:
import pandas as pd

# Load the group-membership CSV into pandas and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='data/group_members.csv')
data.head(n=5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,create_time,update_time,group_id,user_id,is_admin,added_by
0,2016-06-25 12:12:58.776567,2016-06-25 12:12:58.776567,c8a5ca5947cc7d9251b808b7aad0e75fac381823,fcaa201f972dfc42809e83de776fd36aa1577761,f,
1,2018-03-26 18:30:02.531757,2018-03-26 18:30:02.531757,d2c8410bb78af46155aaa96b50b082598ca69306,26cf4a8044866c1bca5f54dea9632180ac37e6d6,f,
2,2017-12-18 10:23:26.487894,2017-12-18 10:23:26.487894,2c07f080d9f041295761260edf28275a872e65e2,570a2ddfdccc2b7754dfc1052923217a8e5fd2df,f,
3,2018-02-08 16:58:31.057972,2018-02-08 16:58:31.057972,1430c9033779d7585789131f4d287743c9163e87,df8d3021822ef6fed29011bc254c5d82d99222dd,f,
4,2018-03-26 18:30:02.531757,2018-03-26 18:30:02.531757,d2c8410bb78af46155aaa96b50b082598ca69306,777862a1db383bcd7ea66d9f80424f98a7c9ef9f,f,


In [20]:
# Count how many membership rows each group_id has.
data.loc[:, 'group_id'].value_counts()

245e6a1ec240325b7f45dcde02397032cc437328    328
8d6224b3c39fe19d099e5971d0880afe58b2128a    314
d844058c0e39332000fbdc09c44143fbc747cb2a    310
03d9e87c037548b7d57ba779a6be2aa2e52f9b81    307
42d44d92b4fdf974945f80f24bb8dae6ef1576dd    306
3123dfc22b65f25f130908a4bae441059d44b877    305
b30b84ca1fd134b50391f77aef3aecead032448c    300
116854cc62a1076a7e4e4c350e33e82a9d7c0af6    298
9a08c348b64b0219619a7b25a4f5daf4b1109ef9    296
f06b507b5bfb551146eb05f6569eba549c736f1b    290
17167e5bc70ad7cad465b2a35a32bf897766ecaf    287
e7b932eba610b29960fcc9d92430266c2cae2c66    286
638486a29c7681626325c084c5651d9ac05a33bb    286
9e9ade2e42acfdce5cc9594e1e6bff8fcb4d6f97    284
278ebd55f0d450e64f593a7c3e7938fc9c335e42    284
f425bba48f3d2bd32de53b13f23eb93ef789b3ea    283
d2a6b24e6d91ff03d41fd359267c53c9fea05879    281
f436c7333ba9d18722bb0c3d8d717dc12bb5f4af    281
22673e95c2c7a6625124f088a7133a62b6cebf5c    281
0103d399a4d343cce43eb1b737413170c72633ee    279
4890ad9e355747c5c9dcb35e23645979a7186a46

In [None]:
# Group id looked up in the channel table further down. Bound as a string
# constant: as a bare token it is parsed as an identifier and raises NameError.
target_group_id = 'c141ef4d309b251fff0d46dc9e70a8f37f8d83dd'

In [21]:
# Load the group -> channel CSV into pandas.
cdata = pd.read_csv(filepath_or_buffer='data/group_channel.csv')

In [22]:
# Preview the first five rows of the channel table.
cdata.head(n=5)

Unnamed: 0,group_id,channel_id
0,768808012f4a55a3c4b2ff469dd8231cd935ebac,football
1,c2de90421fafd6c9b14deaaf2e30ec72797cb2b4,football
2,c42dd8dcc1ba6469e5590ee7563438f1cf83a14d,promo
3,baccbd59003ffd22d77b4f11a91c581ffa4ee075,basketball
4,3aa92e503fbdd905c19d2f2c99474b79307139bc,olympics


In [26]:
import numpy as np

# Positional indices of the rows whose group_id equals the group being inspected.
np.where(cdata['group_id'].eq('c141ef4d309b251fff0d46dc9e70a8f37f8d83dd'))

(array([6897257, 6899035], dtype=int64),)

In [29]:
# Show every column of the row at position 6899035 (one of the positions
# printed by the np.where lookup above).
cdata.iloc[6899035]

group_id      c141ef4d309b251fff0d46dc9e70a8f37f8d83dd
channel_id                                     cricket
Name: 6899035, dtype: object

In [None]:
# Import libraries 
import graphframe as GF
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, size, lit, collect_list
import operator






def graph_recommender(graph, channels, k):
    """Rank the out-neighbours of the given channels by accumulated weight.

    For every channel in ``channels``, the edges leaving the vertex whose id
    equals that channel are collected. Each destination vertex is credited
    with ``edge weight * channel weight``, and credits for the same
    destination are summed across channels.

    Args:
        graph: Object exposing ``find(motif)`` whose result supports
            ``filter``, ``select`` and ``collect`` (presumably a GraphFrame
            whose edges carry a ``weight`` column -- confirm against caller).
        channels: Mapping of channel id (string) to weight; any object that
            provides ``.items()`` yielding ``(id, weight)`` pairs.
        k: Maximum number of recommendations to return. ``None`` returns the
            full ranking; values below zero are treated as zero.

    Returns:
        A list of ``(vertex_id, score)`` tuples sorted by score, highest
        first, containing at most ``k`` entries.
    """
    scores = {}
    # The motif is identical for every channel, so build it once up front.
    out_edges = graph.find("(v1)-[e]->(v2)")
    # .items() rather than .iteritems(): the latter does not exist on
    # Python 3 dicts, while .items() is available on Python 2 and 3 alike.
    for channel, weight in channels.items():
        # NOTE(review): the id is spliced into the filter expression, so an
        # id containing a single quote would produce a broken expression.
        neighbours = (
            out_edges
            .filter("v1.id == '" + channel + "'")
            .select("v2.id", "e.weight")
            .collect()
        )
        for row in neighbours:
            scores[row.id] = scores.get(row.id, 0) + (row.weight * weight)
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    # Honour the requested cut-off; previously k was accepted but ignored.
    if k is None:
        return ranked
    return ranked[:max(k, 0)]





