In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import py2neo
from py2neo import Graph
from dask.delayed import delayed
import dask.dataframe as dd
import gzip

In [7]:
# Open a py2neo handle to the Neo4j database, passing 'neo4j' as the username.
# NOTE(review): the first argument looks like a filesystem path rather than a
# bolt:// or http:// URI — confirm that py2neo resolves it to the intended server.
graph = Graph('event_db/import/installation-3.4.6', username= 'neo4j')

In [110]:
# DESTRUCTIVE: removes every node and relationship from the connected database,
# presumably so that the node-creation cells further down start from an empty graph.
graph.delete_all()

In [9]:
# Load the raw user table; it is cleaned in the next cells and later used to
# create the User nodes.
user_df = pd.read_csv('data/users.csv')

In [15]:
# Drop the joinedAt and timezone columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the columns are gone no longer raises KeyError.
user_df = user_df.drop(columns=['joinedAt', 'timezone'], errors='ignore')

In [41]:
# Keep only the rows that have a value in every remaining column.
user_df = user_df.dropna(axis=0, how='any')

In [111]:
# Export the cleaned users. The DataFrame index is written too, as the first,
# unnamed column of the file.
user_df.to_csv('users1.csv', index=True)

In [117]:
# Preview the first three rows of the cleaned user table.
user_df.head(3)

Unnamed: 0,user_id,locale,birthyear,gender,location
0,3197468391,id_ID,1993,male,Medan Indonesia
1,3537982273,id_ID,1992,male,Medan Indonesia
2,823183725,en_US,1975,male,Stratford Ontario


In [45]:
# data/user_friends.csv is gzip-compressed despite its .csv extension, so the
# compression is stated explicitly (pandas can only infer it from the file suffix).
user_friends_df = pd.read_csv('data/user_friends.csv', compression='gzip')

In [122]:
# Sanity check: (rows, columns) of the friends table.
user_friends_df.shape

(38202, 2)

In [121]:
# Inspect the raw 'friends' value stored on the row labelled 1.
user_friends_df.loc[1, 'friends']

'1491560444 395798035 2036380346 899375619 3534826887 3427911581 494959696 115160877 746794312 325618967 3583230607 2094337035 198048438 186874661 3283846039 2128147184 989205343 1697243456 283376872 590194637 449541620 3790353461 1066306271 2440877046 503896214 2587298010 1982758269 537263304 927472977 1927052839 1132903380 86285457 4036431152 1791447010 18612222 3151888497 1971360914 406305331 2087892244 842576019 134129040 3796792653 2574528347 1587209129 561932198 1922857397 1571185593 1414184012 159091864 1042052517 2288947952 1600924328 2695632233 3041966597 279762713 647604441 602353163 2818828362 3112065941 2488279876 3809565975 2766572099 2183474049 3368683780 3230597814 2007685957 3697792584 1241128725 3910047856 3917095806 105763062 3974068830 1965219420 1191778350 2075045652 3411069781 697513845 694484811 3750367050 1272486943 452723286 658519423 166363290 801516287 44410701 3287381039 975121372 1737143301 319597365 849210525 664174560 2545854189 1340246139 2543481943 35011

In [112]:
# Export the friends table. The DataFrame index is written too, as the first,
# unnamed column of the file.
user_friends_df.to_csv('user_friends1.csv', index=True)

In [48]:
# data/events.csv is gzip-compressed despite its .csv extension, so the
# compression is stated explicitly (pandas can only infer it from the file suffix).
events_df = pd.read_csv('data/events.csv', compression='gzip')

In [56]:
# Remove the zip column, then discard every row that still has a missing value.
# Rebinding with errors='ignore' (rather than inplace=True) keeps the cell
# re-runnable: a second execution no longer raises KeyError for the absent column.
events_df = events_df.drop(columns=['zip'], errors='ignore').dropna()

In [63]:
# Restrict the events table to its identifier, user and location columns.
event_columns = [
    'event_id',
    'user_id',
    'city',
    'state',
    'country',
    'lat',
    'lng',
]
events_df = events_df[event_columns]

In [113]:
# Export the cleaned events. The DataFrame index is written too, as the first,
# unnamed column of the file.
events_df.to_csv('events1.csv', index=True)

In [118]:
# Preview the first three rows of the cleaned events table.
events_df.head(3)

Unnamed: 0,event_id,user_id,city,state,country,lat,lng
40,2587616435,3053278014,Sihanoukville,Kampot,Cambodia,10.633,103.5
51,1145166049,2509951454,Palo Alto,CA,United States,37.442,-122.172
156,3580637647,1501747205,Los Angeles,CA,United States,3.156,101.612


In [70]:
# data/event_attendees.csv is gzip-compressed despite its .csv extension, so the
# compression is stated explicitly (pandas can only infer it from the file suffix).
event_attendees_df = pd.read_csv('data/event_attendees.csv', compression='gzip')

In [73]:
# Sanity check: (rows, columns) of the attendees table as loaded, before the
# rows with missing values are dropped in the next cell.
event_attendees_df.shape

(24144, 5)

In [74]:
# Keep only the rows that have a value in every column.
event_attendees_df = event_attendees_df.dropna(axis=0, how='any')

In [114]:
# Export the cleaned attendees table. The DataFrame index is written too, as
# the first, unnamed column of the file.
event_attendees_df.to_csv('event_attendees1.csv', index=True)

In [116]:
# Preview the first three rows of the cleaned attendees table.
event_attendees_df.head(3)

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232
6,488116622,4145960786 2550625355 2577667841 1575121941 28...,1227223575 2789471603 1323321680 3086272918 38...,1413359297 2300232602 1412759254 617751520 286...,1498160155 3708150269 823488244 3595018395 173...


In [None]:
###### Create the User nodes, each one identified by its user_id #####
# The uniqueness constraint gets a transaction of its own: Neo4j refuses to mix
# schema changes and data writes inside a single transaction.
tx = graph.begin()
tx.run('CREATE CONSTRAINT ON (u:User) ASSERT u.user_id IS UNIQUE')
tx.commit()

# MERGE matches on the key only (creating the node when it does not exist yet)
# and SET then records the location. Merging on user_id AND location would try
# to create a second node for a known user_id carrying a different location,
# which the uniqueness constraint rejects.
# USING PERIODIC COMMIT was removed because Cypher only accepts it in front of
# LOAD CSV; batches are committed explicitly below instead.
merge_users = '''UNWIND $rows AS row
MERGE (u:User {user_id: row.user_id})
SET u.location = row.location'''

# numpy scalars are converted to plain Python values because the driver is not
# guaranteed to serialise them.
# NOTE(review): assumes user_id holds integer ids — confirm against users.csv.
user_rows = [
    {"user_id": int(user_id), "location": location}
    for user_id, location in zip(user_df['user_id'], user_df['location'])
]

# One transaction per batch keeps each commit small and replaces the
# one-round-trip-per-row loop.
USER_BATCH_SIZE = 10000
for start in range(0, len(user_rows), USER_BATCH_SIZE):
    tx = graph.begin()
    tx.run(merge_users, {"rows": user_rows[start:start + USER_BATCH_SIZE]})
    tx.commit()

In [None]:
###### Create the Event nodes, each one identified by its event_id #####
# The constraint targets event_id (the previous version constrained e.user_id
# and then re-merged User nodes from user_df, so no Event node was ever created).
# It gets a transaction of its own: Neo4j refuses to mix schema changes and
# data writes inside a single transaction.
tx = graph.begin()
tx.run('CREATE CONSTRAINT ON (e:Event) ASSERT e.event_id IS UNIQUE')
tx.commit()

# MERGE matches on the key only (creating the node when it does not exist yet)
# and SET then records the location attributes.
# USING PERIODIC COMMIT was removed because Cypher only accepts it in front of
# LOAD CSV; batches are committed explicitly below instead.
# NOTE(review): storing city/state/country/lat/lng on the node is an assumption
# about the intended schema; the events' user_id column is deliberately left
# out here — confirm whether it should become a property or a relationship.
merge_events = '''UNWIND $rows AS row
MERGE (e:Event {event_id: row.event_id})
SET e.city = row.city,
    e.state = row.state,
    e.country = row.country,
    e.lat = row.lat,
    e.lng = row.lng'''

# numpy scalars are converted to plain Python values because the driver is not
# guaranteed to serialise them.
# NOTE(review): assumes event_id holds integer ids and lat/lng are numeric —
# confirm against events.csv.
event_rows = [
    {
        "event_id": int(event_id),
        "city": city,
        "state": state,
        "country": country,
        "lat": float(lat),
        "lng": float(lng),
    }
    for event_id, city, state, country, lat, lng in zip(
        events_df['event_id'],
        events_df['city'],
        events_df['state'],
        events_df['country'],
        events_df['lat'],
        events_df['lng'],
    )
]

# One transaction per batch keeps each commit small.
EVENT_BATCH_SIZE = 10000
for start in range(0, len(event_rows), EVENT_BATCH_SIZE):
    tx = graph.begin()
    tx.run(merge_events, {"rows": event_rows[start:start + EVENT_BATCH_SIZE]})
    tx.commit()