In [1]:
import ast
import json

from pyspark.sql import Row
from pyspark.sql.functions import monotonically_increasing_id

In [37]:
import pandas as pd

In [2]:
# Grab the SparkContext from the session object. `spark` is not created in
# the visible cells, so it is presumably injected by the notebook kernel.
sc = spark.sparkContext

### Read text files

In [3]:
# Glob covering every raw file stored under the 2017 prefix of the bucket.
raw_rsvp_glob = "s3a://meetupstream/2017/*/*/*/*"
lines = sc.textFile(raw_rsvp_glob)

In [4]:
# Wrap every record in a list by splitting on newlines; the next step reads
# element 0 of that list.
# NOTE(review): textFile presumably already yields one record per line, so
# this split likely always returns a one-element list -- confirm.
parts = lines.map(lambda record: record.split("\n"))

In [5]:
# Decode each record in two steps: json.loads unwraps the JSON-encoded line,
# and the resulting text is then parsed as a Python literal and stored in a
# single-field Row named `rsvp`.
# ast.literal_eval is used instead of eval(): it accepts only literals
# (dicts, lists, strings, numbers, None, booleans), so text read from S3
# cannot execute arbitrary code on the workers.
json_rdd = parts.map(lambda l: Row(rsvp=ast.literal_eval(json.loads(l[0]))))

In [6]:
# Build a DataFrame from the RDD of Row(rsvp=...) objects. No schema is
# passed, so Spark infers it from the data.
data = spark.createDataFrame(json_rdd)

In [7]:
# Show the schema that was inferred for the parsed records.
data.printSchema()

root
 |-- rsvp: map (nullable = true)
 |    |-- key: string
 |    |-- value: map (valueContainsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [8]:
# Count the rows of the full, unfiltered DataFrame.
data.count()

1224415

### Filter to Data Science groups only

In [9]:
# Keep only records whose group_topics text matches the SQL LIKE pattern
# '%ata%cience%' (e.g. "Data Science" / "data-science").
group_topics_col = data['rsvp']['group']['group_topics']
DS_data = data.where(group_topics_col.like('%ata%cience%'))

In [10]:
# Attach a generated `row_id` column in front of the `rsvp` payload so each
# filtered record can be referenced by id in the tables built later.
row_id_col = monotonically_increasing_id().alias('row_id')
DS_data = DS_data.select(row_id_col, DS_data['rsvp'])

In [11]:
# Peek at the first filtered row to inspect the nested rsvp structure.
DS_data.head(1)

[Row(row_id=0, rsvp={u'group': {u'group_city': u'Oslo', u'group_lat': u'59.91', u'group_urlname': u'Oslo-Data-Science', u'group_name': u'Oslo Data Science Meetup', u'group_lon': u'10.75', u'group_topics': u'[{urlkey=data-science, topic_name=Data Science}, {urlkey=machine-learning, topic_name=Machine Learning}, {urlkey=data-analysis-and-modeling, topic_name=Data Analysis and Modeling}, {urlkey=statistical-computing, topic_name=Statistical Computing}, {urlkey=data-analytics, topic_name=Data Analytics}, {urlkey=data-visualization, topic_name=Data Visualization}, {urlkey=predictive-analytics, topic_name=Predictive Analytics}, {urlkey=applied-statistics, topic_name=Applied Statistics}, {urlkey=ai, topic_name=Artificial Intelligence}]', u'group_id': u'18698284', u'group_country': u'no'}, u'rsvp_id': None, u'venue': {u'lat': u'59.92345', u'venue_id': u'17454962', u'lon': u'10.73179', u'venue_name': u'Teknologihuset'}, u'visibility': None, u'event': {u'event_name': u'Creating and testing recom

In [12]:
# Count the rows that remain after the topic filter.
DS_data.count()

30802

### Create normalized tables

In [48]:
# Venue: preview the first two rows of the venue projection before building
# the de-duplicated table.
# NOTE(review): in the sample row printed by DS_data.head(1), 'visibility'
# sits at the top level of rsvp rather than inside the venue map, so
# venue_visibility may always be null -- confirm this lookup is intended.
venue_map = DS_data['rsvp']['venue']
DS_data.select(
    venue_map['venue_id'].alias('venue_id'),
    venue_map['lat'].alias('venue_lat'),
    venue_map['lon'].alias('venue_lon'),
    venue_map['venue_name'].alias('venue_name'),
    venue_map['visibility'].alias('venue_visibility'),
).head(2)

[Row(venue_id=u'17454962', venue_lat=u'59.92345', venue_lon=u'10.73179', venue_name=u'Teknologihuset', venue_visibility=None),
 Row(venue_id=u'24578702', venue_lat=u'33.84705', venue_lon=u'-84.357201', venue_name=u'RentPath, Inc.', venue_visibility=None)]

In [49]:
# Venue table: one row per distinct combination of venue attributes.
# NOTE(review): the preview output above shows venue_visibility=None, and the
# sample row from DS_data.head(1) has 'visibility' at the top level of rsvp
# rather than inside the venue map -- confirm this lookup is intended.
venue_map = DS_data['rsvp']['venue']
venue_columns = [
    venue_map['venue_id'].alias('venue_id'),
    venue_map['lat'].alias('venue_lat'),
    venue_map['lon'].alias('venue_lon'),
    venue_map['venue_name'].alias('venue_name'),
    venue_map['visibility'].alias('venue_visibility'),
]
venue_DF = DS_data.select(*venue_columns).distinct()

In [54]:
# Event table: one row per distinct combination of event attributes.
event_map = DS_data['rsvp']['event']
event_columns = [
    event_map['event_id'].alias('event_id'),
    event_map['event_name'].alias('event_name'),
    event_map['event_url'].alias('event_url'),
    event_map['time'].alias('event_time'),
]
event_DF = DS_data.select(*event_columns).distinct()

In [55]:
# Member table: one row per distinct combination of member attributes.
member_map = DS_data['rsvp']['member']
member_columns = [
    member_map['member_id'].alias('member_id'),
    member_map['member_name'].alias('member_name'),
    member_map['photo'].alias('member_photo'),
]
member_DF = DS_data.select(*member_columns).distinct()

In [56]:
# Group table: one row per distinct combination of group attributes.
# The longitude key in the raw records is 'group_lon' (see the sample row
# printed by DS_data.head(1)); the previous lookup of 'group_long' matched no
# key, so the column was always null. The output column keeps the name
# 'group_long' so the exported table layout does not change.
group_DF = DS_data.select(DS_data['rsvp']['group']['group_id'].alias('group_id'),
                          DS_data['rsvp']['group']['group_country'].alias('group_country'),
                          DS_data['rsvp']['group']['group_city'].alias('group_city'),
                          DS_data['rsvp']['group']['group_lat'].alias('group_lat'),
                          DS_data['rsvp']['group']['group_lon'].alias('group_long'),
                          DS_data['rsvp']['group']['group_urlname'].alias('group_urlname')).distinct()

In [57]:
# RSVP table: one row per filtered record, carrying the generated row_id, the
# ids that link to the venue/event/member/group tables, and the RSVP's own
# fields. No de-duplication is applied here.
rsvp_map = DS_data['rsvp']
rsvp_columns = [
    DS_data['row_id'].alias('row_id'),
    rsvp_map['venue']['venue_id'].alias('venue_id'),
    rsvp_map['event']['event_id'].alias('event_id'),
    rsvp_map['member']['member_id'].alias('member_id'),
    rsvp_map['group']['group_id'].alias('group_id'),
    rsvp_map['rsvp_id'].alias('rsvp_id'),
    rsvp_map['guests'].alias('guests'),
    rsvp_map['mtime'].alias('mtime'),
    rsvp_map['response'].alias('response'),
]
rsvp_DF = DS_data.select(*rsvp_columns)

### Export to S3

In [None]:
# Write each table to its own Parquet directory on S3, in the same order as
# before. No save mode is set, so Spark's default applies.
parquet_targets = [
    (venue_DF, 's3a://meetupstream/csv_df/venue_Df/'),
    (group_DF, 's3a://meetupstream/csv_df/group_DF/'),
    (event_DF, 's3a://meetupstream/csv_df/event_DF/'),
    (member_DF, 's3a://meetupstream/csv_df/member_DF/'),
    (rsvp_DF, 's3a://meetupstream/csv_df/rsvp_DF/'),
]
for table_df, target_path in parquet_targets:
    table_df.write.parquet(target_path)


### Write tables to local CSV files

In [68]:
# Collect the venue table into a pandas DataFrame, then write it to a local
# UTF-8 CSV file.
venue_pdf = venue_DF.toPandas()
venue_pdf.to_csv('venue.csv', encoding='utf-8')

In [60]:
# Collect the event table into a pandas DataFrame, then write it to a local
# UTF-8 CSV file.
event_pdf = event_DF.toPandas()
event_pdf.to_csv('event.csv', encoding='utf-8')

In [69]:
# Collect the member table into a pandas DataFrame, then write it to a local
# UTF-8 CSV file.
member_pdf = member_DF.toPandas()
member_pdf.to_csv('member.csv', encoding='utf-8')

In [63]:
# Collect the group table into a pandas DataFrame, then write it to a local
# UTF-8 CSV file.
group_pdf = group_DF.toPandas()
group_pdf.to_csv('group.csv', encoding='utf-8')

In [64]:
# Collect the RSVP table into a pandas DataFrame, then write it to a local
# UTF-8 CSV file.
rsvp_pdf = rsvp_DF.toPandas()
rsvp_pdf.to_csv('rsvp.csv', encoding='utf-8')