In [9]:
import pyspark, pickle
from pyspark import SparkContext
from pyspark.sql.functions import countDistinct, regexp_replace, monotonically_increasing_id, lit
from pyspark.storagelevel import StorageLevel
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

In [10]:
# Reuse the active SparkSession if there is one, otherwise start a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)
# Handle on the session's underlying SparkContext.
sc = spark.sparkContext

# Load tweet, pagerank, and community data, then join them

In [3]:
# Read the tweet data and mark it for caching (memory first, disk as overflow);
# persist() returns the same DataFrame, so it can be chained onto the read.
tweets = spark.read.parquet('tweets_all.parquet').persist(StorageLevel.MEMORY_AND_DISK)
# Run an action so the data is actually loaded; the row count is the cell output.
tweets.count()

1185967

In [4]:
# Read the community data and mark it for caching (memory first, disk as overflow);
# persist() returns the same DataFrame, so it can be chained onto the read.
coms = spark.read.parquet('communities.parquet').persist(StorageLevel.MEMORY_AND_DISK)
# Run an action so the data is actually loaded; the row count is the cell output.
coms.count()

480277

In [5]:
# Read the pagerank data from parquet.
pr = spark.read.parquet('pageranks.parquet')
# Expose it to Spark SQL under the name `pr`. createOrReplaceTempView is the
# supported replacement for registerTempTable, which is deprecated since Spark 2.0.
pr.createOrReplaceTempView('pr')

In [6]:
# Attach community data to the tweets; the inner join keeps only rows whose
# screen_name is present in both `tweets` and `coms`.
tweets = tweets.join(coms, 'screen_name', 'inner')
# Expose the joined data to Spark SQL as `tweets`. createOrReplaceTempView is the
# supported replacement for registerTempTable, which is deprecated since Spark 2.0.
# NOTE(review): the view is registered before the join with `pr` in the next
# cell, so SQL queries against `tweets` will not see any columns contributed
# by `pr` -- confirm this is intended.
tweets.createOrReplaceTempView('tweets')
# NOTE(review): the join above is lazy, so dropping the `coms` cache here means
# later actions on `tweets` presumably re-read communities.parquet -- confirm
# this is acceptable.
coms.unpersist();

In [7]:
# Left join on screen_name: every tweet row is kept, and rows with no match in
# `pr` get nulls in the columns coming from `pr`.
tweets = tweets.join(pr, 'screen_name', 'left')

# See who MrLukeyLuke communicated with

In [22]:
# Show the text and timestamp of up to 10 tweets posted by MrLukeyLuke.
(
    tweets
    .filter('screen_name = "MrLukeyLuke"')
    .select('text', 'created_at')
    .take(10)
)

[Row(text="@AdrianHarrop Fellow conservatives (mainly US) concern me on climate change like sjws's do on evolutionary psychology.", created_at='Mon Mar 06 22:20:06 +0000 2017'),
 Row(text='@ErikSolheim @Jack_P_95 Even if climate change is wrong, we ought to take action anyway. Rather be prepared and wrong than vice versa.', created_at='Tue Mar 14 15:46:42 +0000 2017'),
 Row(text="Bill Nye is a science educator, to be fair he's never claimed to be a scientist. I agree with Nye that anthropogenic global warming is real.", created_at='Wed Mar 01 19:44:22 +0000 2017'),
 Row(text='@AdrianHarrop Climate change worries me. I would go for carbon trade, nuclear power and offer tax breaks for renewable energy.', created_at='Mon Mar 06 20:55:26 +0000 2017'),
 Row(text='@swinny198 I agree with that, judging by your profile I think stuff like climate change we might disagree on.', created_at='Fri Mar 10 22:18:27 +0000 2017'),
 Row(text="@AdrianHarrop I was barely taught climate change at school. So

In [12]:
# Look up the accounts that appear in Luke's tweets above, with their community,
# pagerank and retweet target.
# Author's observation: Luke mostly communicated with other members of
# community 3, the climate change deniers.
(
    tweets
    .filter('screen_name in ("ErikSolheim", "AdrianHarrop", "swinny198", "JSlate__", "Jack_P_95")')
    .select('screen_name', 'name', 'community', 'pagerank', 'retweeted_screen_name')
    .show()
)

+------------+----------------+---------+--------+---------------------+
| screen_name|            name|community|pagerank|retweeted_screen_name|
+------------+----------------+---------+--------+---------------------+
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
|   swinny198|           Sam S|        3|    null|          1markconway|
|   swinny198|           Sam S|        3|    null|      ClimateRealists|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null|                 null|
| ErikSolheim|    Erik Solheim|       10|    null| 