# Heading 

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from IPython.core.display import display, HTML
import pandas as pd
%reload_ext autoreload
%autoreload 1
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth',100)    

display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from itertools import combinations, takewhile
import collections

from simhash import Simhash, SimhashIndex

sns.set()

print(sys.version)

3.6.8 |Anaconda custom (64-bit)| (default, Dec 30 2018, 01:22:34) 
[GCC 7.3.0]


In [3]:
print(spark.version)

2.4.0-cdh6.1.0


In [4]:
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# from pyspark.ml.feature import OneHotEncoderEstimator
# OneHotEncoderEstimator is available starting from Spark 2.3
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
!hdfs dfs -ls -h '/user/ivy2/Tweets/' > '/home/sriharis/git_projects/BigDataEngg/final_project/file_list.txt'
tweets_path = '/user/ivy2/Tweets/'



### Some helper functions

In [6]:
def add_item_to_list(arr, val, unique=False):
    if val in arr:
        if not unique:
            return arr.extend([val])
    if val not in arr:
        return arr.extend([val])

In [7]:
all_files = []

def read_all_lines(fname):
    with open(fname) as f:
        content = f.readlines()
        for line in content:
            start_loc = line.find('/user/ivy2/Tweets/')
            if start_loc < 0:
                continue
            all_files.append(line[start_loc:].strip())
    
read_all_lines('./file_list.txt')    

In [8]:
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

#### Placeholder_variables

In [9]:
all_hashtags = {
    "uchicago": [],
    "upenn": []
}

uc_favored_tags = ["uchicago", "uchearing", "universityofchicago", "pritzkerschoolofmedicine",
                   "uofc", "maroonmade", "chicagobooth"]

upenn_favored_tags = ["upenn", "penn", "uofpenn", "universityofpennsylvania", "pennlaw"]

nw_favored_tags = [""]

ucla_favored_tags = ["uic"]

In [10]:
fields_to_keep = ["id_str", 
                  "text",
                  "in_reply_to_status_id_str",
                  "in_reply_to_user_id_str", 
                  "created_at",
                  # User columns
                  "user.id_str",
                  "user.name",
                  "user.followers_count",
                  "user.favourites_count",
                  "user.statuses_count",
                  "user.friends_count",
                  # Other attributes
                  "coordinates",
                  "favorite_count",
                  "entities.hashtags",
                  "favorited", 
                  "place.country",
                  "place.country_code",
                  "place.name",
                  "place.place_type",
                  # Retweet columns
                  "retweet_count", 
                  "retweeted",
                  "retweeted_status.user.id_str",
                  "retweeted_status.user.name"
                 ]

In [11]:
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

## Get all the relevant hashtags and populate a dataframe 

In [12]:
tweets_df = None

counter = 1300

fixed_col_names = [
    "id_str", 
    "text",
    "in_reply_to_status_id_str",
    "in_reply_to_user_id_str", 
    "created_at",
    # User columns
    "user_id_str",
    "user_name",
    "user_followers_count",
    "user_favourites_count",
    "user_statuses_count",
    "user_friends_count",
    # Other attributes
    "coordinates",
    "favorite_count",
    "entities_hashtags",
    "favorited", 
    "place_country",
    "place_country_code",
    "place_name",
    "place_place_type",
    # Retweet columns
    "retweet_count", 
    "retweeted",
    "retweeted_status_user_id_str",
    "retweeted_status_user_name"
]


for file in all_files:
    print(file)
    df = spark.read.json('hdfs://'+file)
    tmp_df = df.select(fields_to_keep).toPandas()
    tmp_df.columns = fixed_col_names
    
    def clean_hashtags(row):
        if row is None:
            return []
        if row is np.NaN:
            return []
        if (len(row) == 0):
            return []
        tags = []
        for item in row:
            tags.append(item.text)
        return tags
    
    tmp_df["hashtags_cleaned"] = tmp_df["entities_hashtags"].apply(clean_hashtags)
    
    # Add a date column by parsing the file name
    s = '/tweets'
    l = file.find(s)
    timestamp = file[l + len(s):-5]
    tmp_df["scraped_timestamp"] = timestamp
    
    if tweets_df is None:
        tweets_df = pd.DataFrame(columns=tmp_df.columns)
        
    tweets_df = tweets_df.append(tmp_df, ignore_index=True)

    counter -= 1
    if counter <= 0:
        break

/user/ivy2/Tweets/tweets201706221015.json
/user/ivy2/Tweets/tweets201706221115.json
/user/ivy2/Tweets/tweets201706221215.json
/user/ivy2/Tweets/tweets201706221315.json
/user/ivy2/Tweets/tweets201706221415.json
/user/ivy2/Tweets/tweets201706221515.json
/user/ivy2/Tweets/tweets201706221615.json
/user/ivy2/Tweets/tweets201706221715.json
/user/ivy2/Tweets/tweets201706221815.json
/user/ivy2/Tweets/tweets201706221915.json
/user/ivy2/Tweets/tweets201706222015.json
/user/ivy2/Tweets/tweets201706222115.json
/user/ivy2/Tweets/tweets201706222215.json
/user/ivy2/Tweets/tweets201706222315.json
/user/ivy2/Tweets/tweets201706230015.json
/user/ivy2/Tweets/tweets201706230115.json
/user/ivy2/Tweets/tweets201706230215.json
/user/ivy2/Tweets/tweets201706230315.json
/user/ivy2/Tweets/tweets201706230415.json
/user/ivy2/Tweets/tweets201706230515.json
/user/ivy2/Tweets/tweets201706230615.json
/user/ivy2/Tweets/tweets201706230715.json
/user/ivy2/Tweets/tweets201706230815.json
/user/ivy2/Tweets/tweets2017062309

/user/ivy2/Tweets/tweets201706301615.json
/user/ivy2/Tweets/tweets201706301715.json
/user/ivy2/Tweets/tweets201706301815.json
/user/ivy2/Tweets/tweets201706301915.json
/user/ivy2/Tweets/tweets201706302015.json
/user/ivy2/Tweets/tweets201706302115.json
/user/ivy2/Tweets/tweets201706302215.json
/user/ivy2/Tweets/tweets201706302315.json
/user/ivy2/Tweets/tweets201707010015.json
/user/ivy2/Tweets/tweets201707010115.json
/user/ivy2/Tweets/tweets201707010215.json
/user/ivy2/Tweets/tweets201707010315.json
/user/ivy2/Tweets/tweets201707010415.json
/user/ivy2/Tweets/tweets201707010515.json
/user/ivy2/Tweets/tweets201707010615.json
/user/ivy2/Tweets/tweets201707010715.json
/user/ivy2/Tweets/tweets201707010815.json
/user/ivy2/Tweets/tweets201707010915.json
/user/ivy2/Tweets/tweets201707011015.json
/user/ivy2/Tweets/tweets201707011115.json
/user/ivy2/Tweets/tweets201707011215.json
/user/ivy2/Tweets/tweets201707011315.json
/user/ivy2/Tweets/tweets201707011415.json
/user/ivy2/Tweets/tweets2017070115

/user/ivy2/Tweets/tweets201707082115.json
/user/ivy2/Tweets/tweets201707082215.json
/user/ivy2/Tweets/tweets201707082315.json
/user/ivy2/Tweets/tweets201707090015.json
/user/ivy2/Tweets/tweets201707090115.json
/user/ivy2/Tweets/tweets201707090215.json
/user/ivy2/Tweets/tweets201707090315.json
/user/ivy2/Tweets/tweets201707090415.json
/user/ivy2/Tweets/tweets201707090515.json
/user/ivy2/Tweets/tweets201707090615.json
/user/ivy2/Tweets/tweets201707090715.json
/user/ivy2/Tweets/tweets201707090815.json
/user/ivy2/Tweets/tweets201707090915.json
/user/ivy2/Tweets/tweets201707091015.json
/user/ivy2/Tweets/tweets201707091115.json
/user/ivy2/Tweets/tweets201707091215.json
/user/ivy2/Tweets/tweets201707091315.json
/user/ivy2/Tweets/tweets201707091415.json
/user/ivy2/Tweets/tweets201707091515.json
/user/ivy2/Tweets/tweets201707091615.json
/user/ivy2/Tweets/tweets201707091715.json
/user/ivy2/Tweets/tweets201707091815.json
/user/ivy2/Tweets/tweets201707091915.json
/user/ivy2/Tweets/tweets2017070920

/user/ivy2/Tweets/tweets201707170115.json
/user/ivy2/Tweets/tweets201707170215.json
/user/ivy2/Tweets/tweets201707170315.json
/user/ivy2/Tweets/tweets201707170415.json
/user/ivy2/Tweets/tweets201707170515.json
/user/ivy2/Tweets/tweets201707170615.json
/user/ivy2/Tweets/tweets201707170715.json
/user/ivy2/Tweets/tweets201707170815.json
/user/ivy2/Tweets/tweets201707170915.json
/user/ivy2/Tweets/tweets201707171115.json
/user/ivy2/Tweets/tweets201707171215.json
/user/ivy2/Tweets/tweets201707171315.json
/user/ivy2/Tweets/tweets201707171415.json
/user/ivy2/Tweets/tweets201707171515.json
/user/ivy2/Tweets/tweets201707171715.json
/user/ivy2/Tweets/tweets201707171815.json
/user/ivy2/Tweets/tweets201707171915.json
/user/ivy2/Tweets/tweets201707172015.json
/user/ivy2/Tweets/tweets201707172115.json
/user/ivy2/Tweets/tweets201707172215.json
/user/ivy2/Tweets/tweets201707172315.json
/user/ivy2/Tweets/tweets201707180015.json
/user/ivy2/Tweets/tweets201707180115.json
/user/ivy2/Tweets/tweets2017071802

/user/ivy2/Tweets/tweets201707250715.json
/user/ivy2/Tweets/tweets201707250815.json
/user/ivy2/Tweets/tweets201707250915.json
/user/ivy2/Tweets/tweets201707251015.json
/user/ivy2/Tweets/tweets201707251115.json
/user/ivy2/Tweets/tweets201707251215.json
/user/ivy2/Tweets/tweets201707251315.json
/user/ivy2/Tweets/tweets201707251415.json
/user/ivy2/Tweets/tweets201707251515.json
/user/ivy2/Tweets/tweets201707251615.json
/user/ivy2/Tweets/tweets201707251715.json
/user/ivy2/Tweets/tweets201707251815.json
/user/ivy2/Tweets/tweets201707251915.json
/user/ivy2/Tweets/tweets201707252015.json
/user/ivy2/Tweets/tweets201707252115.json
/user/ivy2/Tweets/tweets201707252215.json
/user/ivy2/Tweets/tweets201707252315.json
/user/ivy2/Tweets/tweets201707260115.json
/user/ivy2/Tweets/tweets201707260215.json
/user/ivy2/Tweets/tweets201707260315.json
/user/ivy2/Tweets/tweets201707260515.json
/user/ivy2/Tweets/tweets201707260615.json
/user/ivy2/Tweets/tweets201707260715.json
/user/ivy2/Tweets/tweets2017072608

/user/ivy2/Tweets/tweets201708021715.json
/user/ivy2/Tweets/tweets201708021815.json
/user/ivy2/Tweets/tweets201708021915.json
/user/ivy2/Tweets/tweets201708022015.json
/user/ivy2/Tweets/tweets201708022115.json
/user/ivy2/Tweets/tweets201708022215.json
/user/ivy2/Tweets/tweets201708022315.json
/user/ivy2/Tweets/tweets201708030015.json
/user/ivy2/Tweets/tweets201708030115.json
/user/ivy2/Tweets/tweets201708030215.json
/user/ivy2/Tweets/tweets201708030315.json
/user/ivy2/Tweets/tweets201708030415.json
/user/ivy2/Tweets/tweets201708030515.json
/user/ivy2/Tweets/tweets201708030615.json
/user/ivy2/Tweets/tweets201708030715.json
/user/ivy2/Tweets/tweets201708030815.json
/user/ivy2/Tweets/tweets201708030915.json
/user/ivy2/Tweets/tweets201708031015.json
/user/ivy2/Tweets/tweets201708031115.json
/user/ivy2/Tweets/tweets201708031215.json
/user/ivy2/Tweets/tweets201708031315.json
/user/ivy2/Tweets/tweets201708031415.json
/user/ivy2/Tweets/tweets201708031515.json
/user/ivy2/Tweets/tweets2017080316

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


/user/ivy2/Tweets/tweets201708101315.json


Py4JError: An error occurred while calling o60.read

In [13]:
display(tweets_df.shape)

(9195135, 25)

In [15]:
tweets_df.to_csv(os.path.join(os.getcwd(), "temp.csv"))

In [16]:
os.listdir(os.getcwd())

['Final_countdown_tododooodooo.ipynb',
 'temp.csv',
 '.ipynb_checkpoints',
 'file_list.txt']

In [17]:
!ls -l -h ./

total 2.8G
-rw-r--r-- 1 sriharis sriharis 1.3M Mar 21 08:03 file_list.txt
-rw-r--r-- 1 sriharis sriharis 181K Mar 21 12:02 Final_countdown_tododooodooo.ipynb
-rw-r--r-- 1 sriharis sriharis 2.8G Mar 21 12:04 temp.csv


In [14]:
!ls -l -h ./

total 236M
-rw-r--r-- 1 sriharis sriharis 1.3M Mar 21 08:03 file_list.txt
-rw-r--r-- 1 sriharis sriharis 173K Mar 21 12:00 Final_countdown_tododooodooo.ipynb
-rw-r--r-- 1 sriharis sriharis 235M Mar 20 18:02 temp.csv


In [None]:
with open(os.path.join(os.getcwd(), "temp.csv")) as f:
    x = sum(1 for line in f)
    print(x)

In [6]:
hola = pd.read_csv(os.path.join(os.getcwd(), "temp.csv"), index_col="Unnamed: 0")
display(
    hola.shape,
    hola.head()
)

(1048947, 20)

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,user_id_str,user_name,user_followers_count,user_favourites_count,user_statuses_count,user_friends_count,coordinates,favorite_count,entities_hashtags,favorited,retweet_count,retweeted,retweeted_status_user_id_str,retweeted_status_user_name,hashtags_cleaned
0,8.778927e+17,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,,,Thu Jun 22 14:15:01 +0000 2017,155078300.0,Lake Man,1554.0,56574.0,95959.0,2577.0,,0.0,[],False,0.0,False,17131180.0,Arkansas Blog,[]
1,8.778927e+17,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:01 +0000 2017,7159992.0,Annie Williams,22.0,167.0,417.0,87.0,,0.0,[],False,0.0,False,,,[]
2,8.778927e+17,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:01 +0000 2017,8.271887e+17,Indivisible9IL,264.0,111.0,342.0,231.0,,0.0,[],False,0.0,False,,,[]
3,8.778927e+17,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:02 +0000 2017,8.313191e+17,Indivisible Illinois,1027.0,1626.0,982.0,267.0,,0.0,[],False,0.0,False,,,[]
4,8.778927e+17,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,,,Thu Jun 22 14:15:02 +0000 2017,39984600.0,ISMG Network News,2395.0,119.0,12833.0,960.0,,0.0,"[Row(indices=[91, 103], text='ISMGSummits'), Row(indices=[104, 112], text='infosec')]",False,0.0,False,,,"['ISMGSummits', 'infosec']"


In [None]:
uic_favored_tags = ["UIC", "UICProud"]

def is_uic_tweet(row):
    for ftag in uic_favored_tags:
        for tag in row:
            if ftag.lower() == tag.lower():
                return True
    return False

tweets_df["uic_tweet"] = tweets_df["hashtags_cleaned"].apply(is_uic_tweet)

In [82]:
uc_favored_tags = ["uchicago", "uchearing", "universityofchicago", "pritzkerschoolofmedicine",
                   "uofc", "maroonmade", "chicagobooth"]

def is_uc_tweet(row):
    for ftag in uc_favored_tags:
        for tag in row:
            if ftag.lower() in tag.lower():
                return True
    return False

tweets_df["uc_tweet"] = tweets_df["hashtags_cleaned"].apply(is_uc_tweet)

In [83]:
upenn_favored_tags = ["upenn", "penn", "uofpenn", "universityofpennsylvania", "pennlaw"]

def is_upenn_tweet(row):
    for ftag in upenn_favored_tags:
        for tag in row:
            if ftag.lower() in tag.lower():
                return True
    return False

tweets_df["upenn_tweet"] = tweets_df["hashtags_cleaned"].apply(is_upenn_tweet)

In [87]:
display(
    tweets_df[tweets_df["uic_tweet"]==True][["id_str", "text", "hashtags_cleaned"]].shape,
    tweets_df[tweets_df["uc_tweet"]==True][["id_str", "text", "hashtags_cleaned"]].shape,
    tweets_df[tweets_df["upenn_tweet"]==True][["id_str", "text", "hashtags_cleaned"]].shape,
    tweets_df.shape
)

(55, 3)

(32, 3)

(38, 3)

(281485, 23)

In [53]:
tweets_df = None

counter = 10

fixed_col_names = [
    "id_str", 
    "text",
    "in_reply_to_status_id_str",
    "in_reply_to_user_id_str", 
    "created_at",
    # User columns
    "user_id_str",
    "user_name",
    "user_followers_count",
    "user_favourites_count",
    "user_statuses_count",
    "user_friends_count",
    # Other attributes
    "coordinates",
    "favorite_count",
    "entities_hashtags",
    "favorited", 
    # Retweet columns
    "retweet_count", 
    "retweeted",
    "retweeted_status_user_id_str",
    "retweeted_status_user_name"
]


for file in all_files:
    df = spark.read.json('hdfs://'+file)
    tmp_df = df.select(fields_to_keep).toPandas()
    tmp_df.columns = fixed_col_names
    
    def clean_hashtags(row):
        if row is None:
            return []
        if row is np.NaN:
            return []
        if (len(row) == 0):
            return []
        tags = []
        for item in row:
            tags.append(item.text)
        return tags
    
    tmp_df["hashtags_cleaned"] = tmp_df["entities_hashtags"].apply(clean_hashtags)
    
    # Add a date column by parsing the file name
    s = '/tweets'
    l = file.find(s)
    timestamp = file[l + len(s):-5]
    tmp_df["scraped_timestamp"] = timestamp
    
    # ----------- U Chicago
    
    def is_uc_tweet(row):
        for ftag in uc_favored_tags:
            for tag in row:
                if ftag.lower() in tag.lower():
                    return True
        return False

    tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet)
    
    # ----------- U Penn
    
    def is_upenn_tweet(row):
        for ftag in upenn_favored_tags:
            for tag in row:
                if ftag.lower() in tag.lower():
                    return True
        return False

    tmp_df["upenn_tweet"] = tmp_df["hashtags_cleaned"].apply(is_upenn_tweet)
    
    # ----------- UCLA
    
    def is_ucla_tweet(row):
        for ftag in ucla_favored_tags:
            for tag in row:
                if ftag.lower() == tag.lower():
                    return True
        return False

    tmp_df["ucla_tweet"] = tmp_df["hashtags_cleaned"].apply(is_ucla_tweet)
    
    
    if tweets_df is None:
        tweets_df = pd.DataFrame(columns=tmp_df.columns)
        
    tweets_df = tweets_df.append(tmp_df[tmp_df["uc_tweet"] == True], ignore_index=True)
    tweets_df = tweets_df.append(tmp_df[tmp_df["upenn_tweet"] == True], ignore_index=True)
    tweets_df = tweets_df.append(tmp_df[tmp_df["ucla_tweet"] == True], ignore_index=True)

    counter -= 1
    if counter <= 0:
        break

In [54]:
display(
    tweets_df["uc_tweet"].unique(),
    tweets_df["upenn_tweet"].unique(),
    tweets_df["ucla_tweet"].unique()
)

array([True, False], dtype=object)

array([False, True], dtype=object)

array([False, True], dtype=object)

In [55]:
tweets_df[tweets_df["ucla_tweet"]==True]

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,user_id_str,user_name,user_followers_count,user_favourites_count,user_statuses_count,user_friends_count,coordinates,favorite_count,entities_hashtags,favorited,retweet_count,retweeted,retweeted_status_user_id_str,retweeted_status_user_name,hashtags_cleaned,scraped_timestamp,uc_tweet,upenn_tweet,ucla_tweet
8,877897493257699328,Tic Tac Toe is the way to go!\n\n#DulceVidaJuiceBar #FreshFruit #HanoveraPark \n#TicTacToe #Juic...,,,Thu Jun 22 14:34:07 +0000 2017,2478943886,Dulce Vida Juice Bar,374.0,9.0,668.0,343.0,,0.0,"[([31, 49], DulceVidaJuiceBar), ([50, 61], FreshFruit), ([62, 75], HanoveraPark), ([77, 87], Tic...",False,0.0,False,,,"[DulceVidaJuiceBar, FreshFruit, HanoveraPark, TicTacToe, JuiceBar, Chicago]",201706221015,False,False,True
10,877916941217701889,jmhboxing's photo https://t.co/SfnezhKJ7q #boxing #chicago #chicagoboxing #uic,,,Thu Jun 22 15:51:24 +0000 2017,365483278,Ttttttttttt,1.0,1.0,6.0,11.0,,0.0,"[([42, 49], boxing), ([50, 58], chicago), ([59, 73], chicagoboxing), ([74, 78], uic)]",False,0.0,False,,,"[boxing, chicago, chicagoboxing, uic]",201706221115,False,False,True
19,877944169011879936,RT @1BoomerWilliams: QuickBooks University - https://t.co/vdJFoOmXnd via @Shareaholic #QuickBook...,,,Thu Jun 22 17:39:35 +0000 2017,858823611729051648,Boomer Williams,428.0,641.0,353.0,728.0,,0.0,"[([86, 97], QuickBooks), ([98, 108], Education), ([109, 121], Bookkeeping)]",False,0.0,False,8.588236117290515e+17,Boomer Williams,"[QuickBooks, Education, Bookkeeping]",201706221315,False,False,True
24,877970493181009920,Faculty Development today on how to create an 'unfolding case' - @StudyPathology leads the way! ...,,,Thu Jun 22 19:24:12 +0000 2017,14334835,Max Anderson,925.0,223.0,11238.0,648.0,,0.0,"[([96, 100], uic), ([101, 107], uicom)]",False,0.0,False,,,"[uic, uicom]",201706221515,False,False,True
25,877983224496087040,RT @1BoomerWilliams: QuickBooks University - https://t.co/vdJFoOmXnd via @Shareaholic #QuickBook...,,,Thu Jun 22 20:14:47 +0000 2017,858823611729051648,Boomer Williams,403.0,675.0,378.0,760.0,,0.0,"[([86, 97], QuickBooks), ([98, 108], Education), ([109, 121], Bookkeeping)]",False,0.0,False,8.588236117290515e+17,Boomer Williams,"[QuickBooks, Education, Bookkeeping]",201706221515,False,False,True
31,878028998869041152,An inspiring podcast transcribed!\n\nRead how @jackieo_nyc ignited the spirit of #SuicidePrevent...,,,Thu Jun 22 23:16:40 +0000 2017,2334705361,Denise McDermott MD,15631.0,9002.0,6472.0,7459.0,,0.0,"[([79, 97], SuicidePrevention)]",False,0.0,False,,,[SuicidePrevention],201706221915,False,False,True
32,878036577259143168,RT @DrDeniseMD: Here is my interview with @jackieo_nyc on her role in mental health advocacy &am...,,,Thu Jun 22 23:46:47 +0000 2017,2334705361,Denise McDermott MD,15633.0,9003.0,6472.0,7459.0,,0.0,"[([99, 117], suicideprevention), ([118, 127], students)]",False,0.0,False,2334705361.0,Denise McDermott MD,"[suicideprevention, students]",201706221915,False,False,True
33,878039623900868608,RT @DrDeniseMD: An inspiring podcast transcribed!\n\nRead how @jackieo_nyc ignited the spirit of...,,,Thu Jun 22 23:58:54 +0000 2017,2895951606,Mindfulness Wellness,155354.0,87003.0,86759.0,103129.0,,0.0,"[([95, 113], SuicidePrevention)]",False,0.0,False,2334705361.0,Denise McDermott MD,[SuicidePrevention],201706221915,False,False,True
34,878039877043879937,RT @DrDeniseMD: An inspiring podcast transcribed!\n\nRead how @jackieo_nyc ignited the spirit of...,,,Thu Jun 22 23:59:54 +0000 2017,3363848398,Quotes & Facts,15489.0,58137.0,56748.0,14985.0,,0.0,"[([95, 113], SuicidePrevention)]",False,0.0,False,2334705361.0,Denise McDermott MD,[SuicidePrevention],201706221915,False,False,True
35,878039984996888576,RT @DrDeniseMD: An inspiring podcast transcribed!\n\nRead how @jackieo_nyc ignited the spirit of...,,,Fri Jun 23 00:00:20 +0000 2017,3328884245,Mindfulness,10441.0,59553.0,58150.0,10471.0,,0.0,"[([95, 113], SuicidePrevention)]",False,0.0,False,2334705361.0,Denise McDermott MD,[SuicidePrevention],201706221915,False,False,True


In [51]:
display(
    tweets_df.shape,
    tweets_df.head(5)
)

(26, 24)

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,user_id_str,user_name,user_followers_count,user_favourites_count,user_statuses_count,user_friends_count,coordinates,favorite_count,entities_hashtags,favorited,retweet_count,retweeted,retweeted_status_user_id_str,retweeted_status_user_name,hashtags_cleaned,scraped_timestamp,uc_tweet,upenn_tweet,ucla_tweet
0,877893467539738624,RT @MirandaWeinberg: Morning! Today I'll be live from #UPennHearing on behalf of @GETUPgrads &am...,,,Thu Jun 22 14:18:07 +0000 2017,274415582,Danielle Hanley,131.0,637.0,577.0,288.0,,0.0,"[([54, 67], UPennHearing), ([133, 143], UChearing)]",False,0.0,False,1158927727.0,Miranda Weinberg,"[UPennHearing, UChearing]",201706221015,True,False,False
1,877898680371752960,Susan Grants #artistsbooks are featured in our new #exhibition #Art in the Stacks now open until...,,,Thu Jun 22 14:38:50 +0000 2017,2401817912,UChicago SpecColl,1171.0,810.0,1150.0,429.0,,0.0,"[([13, 26], artistsbooks), ([51, 62], exhibition), ([63, 67], Art), ([103, 112], uchicago)]",False,0.0,False,,,"[artistsbooks, exhibition, Art, uchicago]",201706221015,True,False,False
2,877900699358638080,@JohnBKing OSP-CP at #uchicago helps #firstgen students prepare for #college and #careers! #trio...,,8.192240539197399e+17,Thu Jun 22 14:46:51 +0000 2017,243878196,Special Programs,316.0,290.0,1097.0,780.0,,0.0,"[([21, 30], uchicago), ([37, 46], firstgen), ([68, 76], college), ([81, 89], careers), ([91, 104...",False,0.0,False,,,"[uchicago, firstgen, college, careers, trioprograms]",201706221015,True,False,False
3,877901149722075138,RT @MirandaWeinberg: Morning! Today I'll be live from #UPennHearing on behalf of @GETUPgrads &am...,,,Thu Jun 22 14:48:39 +0000 2017,357559523,Ozan KIRATLI,92.0,254.0,274.0,127.0,,0.0,"[([54, 67], UPennHearing), ([133, 143], UChearing)]",False,0.0,False,1158927727.0,Miranda Weinberg,"[UPennHearing, UChearing]",201706221015,True,False,False
4,877897764847341569,"Moving on to sources of revenue for different schools, including tuition, grants &amp;""the unive...",,,Thu Jun 22 14:35:12 +0000 2017,1158927727,Miranda Weinberg,588.0,2807.0,1646.0,1090.0,,0.0,"[([131, 144], upennhearing)]",False,0.0,False,,,[upennhearing],201706221015,False,True,False


In [None]:
tweets_df.iloc[0,9]

-----------------------

------------

In [13]:
df = spark.read.json('hdfs:///user/ivy2/Tweets/tweets201706221015.json')
df.cache()
df.count()

9973

In [24]:
tmp_df = df.select(fields_to_keep).toPandas()

In [25]:
tmp_df.head()

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,id_str.1,name,followers_count,favourites_count,statuses_count,friends_count,coordinates,favorite_count,hashtags,favorited,retweet_count,retweeted,id_str.2,name.1
0,877892686513975296,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,,,Thu Jun 22 14:15:01 +0000 2017,155078285,Lake Man,1554.0,56574.0,95959.0,2577.0,,0.0,[],False,0.0,False,17131180.0,Arkansas Blog
1,877892684756566016,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:01 +0000 2017,7159992,Annie Williams,22.0,167.0,417.0,87.0,,0.0,[],False,0.0,False,,
2,877892686744698882,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:01 +0000 2017,827188692766834688,Indivisible9IL,264.0,111.0,342.0,231.0,,0.0,[],False,0.0,False,,
3,877892689118715904,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:02 +0000 2017,831319111414210560,Indivisible Illinois,1027.0,1626.0,982.0,267.0,,0.0,[],False,0.0,False,,
4,877892690242936834,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,,,Thu Jun 22 14:15:02 +0000 2017,39984598,ISMG Network News,2395.0,119.0,12833.0,960.0,,0.0,"[([91, 103], ISMGSummits), ([104, 112], infosec)]",False,0.0,False,,


##### Clean up hashtag column

In [26]:
def clean_hashtags(row):
    if row is None:
        return []
    if row is np.NaN:
        return []
    if (len(row) == 0):
        return []
    tags = []
    for item in row:
        tags.append(item.text)
    return tags
tmp_df["hashtags_cleaned"] = tmp_df["hashtags"].apply(clean_hashtags)

In [27]:
tmp_df.head(10)

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,id_str.1,name,followers_count,favourites_count,statuses_count,friends_count,coordinates,favorite_count,hashtags,favorited,retweet_count,retweeted,id_str.2,name.1,hashtags_cleaned
0,877892686513975296,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,,,Thu Jun 22 14:15:01 +0000 2017,155078285,Lake Man,1554.0,56574.0,95959.0,2577.0,,0.0,[],False,0.0,False,17131180.0,Arkansas Blog,[]
1,877892684756566016,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:01 +0000 2017,7159992,Annie Williams,22.0,167.0,417.0,87.0,,0.0,[],False,0.0,False,,,[]
2,877892686744698882,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:01 +0000 2017,827188692766834688,Indivisible9IL,264.0,111.0,342.0,231.0,,0.0,[],False,0.0,False,,,[]
3,877892689118715904,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",,,Thu Jun 22 14:15:02 +0000 2017,831319111414210560,Indivisible Illinois,1027.0,1626.0,982.0,267.0,,0.0,[],False,0.0,False,,,[]
4,877892690242936834,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,,,Thu Jun 22 14:15:02 +0000 2017,39984598,ISMG Network News,2395.0,119.0,12833.0,960.0,,0.0,"[([91, 103], ISMGSummits), ([104, 112], infosec)]",False,0.0,False,,,"[ISMGSummits, infosec]"
5,877892690490163200,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,,,Thu Jun 22 14:15:02 +0000 2017,386744032,DataBreachToday,3994.0,155.0,4556.0,619.0,,0.0,"[([91, 103], ISMGSummits), ([104, 112], infosec)]",False,0.0,False,,,"[ISMGSummits, infosec]"
6,877892692667113472,How KU stacks up when it comes to early draft picks: https://t.co/26N3FbWf7I https://t.co/PscEHW...,,,Thu Jun 22 14:15:03 +0000 2017,71838280,Jesse Newell,12467.0,824.0,18653.0,3112.0,,0.0,[],False,0.0,False,,,[]
7,877892694629990410,RT @Wokieleaksalt: LMAO this is about that mentally disabled kid who got tortured in Chicago in ...,,,Thu Jun 22 14:15:03 +0000 2017,31628720,Rodgerick Merrilund,124.0,2904.0,15159.0,64.0,,0.0,[],False,0.0,False,8.586909250229166e+17,Wowkieleaks,[]
8,877892694592299008,Family of late Duquesne University student Dakota James launches foundation https://t.co/2lhtacK...,,,Thu Jun 22 14:15:03 +0000 2017,20269833,TribLIVE.com,62819.0,1136.0,104854.0,187.0,,0.0,[],False,0.0,False,,,[]
9,877892697050288128,"RT @ditzkoff: [deep breath] Okay: in the 1920s, the jazz singer Velma Kelly and the celebrity ob...",,,Thu Jun 22 14:15:04 +0000 2017,16102033,Sarah Mulhern Gross,7221.0,4899.0,49758.0,2014.0,,0.0,[],False,0.0,False,81482674.0,Dave Itzkoff,[]


Check if our favored hashtags exist in these

In [28]:
def is_uc_tweet(row):
    for ftag in uc_favored_tags:
        for tag in row:
            if ftag in tag:
                return True
    return False

tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet)

In [29]:
def is_upenn_tweet(row):
    for ftag in upenn_favored_tags:
        for tag in row:
            if ftag in tag:
                return True
    return False

tmp_df["upenn_tweet"] = tmp_df["hashtags_cleaned"].apply(is_upenn_tweet)

In [37]:
def is_ucla_tweet(row):
    for ftag in ucla_favored_tags:
        for tag in row:
            if ftag in tag:
                return True
    return False

tmp_df["ucla_tweet"] = tmp_df["hashtags_cleaned"].apply(is_ucla_tweet)

In [30]:
tmp_df[tmp_df["uc_tweet"] == True]

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,id_str.1,name,followers_count,favourites_count,statuses_count,friends_count,coordinates,favorite_count,hashtags,favorited,retweet_count,retweeted,id_str.2,name.1,hashtags_cleaned,uc_tweet,upenn_tweet
3904,877898680371752960,Susan Grants #artistsbooks are featured in our new #exhibition #Art in the Stacks now open until...,,,Thu Jun 22 14:38:50 +0000 2017,2401817912,UChicago SpecColl,1171.0,810.0,1150.0,429.0,,0.0,"[([13, 26], artistsbooks), ([51, 62], exhibition), ([63, 67], Art), ([103, 112], uchicago)]",False,0.0,False,,,"[artistsbooks, exhibition, Art, uchicago]",True,False
5232,877900699358638080,@JohnBKing OSP-CP at #uchicago helps #firstgen students prepare for #college and #careers! #trio...,,8.192240539197399e+17,Thu Jun 22 14:46:51 +0000 2017,243878196,Special Programs,316.0,290.0,1097.0,780.0,,0.0,"[([21, 30], uchicago), ([37, 46], firstgen), ([68, 76], college), ([81, 89], careers), ([91, 104...",False,0.0,False,,,"[uchicago, firstgen, college, careers, trioprograms]",True,False


In [31]:
tmp_df[tmp_df["upenn_tweet"] == True]

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,id_str.1,name,followers_count,favourites_count,statuses_count,friends_count,coordinates,favorite_count,hashtags,favorited,retweet_count,retweeted,id_str.2,name.1,hashtags_cleaned,uc_tweet,upenn_tweet
3239,877897764847341569,"Moving on to sources of revenue for different schools, including tuition, grants &amp;""the unive...",,,Thu Jun 22 14:35:12 +0000 2017,1158927727,Miranda Weinberg,588.0,2807.0,1646.0,1090.0,,0.0,"[([131, 144], upennhearing)]",False,0.0,False,,,[upennhearing],False,True
3470,877898131505004544,"Onto endowment - university can ""only"" use interest from endowment. Bet the interest off a $10.7...",,,Thu Jun 22 14:36:39 +0000 2017,1158927727,Miranda Weinberg,588.0,2807.0,1647.0,1090.0,,0.0,"[([125, 138], upennhearing)]",False,0.0,False,,,[upennhearing],False,True
4885,877900203789037568,"SF says that no university fully recoups costs of federally funded research from grants, but nee...",,,Thu Jun 22 14:44:53 +0000 2017,1158927727,Miranda Weinberg,588.0,2808.0,1652.0,1090.0,,0.0,"[([127, 140], upennhearing)]",False,0.0,False,,,[upennhearing],False,True
5012,877900371963908097,Glad to hear university hasn't totally turned into a business (yet) #upennhearing,8.779002037890374e+17,1158927727.0,Thu Jun 22 14:45:33 +0000 2017,1158927727,Miranda Weinberg,588.0,2808.0,1653.0,1090.0,,0.0,"[([68, 81], upennhearing)]",False,0.0,False,,,[upennhearing],False,True


In [38]:
tmp_df[tmp_df["ucla_tweet"] == True]

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,id_str.1,name,followers_count,favourites_count,statuses_count,friends_count,coordinates,favorite_count,hashtags,favorited,retweet_count,retweeted,id_str.2,name.1,hashtags_cleaned,uc_tweet,upenn_tweet,ucla_tweet


In [34]:
display(
    tmp_df[tmp_df["upenn_tweet"] == True].shape,
    tmp_df[tmp_df["uc_tweet"] == True].shape
)

(4, 22)

(2, 22)

In [103]:
# for hashlist in tmp_df["hashtags_cleaned"]:
#     for tag in hashlist:
#         add_item_to_list(all_hashtags["uchicago"], tag, unique=True)
# all_hashtags["uchicago"].sort()
# print(len(all_hashtags["uchicago"]))

# uc_tags = []
# for ftag in uc_favored_tags:
#     for tag in all_hashtags:
#         if ftag in tag:
#             uc_tags.append(tag) 

# uc_tags