# Heading 

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from IPython.core.display import display, HTML
import pandas as pd
%reload_ext autoreload
%autoreload 1
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth',100)    

display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from itertools import combinations, takewhile
import collections

from simhash import Simhash, SimhashIndex

sns.set()

print(sys.version)

3.6.8 |Anaconda custom (64-bit)| (default, Dec 30 2018, 01:22:34) 
[GCC 7.3.0]


In [3]:
print(spark.version)

2.4.0-cdh6.1.0


In [4]:
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# from pyspark.ml.feature import OneHotEncoderEstimator
# OneHotEncoderEstimator is available starting from Spark 2.3
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
!hdfs dfs -ls -h '/user/ivy2/Tweets/' > '/home/sriharis/git_projects/BigDataEngg/final_project/file_list.txt'
tweets_path = '/user/ivy2/Tweets/'



### Some helper functions

In [38]:
def add_item_to_list(arr, val, unique=False):
    if val in arr:
        if not unique:
            return arr.extend([val])
    if val not in arr:
        return arr.extend([val])

In [96]:
all_files = []

def read_all_lines(fname):
    with open(fname) as f:
        content = f.readlines()
        for line in content:
            start_loc = line.find('/user/ivy2/Tweets/')
            if start_loc < 0:
                continue
            all_files.append(line[start_loc:].strip())
    
read_all_lines('./file_list.txt')    

In [97]:
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

In [150]:
a = all_files[0]
s = '/tweets'
l = a.find(s)
a[l + len(s):-5]

'201706221015'

#### Placeholder_variables

In [155]:
all_hashtags = {
    "uchicago": [],
    "upenn": []
}

uc_favored_tags = ["uchicago", "uchearing", "uchicagostudents", "uchicagomedicine", "universityofchicago",
                   "uchicagonsi", "uchicagotoday", "pritzkerschoolofmedicine", "uchicagoarts", 
                   "uofc", "uchicagoalumni","uchicagograham", "maroonmade", "uchicagompcs", "chicagobooth"]

upenn_favored_tags = ["upenn", "penn", "uofpenn", "universityofpennsylvania"]

In [151]:
fields_to_keep = ["id_str", 
                  "text",
                  "in_reply_to_status_id_str",
                  "in_reply_to_user_id_str", 
                  "created_at",
                  # User columns
                  "user.id_str",
                  "user.name",
                  "user.followers_count",
                  "user.favourites_count",
                  "user.statuses_count",
                  "user.friends_count",
                  # Other attributes
                  "coordinates",
                  "favorite_count",
                  "entities.hashtags",
                  "favorited", 
                  # Retweet columns
                  "retweet_count", 
                  "retweeted",
                  "retweeted_status.user.id_str",
                  "retweeted_status.user.name"
                 ]

In [152]:
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

## Get all the relevant hashtags and populate a dataframe 

In [153]:
tweets_df = None

counter = 10

fixed_col_names = [
    "id_str", 
    "text",
    "in_reply_to_status_id_str",
    "in_reply_to_user_id_str", 
    "created_at",
    # User columns
    "user_id_str",
    "user_name",
    "user_followers_count",
    "user_favourites_count",
    "user_statuses_count",
    "user_friends_count",
    # Other attributes
    "coordinates",
    "favorite_count",
    "entities_hashtags",
    "favorited", 
    # Retweet columns
    "retweet_count", 
    "retweeted",
    "retweeted_status_user_id_str",
    "retweeted_status_user_name"
]


for file in all_files:
    df = spark.read.json('hdfs://'+file)
    tmp_df = df.select(fields_to_keep).toPandas()
    tmp_df.columns = fixed_col_names
    
    def clean_hashtags(row):
        if row is None:
            return []
        if row is np.NaN:
            return []
        if (len(row) == 0):
            return []
        tags = []
        for item in row:
            tags.append(item.text)
        return tags
    
    tmp_df["hashtags_cleaned"] = tmp_df["entities_hashtags"].apply(clean_hashtags)
    
    # Add a date column by parsing the file name
    s = '/tweets'
    l = file.find(s)
    timestamp = file[l + len(s):-5]
    tmp_df["scraped_timestamp"] = timestamp
    
    # ----------- U Chicago
    
    def is_uc_tweet(row):
        for ftag in uc_favored_tags:
            for tag in row:
                if ftag.lower() in tag.lower():
                    return True
        return False

    tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet)
    
    # ----------- U Penn
    
    if tweets_df is None:
        tweets_df = pd.DataFrame(columns=tmp_df.columns)
        
    tweets_df = tweets_df.append(tmp_df[tmp_df["uc_tweet"] == True], ignore_index=True)
    
    counter -= 1
    if counter <= 0:
        break

In [154]:
display(
    tweets_df.shape,
    tweets_df.head(5)
)

(22, 22)

Unnamed: 0,id_str,text,in_reply_to_status_id_str,in_reply_to_user_id_str,created_at,user_id_str,user_name,user_followers_count,user_favourites_count,user_statuses_count,user_friends_count,coordinates,favorite_count,entities_hashtags,favorited,retweet_count,retweeted,retweeted_status_user_id_str,retweeted_status_user_name,hashtags_cleaned,scraped_timestamp,uc_tweet
0,877893467539738624,RT @MirandaWeinberg: Morning! Today I'll be live from #UPennHearing on behalf of @GETUPgrads &am...,,,Thu Jun 22 14:18:07 +0000 2017,274415582,Danielle Hanley,131.0,637.0,577.0,288.0,,0.0,"[([54, 67], UPennHearing), ([133, 143], UChearing)]",False,0.0,False,1158927727.0,Miranda Weinberg,"[UPennHearing, UChearing]",201706221015,True
1,877898680371752960,Susan Grants #artistsbooks are featured in our new #exhibition #Art in the Stacks now open until...,,,Thu Jun 22 14:38:50 +0000 2017,2401817912,UChicago SpecColl,1171.0,810.0,1150.0,429.0,,0.0,"[([13, 26], artistsbooks), ([51, 62], exhibition), ([63, 67], Art), ([103, 112], uchicago)]",False,0.0,False,,,"[artistsbooks, exhibition, Art, uchicago]",201706221015,True
2,877900699358638080,@JohnBKing OSP-CP at #uchicago helps #firstgen students prepare for #college and #careers! #trio...,,8.192240539197399e+17,Thu Jun 22 14:46:51 +0000 2017,243878196,Special Programs,316.0,290.0,1097.0,780.0,,0.0,"[([21, 30], uchicago), ([37, 46], firstgen), ([68, 76], college), ([81, 89], careers), ([91, 104...",False,0.0,False,,,"[uchicago, firstgen, college, careers, trioprograms]",201706221015,True
3,877901149722075138,RT @MirandaWeinberg: Morning! Today I'll be live from #UPennHearing on behalf of @GETUPgrads &am...,,,Thu Jun 22 14:48:39 +0000 2017,357559523,Ozan KIRATLI,92.0,254.0,274.0,127.0,,0.0,"[([54, 67], UPennHearing), ([133, 143], UChearing)]",False,0.0,False,1158927727.0,Miranda Weinberg,"[UPennHearing, UChearing]",201706221015,True
4,877918472734466049,"INTRODUCING.....the ""DRAGON BERRY BLAST"" #banana #pineapple #coconut #raspberryswirl #chicagogra...",,,Thu Jun 22 15:57:29 +0000 2017,863813389,ｒｏｂｕｓｔ ｃｏｆｆｅｅ,2569.0,3121.0,2033.0,3204.0,,0.0,"[([41, 48], banana), ([49, 59], pineapple), ([60, 68], coconut), ([69, 84], raspberryswirl), ([8...",False,0.0,False,,,"[banana, pineapple, coconut, raspberryswirl, chicagogram, uchicago]",201706221115,True


In [129]:
tweets_df.iloc[0,9]

Row(contributors=None, coordinates=None, created_at='Thu Jun 22 13:44:37 +0000 2017', display_text_range=None, entities=Row(hashtags=[Row(indices=[33, 46], text='UPennHearing'), Row(indices=[112, 122], text='UChearing')], media=None, symbols=[], urls=[], user_mentions=[Row(id=837491901515325440, id_str='837491901515325440', indices=[60, 71], name='GET-UP', screen_name='GETUPgrads'), Row(id=836256344135057408, id_str='836256344135057408', indices=[78, 90], name='Grads Rising', screen_name='GradsRising')]), extended_entities=None, extended_tweet=None, favorite_count=4, favorited=False, filter_level='low', geo=None, id=877885037101887488, id_str='877885037101887488', in_reply_to_screen_name=None, in_reply_to_status_id=None, in_reply_to_status_id_str=None, in_reply_to_user_id=None, in_reply_to_user_id_str=None, is_quote_status=False, lang='en', place=Row(bounding_box=Row(coordinates=[[[-75.280284, 39.871811], [-75.280284, 40.13792], [-74.955712, 40.13792], [-74.955712, 39.871811]]], type='

-----------------------

------------

In [156]:
df = spark.read.json('hdfs:///user/ivy2/Tweets/tweets201706221015.json')
df.cache()
df.count()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o60.read

In [157]:
tmp_df = df.select(fields_to_keep).toPandas()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36169)
Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:36169)

In [57]:
tmp_df.head()

Unnamed: 0,id_str,text,hashtags
0,877892686513975296,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,[]
1,877892684756566016,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[]
2,877892686744698882,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[]
3,877892689118715904,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[]
4,877892690242936834,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,"[([91, 103], ISMGSummits), ([104, 112], infosec)]"


##### Clean up hashtag column

In [58]:
def clean_hashtags(row):
    if row is None:
        return []
    if row is np.NaN:
        return []
    if (len(row) == 0):
        return []
    tags = []
    for item in row:
        tags.append(item.text)
    return tags
tmp_df["hashtags_cleaned"] = tmp_df["hashtags"].apply(clean_hashtags)

In [59]:
tmp_df.head(10)

Unnamed: 0,id_str,text,hashtags,hashtags_cleaned
0,877892686513975296,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,[],[]
1,877892684756566016,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[],[]
2,877892686744698882,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[],[]
3,877892689118715904,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[],[]
4,877892690242936834,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,"[([91, 103], ISMGSummits), ([104, 112], infosec)]","[ISMGSummits, infosec]"
5,877892690490163200,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,"[([91, 103], ISMGSummits), ([104, 112], infosec)]","[ISMGSummits, infosec]"
6,877892692667113472,How KU stacks up when it comes to early draft picks: https://t.co/26N3FbWf7I https://t.co/PscEHW...,[],[]
7,877892694629990410,RT @Wokieleaksalt: LMAO this is about that mentally disabled kid who got tortured in Chicago in ...,[],[]
8,877892694592299008,Family of late Duquesne University student Dakota James launches foundation https://t.co/2lhtacK...,[],[]
9,877892697050288128,"RT @ditzkoff: [deep breath] Okay: in the 1920s, the jazz singer Velma Kelly and the celebrity ob...",[],[]


Check if our favored hashtags exist in these

In [65]:
def is_uc_tweet(row):
    for ftag in uc_favored_tags:
        for tag in row:
            if ftag in tag:
                return True
    return False

tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet)

In [None]:
def is_upenn_tweet(row):
    for ftag in upenn_favored_tags:
        for tag in row:
            if ftag in tag:
                return True
    return False

tmp_df["upenn_tweet"] = tmp_df["hashtags_cleaned"].apply(is_upenn_tweet)

In [66]:
tmp_df[tmp_df["uc_tweet"] == True]

Unnamed: 0,id_str,text,hashtags,hashtags_cleaned,uc_tweet
3904,877898680371752960,Susan Grants #artistsbooks are featured in our new #exhibition #Art in the Stacks now open until...,"[([13, 26], artistsbooks), ([51, 62], exhibition), ([63, 67], Art), ([103, 112], uchicago)]","[artistsbooks, exhibition, Art, uchicago]",True
5232,877900699358638080,@JohnBKing OSP-CP at #uchicago helps #firstgen students prepare for #college and #careers! #trio...,"[([21, 30], uchicago), ([37, 46], firstgen), ([68, 76], college), ([81, 89], careers), ([91, 104...","[uchicago, firstgen, college, careers, trioprograms]",True


In [103]:
# for hashlist in tmp_df["hashtags_cleaned"]:
#     for tag in hashlist:
#         add_item_to_list(all_hashtags["uchicago"], tag, unique=True)
# all_hashtags["uchicago"].sort()
# print(len(all_hashtags["uchicago"]))

# uc_tags = []
# for ftag in uc_favored_tags:
#     for tag in all_hashtags:
#         if ftag in tag:
#             uc_tags.append(tag) 

# uc_tags

In [74]:
if tweets_df is None:
    tweets_df = pd.DataFrame(columns=tmp_df.columns)
tweets_df = tweets_df.append(tmp_df[tmp_df["uc_tweet"] == True])
tweets_df.reset_index(drop=False, inplace=True)