# Collecting university-related tweets (UChicago / UPenn) by hashtag

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from IPython.core.display import display, HTML
import pandas as pd
%reload_ext autoreload
%autoreload 1
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth',100)    

display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from itertools import combinations, takewhile
import collections

from simhash import Simhash, SimhashIndex

# Apply seaborn's default plotting theme to all matplotlib figures.
sns.set()

# Record the interpreter version in the cell output.
print(sys.version)

3.6.8 |Anaconda custom (64-bit)| (default, Dec 30 2018, 01:22:34) 
[GCC 7.3.0]


In [3]:
# NOTE(review): `spark` is not created in any visible cell; presumably the kernel
# injects a SparkSession under this name. Confirm before running elsewhere.
print(spark.version)

2.4.0-cdh6.1.0


In [4]:
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# from pyspark.ml.feature import OneHotEncoderEstimator
# OneHotEncoderEstimator is available starting from Spark 2.3
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# NOTE(review): these wildcard imports place every public name of both modules in
# the notebook namespace. pyspark.sql.functions exports names such as `sum`,
# `min`, `max`, `abs` and `round`, which shadow the Python builtins from this
# cell onward.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
!hdfs dfs -ls -h '/user/ivy2/Tweets/' > '/home/sriharis/git_projects/BigDataEngg/final_project/file_list.txt'
tweets_path = '/user/ivy2/Tweets/'



### Some helper functions

In [6]:
def add_item_to_list(arr, val, unique=False):
    """Append ``val`` to ``arr`` in place.

    Args:
        arr: List to mutate.
        val: Item to add.
        unique: When true, ``val`` is skipped if ``arr`` already contains it.

    Returns:
        None. The list is modified in place.
    """
    # Only pay for the membership scan when uniqueness is actually requested.
    if unique and val in arr:
        return None
    arr.append(val)
    return None

In [7]:
# Paths of the tweet files found in the saved HDFS listing; filled by read_all_lines().
all_files = []

def read_all_lines(fname, prefix='/user/ivy2/Tweets/'):
    """Collect file paths from a saved directory listing into ``all_files``.

    Every line of ``fname`` that contains ``prefix`` contributes one entry:
    the text from the first occurrence of ``prefix`` to the end of the line,
    with surrounding whitespace stripped. Lines without ``prefix`` are ignored.

    Args:
        fname: Path of the text file holding the listing.
        prefix: Directory prefix that marks the start of a path on a line.

    Returns:
        None. Matching paths are appended to the module-level ``all_files`` list.
    """
    with open(fname) as listing:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in listing:
            start_loc = line.find(prefix)
            if start_loc < 0:
                continue
            all_files.append(line[start_loc:].strip())
    
read_all_lines('./file_list.txt')    

In [8]:
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

In [9]:
# Scratch check of the timestamp parsing on the first listed file: take the
# text that follows '/tweets' and drop the last five characters ('.json' in
# the sample path shown above).
a = all_files[0]
s = '/tweets'
l = a.find(s)
a[l + len(s):-5]

'201706221015'

#### Placeholder variables

In [10]:
# Per-university buckets for collected hashtags; both start out empty.
all_hashtags = {
    "uchicago": [],
    "upenn": []
}

# Lower-case hashtag fragments treated as University of Chicago related.
# The tweet filters in this notebook test these with `in`, i.e. as substrings
# of a hashtag, not as exact matches.
uc_favored_tags = ["uchicago", "uchearing", "uchicagostudents", "uchicagomedicine", "universityofchicago",
                   "uchicagonsi", "uchicagotoday", "pritzkerschoolofmedicine", "uchicagoarts", 
                   "uofc", "uchicagoalumni","uchicagograham", "maroonmade", "uchicagompcs", "chicagobooth"]

# Fragments for the University of Pennsylvania, matched the same way.
# NOTE(review): "penn" is a substring of every other entry in this list, and
# substring matching also accepts any hashtag that merely contains "penn".
# Confirm this breadth is intended.
upenn_favored_tags = ["upenn", "penn", "uofpenn", "universityofpennsylvania"]

In [11]:
# Column paths selected from each tweet JSON file; a dot addresses a nested field.
# The order matters: the collection loop below replaces the resulting pandas
# column labels by position with `fixed_col_names`, so both lists must stay aligned.
fields_to_keep = ["id_str", 
                  "text",
                  "in_reply_to_status_id_str",
                  "in_reply_to_user_id_str", 
                  "created_at",
                  # User columns
                  "user.id_str",
                  "user.name",
                  "user.followers_count",
                  "user.favourites_count",
                  "user.statuses_count",
                  "user.friends_count",
                  # Other attributes
                  "coordinates",
                  "favorite_count",
                  "entities.hashtags",
                  "favorited", 
                  # Retweet columns
                  "retweet_count", 
                  "retweeted",
                  "retweeted_status.user.id_str",
                  "retweeted_status.user.name"
                 ]

In [12]:
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

## Get all the relevant hashtags and populate a dataframe 

In [None]:
# Number of files, taken from the front of `all_files`, to process in one run.
MAX_FILES = 10

# Flat pandas column names in the same order as `fields_to_keep`. Deriving them
# (dots -> underscores) keeps the two lists from drifting apart.
fixed_col_names = [field.replace(".", "_") for field in fields_to_keep]


def clean_hashtags(row):
    """Return the hashtag texts of one `entities_hashtags` cell as a list.

    Missing cells (None or a float NaN) and empty cells give an empty list.
    Each element of a non-empty cell must expose a `.text` attribute.
    """
    # Any float here is a missing-value marker. Testing the type avoids relying
    # on the identity of the `np.NaN` singleton.
    if row is None or isinstance(row, float):
        return []
    return [item.text for item in row]


def is_uc_tweet(row):
    """Return True if any hashtag in `row` contains a `uc_favored_tags` entry.

    The comparison is a case-insensitive substring test.
    """
    lowered_tags = [tag.lower() for tag in row]
    return any(ftag.lower() in tag for ftag in uc_favored_tags for tag in lowered_tags)


def scraped_timestamp(path, marker='/tweets', suffix='.json'):
    """Extract the timestamp embedded in a path like '.../tweets201706221015.json'.

    Returns the text after the first `marker`, with a trailing `suffix`
    removed, or None when `marker` does not occur in `path`.
    """
    start = path.find(marker)
    if start < 0:
        return None
    stamp = path[start + len(marker):]
    if stamp.endswith(suffix):
        stamp = stamp[:-len(suffix)]
    return stamp


def load_uc_tweets(path):
    """Read one tweet JSON file from HDFS and return its UChicago-tagged rows.

    The result is a pandas DataFrame with the `fixed_col_names` columns plus
    `hashtags_cleaned`, `scraped_timestamp` and `uc_tweet`.
    """
    tmp_df = spark.read.json('hdfs://' + path).select(fields_to_keep).toPandas()
    # Replace the column labels positionally with the flat names.
    tmp_df.columns = fixed_col_names

    tmp_df["hashtags_cleaned"] = tmp_df["entities_hashtags"].apply(clean_hashtags)

    # Add a date column by parsing the file name
    tmp_df["scraped_timestamp"] = scraped_timestamp(path)

    # ----------- U Chicago
    # astype(bool) keeps the mask boolean even when the file yields zero rows.
    tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet).astype(bool)

    # ----------- U Penn
    # TODO: U Penn tweets are not flagged or collected yet.

    return tmp_df[tmp_df["uc_tweet"]]


# Collect the per-file frames first and concatenate once. Appending to a
# DataFrame inside the loop copies the accumulated data on every iteration.
uc_frames = [load_uc_tweets(path) for path in all_files[:MAX_FILES]]

if uc_frames:
    tweets_df = pd.concat(uc_frames, ignore_index=True)
else:
    # No files listed: still provide an empty frame with the expected columns.
    tweets_df = pd.DataFrame(
        columns=fixed_col_names + ["hashtags_cleaned", "scraped_timestamp", "uc_tweet"]
    )

In [None]:
# Show the (rows, columns) shape of the collected frame and its first five rows.
display(
    tweets_df.shape,
    tweets_df.head(5)
)

In [None]:
# Inspect the first collected row at column position 9, which is
# `user_statuses_count` in `fixed_col_names`.
tweets_df.iloc[0,9]

-----------------------

------------

In [None]:
# Load a single tweet file for interactive exploration. cache() marks the
# DataFrame for caching, and count() is the first action run on it; its
# result is the number of rows.
df = spark.read.json('hdfs:///user/ivy2/Tweets/tweets201706221015.json')
df.cache()
df.count()

In [None]:
# Pull the selected fields of the single file into pandas.
# NOTE(review): a later cell reads a `hashtags` column from this frame, so nested
# fields apparently keep only their leaf name here. That would make `id_str` and
# `name` appear more than once; confirm before indexing those columns by name.
tmp_df = df.select(fields_to_keep).toPandas()

In [None]:
tmp_df.head()

##### Clean up hashtag column

In [None]:
def clean_hashtags(row):
    """Return the hashtag texts of one hashtag cell as a list.

    Args:
        row: None, a float NaN, or an iterable of objects exposing `.text`.

    Returns:
        A list with the `.text` of every item; an empty list for missing
        or empty input.
    """
    # Any float here is a missing-value marker. Testing the type covers every
    # NaN object, not just the `np.NaN` singleton, and avoids the `np.NaN`
    # alias, which NumPy 2.0 removed.
    if row is None or isinstance(row, float):
        return []
    return [item.text for item in row]
tmp_df["hashtags_cleaned"] = tmp_df["hashtags"].apply(clean_hashtags)

In [None]:
tmp_df.head(10)

Check whether any of our favored hashtags appear in each tweet's cleaned hashtag list.

In [None]:
def is_uc_tweet(row, favored_tags=None):
    """Return True if any hashtag in `row` contains a favored UChicago tag.

    The match is a case-insensitive substring test, consistent with the
    version of this check used in the multi-file collection loop.

    Args:
        row: Iterable of hashtag strings for one tweet.
        favored_tags: Tag fragments to look for; defaults to the notebook-level
            `uc_favored_tags`.

    Returns:
        bool: True on the first match, False if nothing matches or `row` is empty.
    """
    if favored_tags is None:
        favored_tags = uc_favored_tags
    wanted = [ftag.lower() for ftag in favored_tags]
    return any(ftag in tag.lower() for tag in row for ftag in wanted)

tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet)

In [None]:
def is_upenn_tweet(row, favored_tags=None):
    """Return True if any hashtag in `row` contains a favored UPenn tag.

    The match is a case-insensitive substring test, so a hashtag such as
    'UPenn' is recognised by the lower-case fragment 'upenn'.

    Args:
        row: Iterable of hashtag strings for one tweet.
        favored_tags: Tag fragments to look for; defaults to the notebook-level
            `upenn_favored_tags`.

    Returns:
        bool: True on the first match, False if nothing matches or `row` is empty.
    """
    if favored_tags is None:
        favored_tags = upenn_favored_tags
    wanted = [ftag.lower() for ftag in favored_tags]
    return any(ftag in tag.lower() for tag in row for ftag in wanted)

tmp_df["upenn_tweet"] = tmp_df["hashtags_cleaned"].apply(is_upenn_tweet)

In [None]:
tmp_df[tmp_df["uc_tweet"] == True]

In [None]:
tmp_df[tmp_df["upenn_tweet"] == True]

In [103]:
# for hashlist in tmp_df["hashtags_cleaned"]:
#     for tag in hashlist:
#         add_item_to_list(all_hashtags["uchicago"], tag, unique=True)
# all_hashtags["uchicago"].sort()
# print(len(all_hashtags["uchicago"]))

# uc_tags = []
# for ftag in uc_favored_tags:
#     for tag in all_hashtags:
#         if ftag in tag:
#             uc_tags.append(tag) 

# uc_tags