# Exploring UChicago-related hashtags in tweets

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from IPython.core.display import display, HTML
import pandas as pd
%reload_ext autoreload
%autoreload 1
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth',100)    

display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from itertools import combinations, takewhile
import collections

from simhash import Simhash, SimhashIndex

# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

# Record the Python interpreter version used for this run.
print(sys.version)

3.6.8 |Anaconda custom (64-bit)| (default, Dec 30 2018, 01:22:34) 
[GCC 7.3.0]


In [3]:
# `spark` is not created anywhere in this notebook; presumably the kernel
# injects a SparkSession under that name — TODO confirm.
print(spark.version)

2.4.0-cdh6.1.0


In [4]:
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# from pyspark.ml.feature import OneHotEncoderEstimator
# OneHotEncoderEstimator is available starting from Spark 2.3
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
# Save the HDFS listing of the tweets directory to a local text file.
# NOTE(review): the output path is an absolute, user-specific location, while a
# later cell reads './file_list.txt'. The two only agree when the notebook's
# working directory is that same folder — confirm before running elsewhere.
!hdfs dfs -ls -h '/user/ivy2/Tweets/' > '/home/sriharis/git_projects/BigDataEngg/final_project/file_list.txt'
# HDFS directory that holds the tweet JSON files.
tweets_path = '/user/ivy2/Tweets/'



### Some helper functions

In [38]:
def add_item_to_list(arr, val, unique=False):
    """Append ``val`` to ``arr`` in place.

    Parameters
    ----------
    arr : list
        List to mutate.
    val : object
        Value to add.
    unique : bool, default False
        When True, ``val`` is only added if it is not already in ``arr``.

    Returns
    -------
    None
        The list is modified in place; nothing is returned.
    """
    # Only pay for the membership scan when uniqueness is actually requested.
    if unique and val in arr:
        return
    arr.append(val)

In [96]:
# Module-level accumulator filled by read_all_lines(); reset each time this cell runs.
all_files = []

def read_all_lines(fname, prefix='/user/ivy2/Tweets/'):
    """Extract paths from a text listing and append them to ``all_files``.

    Each line of ``fname`` is searched for ``prefix``. Lines that contain it
    contribute the substring from the prefix to the end of the line, stripped
    of surrounding whitespace. Lines without the prefix are skipped.

    Parameters
    ----------
    fname : str
        Path of the text file to scan (presumably the saved ``hdfs dfs -ls``
        listing written by an earlier cell — confirm the working directory).
    prefix : str, default '/user/ivy2/Tweets/'
        Marker that begins each path of interest.

    Returns
    -------
    None
        Results are appended to the module-level ``all_files`` list.
    """
    with open(fname) as f:
        # Iterate the file lazily rather than materialising every line first.
        for line in f:
            start_loc = line.find(prefix)
            if start_loc < 0:
                continue
            all_files.append(line[start_loc:].strip())

read_all_lines('./file_list.txt')

In [97]:
# Sanity check: show the first path extracted from the listing.
all_files[0]

'/user/ivy2/Tweets/tweets201706221015.json'

#### Placeholder variables

In [76]:
# Hashtag buckets, both starting empty (presumably filled in later cells).
all_hashtags = dict(uchicago=[], uirrelevant=[])

# Hashtag fragments used to flag a tweet as UChicago-related.
uc_favored_tags = [
    "uchicago",
    "uchearing",
    "uchicagostudents",
    "uchicagomedicine",
    "uchicagonsi",
    "uchicagotoday",
    "pritzkerschoolofmedicine",
    "uchicagoarts",
    "uofc",
    "uchicagoalumni",
    "uchicagograham",
    "maroonmade",
    "uchicagompcs",
    "chicagobooth",
]

#### Let's open a sample file and play with it for a while

In [7]:
# Load one tweet file from HDFS as a Spark DataFrame (schema inferred from the JSON).
df = spark.read.json('hdfs:///user/ivy2/Tweets/tweets201706221015.json')
# Mark the frame for caching; the count() below is the action that triggers the read.
df.cache()
# Number of rows (tweets) in this file.
df.count()

9973

In [12]:
# Columns selected from the raw tweet frame. Commented-out entries are
# candidate fields that are currently not selected.
# Every active entry ends with a comma so that uncommenting a following line
# cannot silently concatenate two adjacent string literals into one name.
fields_to_keep = ["id_str",
                  "text",
#                   "truncated",
#                   "in_reply_to_status_id_str",
#                   "in_reply_to_user_id_str",
#                   "user.favourites_count",
#                   "user.id_str",
#                   "coordinates",
#                   "quoted_status_id_str",
#                   "is_quote_status",
#                   "retweet_count",
#                   "favorite_count",
                  "entities.hashtags",
#                   "favorited",
#                   "retweeted",
#                   "possibly_sensitive",
#                   "lang"
                 ]

In [56]:
# Select the chosen columns in Spark, then collect the result into a pandas DataFrame.
tmp_df = df.select(fields_to_keep).toPandas()

In [57]:
# Preview the first rows of the collected frame.
tmp_df.head()

Unnamed: 0,id_str,text,hashtags
0,877892686513975296,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,[]
1,877892684756566016,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[]
2,877892686744698882,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[]
3,877892689118715904,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[]
4,877892690242936834,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,"[([91, 103], ISMGSummits), ([104, 112], infosec)]"


##### Clean up hashtag column

In [58]:
def clean_hashtags(row):
    """Return the hashtag texts contained in one ``hashtags`` cell.

    Parameters
    ----------
    row : iterable or None or float
        The cell value: an iterable of items exposing a ``.text`` attribute
        (presumably Spark ``Row`` objects from ``entities.hashtags``), or a
        missing value (``None`` / NaN).

    Returns
    -------
    list
        The ``.text`` of every item, in order; ``[]`` for missing or empty cells.
    """
    # Treat None and any float NaN as "no hashtags". `row != row` is true only
    # for NaN, so this does not depend on the `np.NaN` alias (removed in
    # NumPy 2.0) or on NaN being one particular singleton object.
    if row is None or (isinstance(row, float) and row != row):
        return []
    return [item.text for item in row]
# Add a column holding just the hashtag texts for each tweet.
tmp_df["hashtags_cleaned"] = tmp_df["hashtags"].apply(clean_hashtags)

In [59]:
# Preview the first ten rows, including the new hashtags_cleaned column.
tmp_df.head(10)

Unnamed: 0,id_str,text,hashtags,hashtags_cleaned
0,877892686513975296,RT @ArkansasBlog: Study: States with concealed carry laws experience rise in violent crime. http...,[],[]
1,877892684756566016,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[],[]
2,877892686744698882,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[],[]
3,877892689118715904,"IL Healthcare Truth Tour: Heal Don't Repeal! 6/22 2:30PM Federal Plaza, Chicago. Mobile billboar...",[],[]
4,877892690242936834,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,"[([91, 103], ISMGSummits), ([104, 112], infosec)]","[ISMGSummits, infosec]"
5,877892690490163200,Day 1 of Chicago Summit: Gregory Touhill delivers keynote on Cybersecurity=Risk Management #ISMG...,"[([91, 103], ISMGSummits), ([104, 112], infosec)]","[ISMGSummits, infosec]"
6,877892692667113472,How KU stacks up when it comes to early draft picks: https://t.co/26N3FbWf7I https://t.co/PscEHW...,[],[]
7,877892694629990410,RT @Wokieleaksalt: LMAO this is about that mentally disabled kid who got tortured in Chicago in ...,[],[]
8,877892694592299008,Family of late Duquesne University student Dakota James launches foundation https://t.co/2lhtacK...,[],[]
9,877892697050288128,"RT @ditzkoff: [deep breath] Okay: in the 1920s, the jazz singer Velma Kelly and the celebrity ob...",[],[]


Check whether any of our favored hashtags appear in these tweets

In [65]:
def is_uc_tweet(row, favored_tags=None):
    """Tell whether any hashtag in ``row`` contains a favored tag.

    Matching is by substring and ignores case, so a hashtag written as
    ``UChicago`` or ``UChicagoArts`` matches the favored tag ``uchicago``.

    Parameters
    ----------
    row : iterable of str
        Hashtag texts of one tweet.
    favored_tags : iterable of str, optional
        Tags to look for. Defaults to the module-level ``uc_favored_tags``.

    Returns
    -------
    bool
        True if at least one hashtag contains at least one favored tag.
    """
    if favored_tags is None:
        favored_tags = uc_favored_tags
    # Lower-case the favored tags once instead of once per comparison.
    wanted = [ftag.lower() for ftag in favored_tags]
    for tag in row:
        lowered = tag.lower()
        if any(ftag in lowered for ftag in wanted):
            return True
    return False

# Flag each tweet whose cleaned hashtags match one of the favored tags.
tmp_df["uc_tweet"] = tmp_df["hashtags_cleaned"].apply(is_uc_tweet)

In [66]:
# Show only the rows flagged as UChicago-related.
tmp_df[tmp_df["uc_tweet"] == True]

Unnamed: 0,id_str,text,hashtags,hashtags_cleaned,uc_tweet
3904,877898680371752960,Susan Grants #artistsbooks are featured in our new #exhibition #Art in the Stacks now open until...,"[([13, 26], artistsbooks), ([51, 62], exhibition), ([63, 67], Art), ([103, 112], uchicago)]","[artistsbooks, exhibition, Art, uchicago]",True
5232,877900699358638080,@JohnBKing OSP-CP at #uchicago helps #firstgen students prepare for #college and #careers! #trio...,"[([21, 30], uchicago), ([37, 46], firstgen), ([68, 76], college), ([81, 89], careers), ([91, 104...","[uchicago, firstgen, college, careers, trioprograms]",True


In [103]:
# for hashlist in tmp_df["hashtags_cleaned"]:
#     for tag in hashlist:
#         add_item_to_list(all_hashtags["uchicago"], tag, unique=True)
# all_hashtags["uchicago"].sort()
# print(len(all_hashtags["uchicago"]))

# uc_tags = []
# for ftag in uc_favored_tags:
#     for tag in all_hashtags:
#         if ftag in tag:
#             uc_tags.append(tag) 

# uc_tags

In [74]:
# Keep only the rows flagged as UChicago tweets, moving their original row
# labels into an "index" column so they survive the concatenation below.
uc_rows = tmp_df[tmp_df["uc_tweet"] == True].reset_index(drop=False)

# `tweets_df` accumulates matches across runs. On a fresh kernel the name does
# not exist yet, so look it up rather than referencing it directly (a direct
# reference would raise NameError).
previous_tweets_df = globals().get("tweets_df")
if previous_tweets_df is None:
    tweets_df = uc_rows
else:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent. ignore_index=True gives a clean RangeIndex on every run, so
    # re-running this cell no longer piles up extra index columns.
    tweets_df = pd.concat([previous_tweets_df, uc_rows], ignore_index=True)

## Get all the relevant hashtags and populate a dataframe 

In [None]:
for file in all_files[]