1. Since the tweets we pulled came from both weekends of Coachella, the first thing we can do to clean this dataset up (and significantly reduce its size) is to separate the files into weekend 1 and weekend 2 of Coachella.

In [22]:
import os
import json
from os import listdir
from os.path import isfile, join

# Directory that is scanned for the tweet JSON files.
folder = os.path.join('C:/', 'users', 'charr', 'documents')

# Names of the regular files directly inside `folder`; sub-directories are skipped.
get_files = [entry for entry in os.listdir(folder) if os.path.isfile(os.path.join(folder, entry))]
get_files

['coachella_tweets20191107_210240.json',
 'coachella_tweets20191107_210514.json',
 'coachella_tweets20191107_210706.json',
 'coachella_tweets20191107_211218.json',
 'coachella_tweets20191107_214356.json',
 'coachella_tweets20191107_214639.json',
 'coachella_tweets20191107_215132.json',
 'coachella_tweets20191107_220147.json',
 'coachella_tweets20191107_222255.json',
 'coachella_tweets20191107_222517.json',
 'coachella_tweets20191107_223029.json',
 'coachella_tweets20191107_230148.json',
 'coachella_tweets20191107_230342.json',
 'coachella_tweets20191107_230919.json',
 'coachella_tweets20191107_234110.json',
 'coachella_tweets20191107_234121.json',
 'coachella_tweets20191107_234730.json',
 'coachella_tweets20191108_001926.json',
 'coachella_tweets20191108_001931.json',
 'coachella_tweets20191108_002626.json',
 'coachella_tweets20191108_005654.json',
 'coachella_tweets20191108_005715.json',
 'coachella_tweets20191108_010503.json',
 'coachella_tweets20191108_013444.json',
 'coachella_twee

2. Similar to how we got the start/end dates while gathering the tweets, we can use the same method to set boundaries for each weekend.

In [23]:
import datetime
def get_tweet_timestamp(tid):
    """Convert a tweet ID into a timezone-aware UTC ``datetime``.

    The ID is shifted right by 22 bits and a fixed millisecond offset is
    added; the sum is interpreted as milliseconds since the Unix epoch.

    Parameters
    ----------
    tid : int
        Numeric tweet ID.

    Returns
    -------
    datetime.datetime
        The decoded instant with ``tzinfo=datetime.timezone.utc``.
    """
    # NOTE(review): 1288834974657 looks like the snowflake-ID epoch in ms - confirm.
    epoch_offset_ms = 1288834974657
    millis_since_unix_epoch = (tid >> 22) + epoch_offset_ms
    return datetime.datetime.fromtimestamp(
        millis_since_unix_epoch / 1000,
        tz=datetime.timezone.utc,
    )
    

In [24]:
# Weekend 1 window: start and end instants decoded from two tweet IDs.
coachella_start_date_weekend1 = get_tweet_timestamp(1116479450026106880)
coachella_end_date_weekend1 = get_tweet_timestamp(1117646057750405120)

# Weekend 2 window: start and end instants decoded from two tweet IDs.
coachella_start_date_weekend2 = get_tweet_timestamp(1119106217190629378)
coachella_end_date_weekend2 = get_tweet_timestamp(1120182057383022594)

In [25]:
# Sanity check: display the decoded start of weekend 1 (a timezone-aware UTC datetime).
coachella_start_date_weekend1

datetime.datetime(2019, 4, 11, 23, 13, 38, 728000, tzinfo=datetime.timezone.utc)

3. For each JSON file we read in, we can clear out any empty users and midweek tweets, and then write the resulting dictionaries out to new JSON files (one per weekend).

In [None]:
# clear_empty_users can be
# {k:v for k,v in dct.items() if v}

# Maybe consider a print report in get_weekend_specific_tweets
# X tweets weekend 1, Y tweets weekend 2, Z tweets removed
# You may be better off getting the tweet timestamp from the snowflake ID instead of the created_at time


In [26]:
def clear_empty_users(dct):
    """Return a new dict without the users whose tweet list is ``[]``.

    Only values equal to the empty list are removed; every other value
    (including ``None`` or an empty tuple) is kept. The input mapping is
    not modified and key order is preserved.

    Parameters
    ----------
    dct : dict
        Mapping of user -> list of tweets.

    Returns
    -------
    dict
        Copy of ``dct`` restricted to users with a non-``[]`` value.
    """
    return {user: tweets for user, tweets in dct.items() if tweets != []}

def get_weekend_specific_tweets(dct, weekend_1_window=None, weekend_2_window=None):
    """Split every user's tweets into a weekend-1 and a weekend-2 bucket.

    A tweet is kept only if its ``created_at`` time falls inside one of the
    two closed intervals ``[start, end]``. Tweets before weekend 1, between
    the weekends, or after weekend 2 are dropped. (The previous version only
    compared against the weekend-1 end and the weekend-2 start, so anything
    earlier than weekend 1 or later than weekend 2 was mis-filed.)

    Parameters
    ----------
    dct : dict
        Mapping of user -> list of tweet dicts. Each tweet must carry a
        ``'created_at'`` string in the format ``'%a %b %d %H:%M:%S %z %Y'``.
    weekend_1_window, weekend_2_window : tuple(datetime, datetime), optional
        ``(start, end)`` pairs of timezone-aware datetimes. When omitted the
        module-level ``coachella_start_date_weekend*`` /
        ``coachella_end_date_weekend*`` values are used.

    Returns
    -------
    tuple(dict, dict)
        ``(weekend_1_dct, weekend_2_dct)``. Every user of ``dct`` appears in
        both results, possibly with an empty list.

    Raises
    ------
    KeyError
        If a tweet has no ``'created_at'`` key.
    ValueError
        If a ``'created_at'`` value does not match the expected format.
    """
    # Fall back to the notebook-level boundaries only when no window is given,
    # so the function can also be used (and tested) with explicit windows.
    if weekend_1_window is None:
        weekend_1_window = (coachella_start_date_weekend1, coachella_end_date_weekend1)
    if weekend_2_window is None:
        weekend_2_window = (coachella_start_date_weekend2, coachella_end_date_weekend2)
    start_1, end_1 = weekend_1_window
    start_2, end_2 = weekend_2_window

    weekend_1_dct = {}
    weekend_2_dct = {}
    for user, tweet_list in dct.items():
        weekend_1_tweets = []
        weekend_2_tweets = []
        for tweet in tweet_list:
            tweet_time = datetime.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
            if start_1 <= tweet_time <= end_1:
                weekend_1_tweets.append(tweet)
            elif start_2 <= tweet_time <= end_2:
                weekend_2_tweets.append(tweet)
            # Anything outside both windows is intentionally discarded.

        weekend_1_dct[user] = weekend_1_tweets
        weekend_2_dct[user] = weekend_2_tweets

    return weekend_1_dct, weekend_2_dct

def open_file(file_name):
    """Read ``file_name`` from the module-level ``folder`` and return the parsed JSON.

    Parameters
    ----------
    file_name : str
        Name of a JSON file located directly inside ``folder``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file content.
    """
    source_path = os.path.join(folder, file_name)
    with open(source_path, mode = 'r', encoding = 'utf8') as handle:
        return json.load(handle)

def save_tweet_dict(dct, label):
    """Dump ``dct`` as JSON into a timestamped per-weekend file inside ``folder``.

    The file is named ``coachella_tweets_weekend_<label> <YYYYmmdd_HHMMSS>.json``
    (note the space before the timestamp). The timestamp has one-second
    resolution, so two saves with the same label within the same second
    target the same file.

    Parameters
    ----------
    dct : dict
        JSON-serialisable data to write.
    label : str
        Weekend label inserted into the file name (e.g. ``'1'`` or ``'2'``).
    """
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    target_path = os.path.join(folder, f'coachella_tweets_weekend_{label} ' + stamp + '.json')
    with open(target_path, mode='w', encoding = 'utf-8') as out:
        json.dump(dct, out)


In [27]:
# Split each raw file into its weekend-1 and weekend-2 parts and save both.
for f in get_files:
    non_empty_users = clear_empty_users(open_file(f))
    weekend_1_dct, weekend_2_dct = get_weekend_specific_tweets(non_empty_users)
    # Users left without tweets for a weekend are dropped again before saving.
    save_tweet_dict(clear_empty_users(weekend_1_dct), '1')
    save_tweet_dict(clear_empty_users(weekend_2_dct), '2')
    print(f)  # progress report: one line per processed input file


coachella_tweets20191107_210240.json
coachella_tweets20191107_210514.json
coachella_tweets20191107_210706.json
coachella_tweets20191107_211218.json
coachella_tweets20191107_214356.json
coachella_tweets20191107_214639.json
coachella_tweets20191107_215132.json
coachella_tweets20191107_220147.json
coachella_tweets20191107_222255.json
coachella_tweets20191107_222517.json
coachella_tweets20191107_223029.json
coachella_tweets20191107_230148.json
coachella_tweets20191107_230342.json
coachella_tweets20191107_230919.json
coachella_tweets20191107_234110.json
coachella_tweets20191107_234121.json
coachella_tweets20191107_234730.json
coachella_tweets20191108_001926.json
coachella_tweets20191108_001931.json
coachella_tweets20191108_002626.json
coachella_tweets20191108_005654.json
coachella_tweets20191108_005715.json
coachella_tweets20191108_010503.json
coachella_tweets20191108_013444.json
coachella_tweets20191108_013529.json
coachella_tweets20191108_014248.json
coachella_tweets20191108_021248.json
c

4. Now we need to group all the mini datasets into a single master JSON file (this also eliminates any accidental user duplicates that may have occurred during the get_tweet portion).

In [61]:
def save_master_tweet_dict(dct, filename='coachella_tweets_master_1.json'):
    """Write ``dct`` as JSON to ``filename`` inside the module-level ``folder``.

    The data is first written to a temporary sibling file and then moved into
    place with ``os.replace``, so an interrupted run (e.g. KeyboardInterrupt)
    cannot leave a half-written master file behind.

    Parameters
    ----------
    dct : dict
        JSON-serialisable data to persist.
    filename : str, optional
        Output file name within ``folder``. Defaults to
        ``'coachella_tweets_master_1.json'``, the file the previous version
        always wrote to.
    """
    filepath = os.path.join(folder, filename)
    tmp_path = filepath + '.tmp'
    with open(tmp_path, mode='w', encoding = 'utf-8') as f:
        json.dump(dct, f)
    # Swap the finished file in only after the dump completed successfully.
    os.replace(tmp_path, filepath)


def merge_datasets(master_file, file_to_add, output_file=None):
    """Merge one or more per-chunk JSON files into the master dataset.

    The master is loaded once, every file to add is folded into it, and the
    result is saved once. On a duplicate user key the entry from the file
    being added replaces the entry already in the master.

    The previous version always saved to ``coachella_tweets_master_1.json``
    regardless of ``master_file``; when called in a loop with a different
    master name each call re-read the untouched master, so only the last
    file's data survived. The result is now written back to ``master_file``
    unless ``output_file`` is given.

    Parameters
    ----------
    master_file : str
        Name (inside ``folder``) of the JSON file holding the master dict.
    file_to_add : str or iterable of str
        A single file name, or several file names, to merge into the master.
    output_file : str, optional
        Name of the file to write the merged result to. Defaults to
        ``master_file``.
    """
    if isinstance(file_to_add, str):
        files_to_add = [file_to_add]
    else:
        files_to_add = list(file_to_add)

    merged = open_file(master_file)
    for name in files_to_add:
        merged.update(open_file(name))
        # Progress output: print the file just merged (not a module-level variable).
        print(name, sep = '', end='')

    save_master_tweet_dict(merged, master_file if output_file is None else output_file)
    

In [35]:
# Load the current master dataset from `folder` for a manual merge experiment.
master = open_file('coachella_tweets_master.json')

In [36]:
# Load a single weekend-2 chunk to try the merge on by hand.
add = open_file('coachella_tweets_weekend_2 20191107_195720.json')

In [40]:
# Dict-unpacking merge: for a key present in both, the value from `add` replaces the one from `master`.
new_master = {**master, **add}

In [38]:
# Check which Python type the loaded JSON was parsed into.
type(add)

dict

In [44]:
# Rebind the module-level `folder`; open_file and the save helpers read it on every call.
folder = os.path.join('C:/', 'users', 'charr', 'documents', 'Weekend 2') #weekend 2 folder

# Names of the regular files directly inside the new `folder`.
get_files = [entry for entry in os.listdir(folder) if os.path.isfile(os.path.join(folder, entry))]


#folder = os.path.join('C:/', 'users', 'charr', 'documents', 'Weekend 1') #weekend 1 folder
#merge_datasets('coachella_tweets_master.json', [f for f in listdir(folder) if isfile(join(folder, f))])


In [45]:
# Merge every file listed in get_files into the master dataset, one file per call.
for f in get_files:
    merge_datasets('coachella_tweets_master.json', f)

.............................................................................................................

KeyboardInterrupt: 

From here we have our base datasets for both weekends that we can really begin to preprocess!!

In [46]:
# Progress dots pasted from the interrupted merge run; presumably one dot per merged file - confirm.
done = '.............................................................................................................'

In [47]:
# Count the progress dots.
len(done)

109

In [54]:
# Skip the first 109 files (109 == len(done)); assumes get_files is in the same order as in the interrupted run - TODO confirm.
get_files_2 = get_files[109:]

In [55]:
# Number of files left after skipping the first 109.
len(get_files_2)

80

In [56]:
# Take the first 40 remaining files as a separate batch.
get_files_3 = get_files_2[:40]

In [57]:
# Drop the first 40 entries from get_files_2.
# Not idempotent: re-running this cell removes another 40 entries.
get_files_2 = get_files_2[40:]

In [60]:
# Inspect the file names currently held in get_files_2.
get_files_2

['coachella_tweets_weekend_2 20191108_193454.json',
 'coachella_tweets_weekend_2 20191108_193510.json',
 'coachella_tweets_weekend_2 20191108_193522.json',
 'coachella_tweets_weekend_2 20191108_193535.json',
 'coachella_tweets_weekend_2 20191108_193549.json',
 'coachella_tweets_weekend_2 20191108_193606.json',
 'coachella_tweets_weekend_2 20191108_193620.json',
 'coachella_tweets_weekend_2 20191108_193637.json',
 'coachella_tweets_weekend_2 20191108_193654.json',
 'coachella_tweets_weekend_2 20191108_193712.json',
 'coachella_tweets_weekend_2 20191108_193727.json',
 'coachella_tweets_weekend_2 20191108_193743.json',
 'coachella_tweets_weekend_2 20191108_193758.json',
 'coachella_tweets_weekend_2 20191108_193813.json',
 'coachella_tweets_weekend_2 20191108_193829.json',
 'coachella_tweets_weekend_2 20191108_193846.json',
 'coachella_tweets_weekend_2 20191108_193903.json',
 'coachella_tweets_weekend_2 20191108_193919.json',
 'coachella_tweets_weekend_2 20191108_193935.json',
 'coachella_

In [59]:
# Inspect the file names held in get_files_3.
get_files_3

['coachella_tweets_weekend_2 20191107_203604.json',
 'coachella_tweets_weekend_2 20191107_203620.json',
 'coachella_tweets_weekend_2 20191107_203633.json',
 'coachella_tweets_weekend_2 20191107_203649.json',
 'coachella_tweets_weekend_2 20191107_203704.json',
 'coachella_tweets_weekend_2 20191107_203719.json',
 'coachella_tweets_weekend_2 20191107_203735.json',
 'coachella_tweets_weekend_2 20191107_203749.json',
 'coachella_tweets_weekend_2 20191107_203805.json',
 'coachella_tweets_weekend_2 20191108_192651.json',
 'coachella_tweets_weekend_2 20191108_192707.json',
 'coachella_tweets_weekend_2 20191108_192722.json',
 'coachella_tweets_weekend_2 20191108_192737.json',
 'coachella_tweets_weekend_2 20191108_192751.json',
 'coachella_tweets_weekend_2 20191108_192805.json',
 'coachella_tweets_weekend_2 20191108_192821.json',
 'coachella_tweets_weekend_2 20191108_192838.json',
 'coachella_tweets_weekend_2 20191108_192855.json',
 'coachella_tweets_weekend_2 20191108_192909.json',
 'coachella_

In [65]:
# Merge the get_files_2 batch into coachella_tweets_master_1.json, one file per call.
for f in get_files_2:
    merge_datasets('coachella_tweets_master_1.json', f)

coachella_tweets_weekend_2 20191108_193454.jsoncoachella_tweets_weekend_2 20191108_193510.jsoncoachella_tweets_weekend_2 20191108_193522.jsoncoachella_tweets_weekend_2 20191108_193535.jsoncoachella_tweets_weekend_2 20191108_193549.jsoncoachella_tweets_weekend_2 20191108_193606.jsoncoachella_tweets_weekend_2 20191108_193620.jsoncoachella_tweets_weekend_2 20191108_193637.jsoncoachella_tweets_weekend_2 20191108_193654.jsoncoachella_tweets_weekend_2 20191108_193712.jsoncoachella_tweets_weekend_2 20191108_193727.jsoncoachella_tweets_weekend_2 20191108_193743.jsoncoachella_tweets_weekend_2 20191108_193758.jsoncoachella_tweets_weekend_2 20191108_193813.jsoncoachella_tweets_weekend_2 20191108_193829.jsoncoachella_tweets_weekend_2 20191108_193846.jsoncoachella_tweets_weekend_2 20191108_193903.jsoncoachella_tweets_weekend_2 20191108_193919.jsoncoachella_tweets_weekend_2 20191108_193935.jsoncoachella_tweets_weekend_2 20191108_193951.jsoncoachella_tweets_weekend_2 20191108_194007.jsoncoachella_twe

KeyboardInterrupt: 