In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import sys
import json
from datetime import datetime
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# training and Testing directories
training_dir = os.path.join("Datasets", "Training")
testing_dir = os.path.join("Datasets", "Testing")
if not os.path.isdir(training_dir):
    raise Exception("ERROR: training dataset not found")
if not os.path.isdir(testing_dir):
    raise Exception("ERROR: testing dataset not found")

## Part 1: Analyzing the data

Size/RAM experiments: loading all training as lists requires approximately 10.3 GB of RAM. It is suggested to only extract the features you need each time

In [2]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets\Training\tweets_#gohawks.txt
Datasets\Training\tweets_#gopatriots.txt
Datasets\Training\tweets_#nfl.txt
Datasets\Training\tweets_#patriots.txt
Datasets\Training\tweets_#sb49.txt
Datasets\Training\tweets_#superbowl.txt


##### Question 1: Get statistics

In [6]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        # the name of the key
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
        print('Parsing {}...'.format(filename))
        
        # only extracting specific features from the tweet json objects
        citation_dates = []
        followers = []
        retweets = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get citation date
                citation_date = json_obj['citation_date']
                citation_dates.append(citation_date)
                
                # get number of followers for the author of eath tweet
                follower_count = json_obj['author']['followers']
                followers.append(follower_count)
                
                # get retweets
                retweet_total = json_obj['metrics']['citations']['total']
                retweets.append(retweet_total)
        
        # processing citation feature
        print('\t'+'-'*10)
        citation_dates = np.array(citation_dates)
        print('\tnumber of tweets in period: {}'.format(len(citation_dates)))
        min_date = np.min(citation_dates)
        max_date = np.max(citation_dates)
        
        span_hours = (max_date - min_date)/60
        print('\tnumber of hours in period: {}'.format(span_hours))
        
        tweets_per_hour = len(citation_dates) / span_hours
        print('\taverage tweets per hour: {}'.format(tweets_per_hour))
        
        # processing followers feature
        print('\t'+'-'*10)
        followers = np.array(followers)
        follower_avg = np.average(followers)
        print('\taverage followers per tweeter: {}'.format(follower_avg))
        
        # processing retweets feature
        print('\t'+'-'*10)
        retweets = np.array(retweets)
        retweet_avg = np.average(retweets)
        print('\taverage retweets per tweet: {}'.format(retweet_avg))
        
        print('\t'+'-'*10)
        print('')

Parsing gohawks...
	----------
	number of tweets in period: 169122
	number of hours in period: 34693.13333333333
	average tweets per hour: 4.874797510362281
	----------
	average followers per tweeter: 2217.9237355281984
	----------
	average retweets per tweet: 2.0132093991319877
	----------

Parsing gopatriots...
	----------
	number of tweets in period: 23511
	number of hours in period: 34444.4
	average tweets per hour: 0.6825783001010324
	----------
	average followers per tweeter: 1427.2526051635405
	----------
	average retweets per tweet: 1.4081919101697078
	----------

Parsing nfl...
	----------
	number of tweets in period: 233022
	number of hours in period: 35215.53333333333
	average tweets per hour: 6.617023169699735
	----------
	average followers per tweeter: 4662.37544523693
	----------
	average retweets per tweet: 1.5344602655543254
	----------

Parsing patriots...
	----------
	number of tweets in period: 440621
	number of hours in period: 35207.7
	average tweets per hour: 12.5