In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import sys
import json
from datetime import datetime
from tqdm import tqdm
import pytz
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# training and Testing directories
training_dir = os.path.join("Datasets", "Training")
testing_dir = os.path.join("Datasets", "Testing")
if not os.path.isdir(training_dir):
    raise Exception("ERROR: training dataset not found")
if not os.path.isdir(testing_dir):
    raise Exception("ERROR: testing dataset not found")

In [3]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets/Training/tweets_#gohawks.txt
Datasets/Training/tweets_#gopatriots.txt
Datasets/Training/tweets_#nfl.txt
Datasets/Training/tweets_#patriots.txt
Datasets/Training/tweets_#sb49.txt
Datasets/Training/tweets_#superbowl.txt


In [4]:
# Initialize dictionaries and Unix times for Feb 1, 8 am and Feb 1, 8 pm. 
# Dictionary keys: hashtag.
# Dictionary values: [time of tweet (Unix), number of retweets for tweet, number of followers for tweeter]

hashtag_dict_before = {}
hashtag_dict_during = {}
hashtag_dict_after = {}
start_unix_time = 1422806400
end_unix_time = 1422849600
pst_tz = pytz.timezone('America/Los_Angeles')


In [5]:
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
        print('Parsing {}...'.format(filename))
        
        hashtag_dict_before[filename] = []
        hashtag_dict_during[filename] = []
        hashtag_dict_after[filename] = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get desired statistics
                citation_date = json_obj['citation_date'] # Unix time
                num_retweets = json_obj['metrics']['citations']['total'] # Number of retweets for this tweet
                num_followers = json_obj['author']['followers'] # Number of followers for tweeter
                
                if citation_date < start_unix_time:
                    hashtag_dict_before[filename].append([citation_date, num_retweets, num_followers])
                elif citation_date > end_unix_time:
                    hashtag_dict_after[filename].append([citation_date, num_retweets, num_followers])
                else:
                    hashtag_dict_during[filename].append([citation_date, num_retweets, num_followers])
    print('done')

Parsing gohawks...
Parsing gopatriots...
Parsing nfl...
Parsing patriots...
Parsing sb49...
Parsing superbowl...
done


In [6]:
# Sort each by time.

hashtags = ['gohawks', 'gopatriots', 'nfl', 'patriots', 'sb49', 'superbowl']

for key in hashtags:
    hashtag_dict_before[key] = np.array(hashtag_dict_before[key])
#     indices = np.argsort(hashtag_dict_before[key][:,0])
#     hashtag_dict_before[key] = hashtag_dict_before[key][indices]
    
    hashtag_dict_during[key] = np.array(hashtag_dict_during[key])
#     indices = np.argsort(hashtag_dict_during[key][:,0])
#     hashtag_dict_during[key] = hashtag_dict_during[key][indices]

    hashtag_dict_after[key] = np.array(hashtag_dict_after[key])
#     indices = np.argsort(hashtag_dict_after[key][:,0])
#     hashtag_dict_after[key] = hashtag_dict_after[key][indices]


In [7]:
# Find how many time windows there are

ftt = int(np.min([np.min(hashtag_dict_before[key][:,0]) for key in hashtags])) # first tweet time
ltt = int(np.max([np.max(hashtag_dict_after[key][:,0]) for key in hashtags])) # last tweet time

num_windows_before = int(np.max([((start_unix_time - ftt) // 3600) + 1 for key in hashtags]))
num_windows_during = int(np.max([((end_unix_time - start_unix_time) // 3600 * 12) for key in hashtags]))
num_windows_after = int(np.max([((ltt - end_unix_time) // 3600) + 1 for key in hashtags]))


In [8]:
data_hashtag_before = {}
data_hashtag_during = {}
data_hashtag_after = {}

for key in hashtags:
    print(key)
    
    # Rename the dictionary value for readability
    temp_before = hashtag_dict_before[key]
    temp_during = hashtag_dict_during[key]
    temp_after = hashtag_dict_after[key]
        
    # Iterate through all elements before start time
    data_hashtag_before[key] = np.zeros((num_windows_before, 5))
    num_followers_before = {}
    for i in range(np.shape(temp_before)[0]):
        item_before = int(num_windows_before - 1 - ((start_unix_time - temp_before[i,0] - 1) // 3600))
        data_hashtag_before[key][item_before] += np.array([1, int(temp_before[i, 1]), int(temp_before[i, 2]), 0, 0])
#         data_hashtag_before[key][item_before][3] = np.max(temp_before[:,2])
        dt_obj_pst = datetime.fromtimestamp(temp_before[i,0], pst_tz)
        data_hashtag_before[key][item_before][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_before not in num_followers_before.keys():
            num_followers_before[item_before] = []
        num_followers_before[item_before].append(temp_before[i,2])
    for i in num_followers_before.keys():
        data_hashtag_before[key][i][3] = np.max(num_followers_before[i])
        
    # Iterate through all elements during time
    data_hashtag_during[key] = np.zeros((num_windows_during, 5))
    num_followers_during = {}
    for i in range(np.shape(temp_during)[0]):
        item_during = int(((temp_during[i,0] - start_unix_time) * 12) // 3600)
        data_hashtag_during[key][item_during] += np.array([1, int(temp_during[i, 1]), int(temp_during[i, 2]), 0, 0])
#         data_hashtag_during[key][item_during][3] = np.max(temp_during[:,2])
        dt_obj_pst = datetime.fromtimestamp(temp_during[i,0], pst_tz)
        data_hashtag_during[key][item_during][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_during not in num_followers_during.keys():
            num_followers_during[item_during] = []
        num_followers_during[item_during].append(temp_during[i,2])
    for i in num_followers_during.keys():
        data_hashtag_during[key][i][3] = np.max(num_followers_during[i])
        
    # Iterate through all elements after end time
    data_hashtag_after[key] = np.zeros((num_windows_after, 5))
    num_followers_after = {}
    for i in range(np.shape(temp_after)[0]):
        item_after = int((temp_after[i,0] - end_unix_time) // 3600)
        data_hashtag_after[key][item_after] += np.array([1, int(temp_after[i, 1]), int(temp_after[i, 2]), 0, 0])
#         data_hashtag_after[key][item_after][3] = np.max(temp_after[:,2])
        dt_obj_pst = datetime.fromtimestamp(temp_after[i,0], pst_tz)
        data_hashtag_after[key][item_after][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_after not in num_followers_after.keys():
            num_followers_after[item_after] = []
        num_followers_after[item_after].append(temp_after[i,2])
    for i in num_followers_after.keys():
        data_hashtag_after[key][i][3] = np.max(num_followers_after[i])
        
print('done')

gohawks
gopatriots
nfl
patriots
sb49
superbowl
done


In [12]:
# Aggregate data

# Initialize aggregated data variables
data_aggregate_before = np.zeros([num_windows_before, 5])
data_aggregate_during = np.zeros([num_windows_during, 5])
data_aggregate_after = np.zeros([num_windows_after, 5])

# Sum the # of tweets, total # of retweets, and # of followers
for key in hashtags:
    data_aggregate_before[:,0:3] += data_hashtag_before[key][:,0:3]
    data_aggregate_during[:,0:3] += data_hashtag_during[key][:,0:3]
    data_aggregate_after[:,0:3] += data_hashtag_after[key][:,0:3]
# Find the max # of followers for each
data_aggregate_before[:,3] = np.amax([data_hashtag_before[key][:,3] for key in hashtags], axis=0)
data_aggregate_during[:,3] = np.amax([data_hashtag_during[key][:,3] for key in hashtags], axis=0)
data_aggregate_after[:,3] = np.amax([data_hashtag_after[key][:,3] for key in hashtags], axis=0)

# Copy over the same time frames
data_aggregate_before[:,4] = data_hashtag_before['superbowl'][:,4]
data_aggregate_during[:,4] = data_hashtag_during['superbowl'][:,4]
data_aggregate_after[:,4] = data_hashtag_after['superbowl'][:,4]

### Neural Network

In [14]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [15]:
layer_sizes = [(50, 50), (100, 100), (100, 100, 100), (100, 100, 100, 100), 10*(50,), 10*(100,)]

In [26]:
def analyze_nn(nn, X, y, layers):
    mses_per_layer = []
    kf = KFold(10)
    for trainset, testset in kf.split(X):
        X_train, y_train = X[trainset], y[trainset]
        X_test, y_test = X[testset], y[testset]
        nn.fit(X_train, y_train)
        predicted = nn.predict(X_test)
        mses_per_layer.append(mean_squared_error(y_test, predicted))
    #     print(mean_squared_error(y_test, predicted))
    avg_mse = np.mean(mses_per_layer)
    print('Layer size {} MSE:\n {}'.format(size, np.around(avg_mse, 2)))
    
    return avg_mse

#### No preprocessing

In [27]:
y_before = data_aggregate_before[1:,0]
X_before = np.delete(data_aggregate_before, -1, 0)

X = X_before
y = y_before

print('X shape:', X.shape)
print('y shape:', y.shape)

mses_before = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse = analyze_nn(nn, X, y, size)
    mses_before.append(avg_mse)

X shape: (439, 5)
y shape: (439,)
Layer size (50, 50) MSE:
 98285603.12
Layer size (100, 100) MSE:
 12849881.22
Layer size (100, 100, 100) MSE:
 24222995.55
Layer size (100, 100, 100, 100) MSE:
 19350538.66
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
 7834581.0
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
 7180733.52


In [28]:
y_during = data_aggregate_during[1:,0]
X_during = np.delete(data_aggregate_during, -1, 0)

X = X_during
y = y_during

print('X shape:', X.shape)
print('y shape:', y.shape)

mses_during = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse = analyze_nn(nn, X, y, size)
    mses_during.append(avg_mse)

X shape: (143, 5)
y shape: (143,)
Layer size (50, 50) MSE:
 18117620527339.48
Layer size (100, 100) MSE:
 102303823343.34
Layer size (100, 100, 100) MSE:
 486132967786.56
Layer size (100, 100, 100, 100) MSE:
 59044144675.32
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
 183715723.0
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
 176005234.96


In [29]:
y_after = data_aggregate_after[1:,0]
X_after = np.delete(data_aggregate_after, -1, 0)

X = X_after
y = y_after

print('X shape:', X.shape)
print('y shape:', y.shape)

mses_after = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse = analyze_nn(nn, X, y, size)
    mses_after.append(avg_mse)

X shape: (134, 5)
y shape: (134,)
Layer size (50, 50) MSE:
 156130187198.86
Layer size (100, 100) MSE:
 11184102016.2
Layer size (100, 100, 100) MSE:
 36013255090.97
Layer size (100, 100, 100, 100) MSE:
 20519256630.1
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
 5761544.24
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
 17859847.26


In [30]:
print(np.sqrt(mses_before))
print(np.sqrt(mses_during))
print(np.sqrt(mses_after))

[9913.90957773 3584.67309839 4921.68625066 4398.92471675 2799.03215447
 2679.68907204]
[4256479.82813727  319849.68867163  697232.3628365   242990.00941463
   13554.17732642   13266.6964598 ]
[395133.12591943 105754.91485602 189771.58662711 143245.44191737
   2400.32169447   4226.09125118]


#### Standard scaler preprocessing

In [20]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()

In [37]:
X = scaler.fit_transform(X_before)
y = y_before

print('X shape:', X.shape)
print('y shape:', y.shape)

mses_before = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse = analyze_nn(nn, X, y, size)
    mses_before.append(avg_mse)

X shape: (439, 5)
y shape: (439,)
Layer size (50, 50) MSE:
 6050233.73
Layer size (100, 100) MSE:
 5110956.82
Layer size (100, 100, 100) MSE:
 4751427.18
Layer size (100, 100, 100, 100) MSE:
 5020096.21
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
 5819653.3
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
 7057739.0


In [38]:
X = scaler.fit_transform(X_during)
y = y_during

print('X shape:', X.shape)
print('y shape:', y.shape)

mses_during = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse = analyze_nn(nn, X, y, size)
    mses_during.append(avg_mse)

X shape: (143, 5)
y shape: (143,)
Layer size (50, 50) MSE:
 303747463.08
Layer size (100, 100) MSE:
 288660097.72
Layer size (100, 100, 100) MSE:
 80446124.47
Layer size (100, 100, 100, 100) MSE:
 52269559.79
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
 27087474.1
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
 27929247.19


In [39]:
X = scaler.fit_transform(X_after)
y = y_after

print('X shape:', X.shape)
print('y shape:', y.shape)

mses_after = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse = analyze_nn(nn, X, y, size)
    mses_after.append(avg_mse)

X shape: (134, 5)
y shape: (134,)
Layer size (50, 50) MSE:
 5362373.3
Layer size (100, 100) MSE:
 3986058.6
Layer size (100, 100, 100) MSE:
 1768669.54
Layer size (100, 100, 100, 100) MSE:
 1441405.13
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
 1078216.0
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
 876148.24


In [40]:
print(np.sqrt(mses_before))
print(np.sqrt(mses_during))
print(np.sqrt(mses_after))

[2459.72228665 2260.74253704 2179.77686376 2240.55712    2412.39575917
 2656.64054704]
[17428.35227655 16989.99993298  8969.17635395  7229.76899966
  5204.56281528  5284.81288082]
[2315.67987834 1996.51160876 1329.91335651 1200.58532616 1038.3718009
  936.02790447]
