# Q10 Before Superbowl

In [6]:
import json
import datetime
import pandas as pd
import numpy as np
import pytz
pst_tz = pytz.timezone('America/Los_Angeles')


import warnings
warnings.filterwarnings("ignore")

In [7]:
def get_X_y(df,window):
    num_features = 5
    
    tweets_by_hour = df.set_index('TimeStamp').groupby(pd.Grouper(freq='60Min'))
    
    total_hours = len(tweets_by_hour)

    X = np.zeros((total_hours, 5))
    for i, (key, val) in enumerate(tweets_by_hour):
            features = [len(val), val.nReTweet.sum(), val.nFollowers.sum(), val.nFollowers.max(), key.hour]
            X[i, :] = features
            
    y = X[:, 0][window:]
    # number of tweets is the output as well as the first feature - but have to shift by #window hours
    X = np.nan_to_num(X)
    X_window = np.zeros((total_hours - window, num_features * window))
    
    for i in range(total_hours - window):
        X_window[i, :] = np.concatenate([X[i+k, :] for k in range(window)])
        
    X_window = np.nan_to_num(X_window)
    return X_window, y

In [33]:
piece_time = datetime.datetime(2015, 2, 1, 8, 0).timestamp()
piece_time_pst = datetime.datetime.fromtimestamp(piece_time, pst_tz)
piece_time_2 = datetime.datetime(2015, 2, 1, 20, 0).timestamp()
piece_time_2_pst = datetime.datetime.fromtimestamp(piece_time_2, pst_tz)

In [10]:
def tweet_to_df(tweet_d):
    df = pd.DataFrame(tweet_d, columns=['ID', 'Title', 'nFollowers', 'nReTweet','Ranking Score','impressions','TimeStamp'])
    df.sort_values(by='TimeStamp',inplace = True)
    return df

In [9]:
def extract_tweets_into_df(file):
    d = []
    for line in open(file):
        temp = []
        t = json.loads(line)
        time = datetime.datetime.fromtimestamp(t['citation_date'], pst_tz)
        if(time < piece_time_pst ):
            temp.append(t['tweet']['id'])
            temp.append(t['title'])
            #temp.append(t['user'])
            temp.append(t['author']['followers'])
            temp.append(t['metrics']['citations']['total'])
            temp.append(t['metrics']['ranking_score'])
            temp.append(t['metrics']['impressions'])
            temp.append(datetime.datetime.fromtimestamp(t['citation_date'], pst_tz))
            d.append(temp)
    return tweet_to_df(d)



In [11]:
nfl_df = extract_tweets_into_df("data/tweets_#nfl.txt")

In [12]:
sb_df = extract_tweets_into_df("data/tweets_#superbowl.txt")

In [13]:
sb49_df = extract_tweets_into_df("data/tweets_#sb49.txt")

In [14]:
pat_df = extract_tweets_into_df("data/tweets_#patriots.txt")

In [15]:
hawks_df = extract_tweets_into_df("data/tweets_#gohawks.txt")

In [16]:
go_pat_df = extract_tweets_into_df("data/tweets_#gopatriots.txt")

In [17]:
frame = [nfl_df,sb_df,sb49_df,pat_df,hawks_df,go_pat_df]
#aggregated df
df = pd.concat(frame)

In [23]:
min_time = df.set_index('TimeStamp').index.min()
max_time = df.set_index('TimeStamp').index.max()
print (min_time)
print (max_time)

total_hours = (max_time - min_time)
total_hours = total_hours.to_timedelta64().astype('timedelta64[h]') + 1
print(total_hours)

2015-01-14 00:00:04-08:00
2015-02-01 07:59:59-08:00
440 hours


In [19]:
X,Y = get_X_y(df, 1)

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

param_grid = {"n_estimators": [200, 400, 600, 800, 1000,1200, 1400, 1600, 1800, 2000],
    "max_depth": [10, 20, 40, 60, 80, 100, 200, None],
    "max_features": ['auto', 'sqrt'],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10]
}

model = GradientBoostingRegressor(random_state=0)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [24]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(5, shuffle=True),scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [27]:
print(grid.best_score_)
print(grid.best_params_)

-4428349.08819
{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}


In [28]:
GBregressor = GradientBoostingRegressor(random_state=0, n_estimators=200, max_depth=10, max_features='auto', 
                                  min_samples_leaf=10, min_samples_split=2)
GBregressor.fit(X_train, y_train)
GBregressor.score(X_test, y_test)

0.75166805142689175

In [30]:
y_pred = GBregressor.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)

MSE: 2090700.0147


# Aggregated Data - During Super Bowl (5 mins window)

In [31]:
def extract_tweets_into_df(file):
    d = []
    for line in open(file):
        temp = []
        t = json.loads(line)
        time = datetime.datetime.fromtimestamp(t['citation_date'], pst_tz)
        if(time > piece_time_pst and time < piece_time_2_pst):
            temp.append(t['tweet']['id'])
            temp.append(t['title'])
            #temp.append(t['user'])
            temp.append(t['author']['followers'])
            temp.append(t['metrics']['citations']['total'])
            temp.append(t['metrics']['ranking_score'])
            temp.append(t['metrics']['impressions'])
            temp.append(datetime.datetime.fromtimestamp(t['citation_date'], pst_tz))
            d.append(temp)
    return tweet_to_df(d)



In [34]:
nfl_df = extract_tweets_into_df("data/tweets_#nfl.txt")

In [35]:
sb_df = extract_tweets_into_df("data/tweets_#superbowl.txt")
sb49_df = extract_tweets_into_df("data/tweets_#sb49.txt")
pat_df = extract_tweets_into_df("data/tweets_#patriots.txt")
hawks_df = extract_tweets_into_df("data/tweets_#gohawks.txt")
go_pat_df = extract_tweets_into_df("data/tweets_#gopatriots.txt")


In [36]:
frame = [nfl_df,sb_df,sb49_df,pat_df,hawks_df,go_pat_df]
#aggregated df
df = pd.concat(frame)

min_time = df.set_index('TimeStamp').index.min()
max_time = df.set_index('TimeStamp').index.max()
print (min_time)
print (max_time)

total_hours = (max_time - min_time)
total_hours = total_hours.to_timedelta64().astype('timedelta64[h]') + 1
print(total_hours)

2015-02-01 08:00:01-08:00
2015-02-01 19:59:59-08:00
12 hours


In [37]:
X,Y = get_X_y(df, 5)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)


In [39]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(5, shuffle=True),scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)




GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [40]:
print(grid.best_score_)
print(grid.best_params_)


-7868149821.4
{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}


In [41]:
GBregressor = GradientBoostingRegressor(random_state=0, n_estimators=1000, max_depth=10, max_features='sqrt', 
                                  min_samples_leaf=2, min_samples_split=2)
GBregressor.fit(X_train, y_train)
GBregressor.score(X_test, y_test)

-0.93462157095302656

In [42]:
y_pred = GBregressor.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)

MSE: 8613113559.5608


# After Super Bowl 

In [44]:
def extract_tweets_into_df(file):
    d = []
    for line in open(file):
        temp = []
        t = json.loads(line)
        time = datetime.datetime.fromtimestamp(t['citation_date'], pst_tz)
        if(time > piece_time_2_pst):
            temp.append(t['tweet']['id'])
            temp.append(t['title'])
            #temp.append(t['user'])
            temp.append(t['author']['followers'])
            temp.append(t['metrics']['citations']['total'])
            temp.append(t['metrics']['ranking_score'])
            temp.append(t['metrics']['impressions'])
            temp.append(datetime.datetime.fromtimestamp(t['citation_date'], pst_tz))
            d.append(temp)
    return tweet_to_df(d)

In [45]:
nfl_df = extract_tweets_into_df("data/tweets_#nfl.txt")
sb_df = extract_tweets_into_df("data/tweets_#superbowl.txt")
sb49_df = extract_tweets_into_df("data/tweets_#sb49.txt")
pat_df = extract_tweets_into_df("data/tweets_#patriots.txt")
hawks_df = extract_tweets_into_df("data/tweets_#gohawks.txt")
go_pat_df = extract_tweets_into_df("data/tweets_#gopatriots.txt")

In [46]:
frame = [nfl_df,sb_df,sb49_df,pat_df,hawks_df,go_pat_df]
#aggregated df
df = pd.concat(frame)

min_time = df.set_index('TimeStamp').index.min()
max_time = df.set_index('TimeStamp').index.max()
print (min_time)
print (max_time)

total_hours = (max_time - min_time)
total_hours = total_hours.to_timedelta64().astype('timedelta64[h]') + 1
print(total_hours)

2015-02-01 20:00:01-08:00
2015-02-07 10:55:36-08:00
135 hours


In [49]:
X,Y = get_X_y(df, 1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(5, shuffle=True),scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [48]:
print(grid.best_score_)
print(grid.best_params_)



-394456.209
{'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 200}


In [50]:
GBregressor = GradientBoostingRegressor(random_state=0, n_estimators=200, max_depth=20, max_features='auto', 
                                  min_samples_leaf=5, min_samples_split=2)
GBregressor.fit(X_train, y_train)
GBregressor.score(X_test, y_test)

0.92813335568695532

In [51]:
y_pred = GBregressor.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)

MSE: 211126.2024
