# Project 5


In [8]:
import json
import datetime
import pandas as pd
import numpy as np
import pytz
pst_tz = pytz.timezone('America/Los_Angeles')


import warnings
warnings.filterwarnings("ignore")

In [16]:
def tweet_to_df(tweet_d):
    df = pd.DataFrame(tweet_d, columns=['ID', 'Title', 'nFollowers', 'nReTweet','Ranking Score','impressions','TimeStamp'])
    df.sort_values(by='TimeStamp',inplace = True)
    return df

In [10]:
#extract Tweets

def extract_tweets_into_df(file):
    d = []
    for line in open(file):
        temp = []
        t = json.loads(line)
        temp.append(t['tweet']['id'])
        temp.append(t['title'])
        #temp.append(t['user'])
        temp.append(t['author']['followers'])
        temp.append(t['metrics']['citations']['total'])
        temp.append(t['metrics']['ranking_score'])
        temp.append(t['metrics']['impressions'])
        temp.append(datetime.datetime.fromtimestamp(t['citation_date'], pst_tz))
        d.append(temp)
    return tweet_to_df(d)



In [11]:
def get_average_tweet(df):
    tweets_by_hour = df.set_index('TimeStamp').groupby(pd.Grouper(freq='60Min'))

    count_by_hour = [len(val) for key, val in tweets_by_hour]
    total_hours = len(tweets_by_hour)
    return len(df) / total_hours
    
    

In [41]:
def get_X_y(df,window):
    num_features = 5
    
    tweets_by_hour = df.set_index('TimeStamp').groupby(pd.Grouper(freq='60Min'))
    
    total_hours = len(tweets_by_hour)

    X = np.zeros((total_hours, 5))
    for i, (key, val) in enumerate(tweets_by_hour):
            features = [len(val), val.nReTweet.sum(), val.nFollowers.sum(), val.nFollowers.max(), key.hour]
            X[i, :] = features
            
    y = X[:, 0][window:]
    # number of tweets is the output as well as the first feature - but have to shift by #window hours
    X = np.nan_to_num(X)
    X_window = np.zeros((total_hours - window, num_features * window))
    
    for i in range(total_hours - window):
        X_window[i, :] = np.concatenate([X[i+k, :] for k in range(window)])
        
    X_window = np.nan_to_num(X_window)
    return X_window, y
    

In [17]:
nfl_df = extract_tweets_into_df("data/tweets_#nfl.txt")
nfl_df.head()

Unnamed: 0,ID,Title,nFollowers,nReTweet,Ranking Score,impressions,TimeStamp
40,555273124673900545,#NFL #sweatshirts BRAND NEW NFL ADULT PITTSBUR...,3050.0,1,6.920297,3055,2015-01-14 00:00:04-08:00
41,555273126972383232,"Kid Writes Letter to all 32 NFL Teams, Only On...",145.0,1,4.044162,143,2015-01-14 00:00:04-08:00
42,555273167934337025,Premium Game Tickets for the NFL Wembley games...,3457.0,3,7.512117,3836,2015-01-14 00:00:14-08:00
43,555273550655811584,"Kid Writes Letter to all 32 NFL Teams, Only On...",10658.0,2,3.886465,12096,2015-01-14 00:01:45-08:00
44,555274347745914880,♦⌂ SEATTLE #SEAHAWKS 2-TONE #NFL TEAM BREAKAWA...,833.0,1,3.945911,854,2015-01-14 00:04:55-08:00


In [18]:
sb_df = extract_tweets_into_df("data/tweets_#superbowl.txt")
#sb_df.head()

In [19]:
sb49_df = extract_tweets_into_df("data/tweets_#sb49.txt")
#sb49_df.head()


In [20]:
pat_df = extract_tweets_into_df("data/tweets_#patriots.txt")
#pat_df.head()

In [21]:
hawks_df = extract_tweets_into_df("data/tweets_#gohawks.txt")
#hawks_df.head()

In [22]:
go_pat_df = extract_tweets_into_df("data/tweets_#gopatriots.txt")
#go_pat_df.head()

In [23]:
nfl_avg_tweet = get_average_tweet(nfl_df)
sb_avg_tweet = get_average_tweet(sb_df)
sb49_avg_tweet = get_average_tweet(sb49_df)
pat_avg_tweet = get_average_tweet(pat_df)
gohawks_avg_tweet = get_average_tweet(hawks_df)
gopat_avg_tweet = get_average_tweet(go_pat_df)
print("Average # tweet for NFL = %f"%nfl_avg_tweet)
print("Average # tweet for SB = %f"%sb_avg_tweet)
print("Average # tweet for SB 49 = %f"%sb49_avg_tweet)
print("Average # tweet for Patriot = %f"%pat_avg_tweet)
print("Average # tweet for Go Hawks = %f"%gohawks_avg_tweet)
print("Average # tweet for Go Patriot = %f"%gopat_avg_tweet)

Average # tweet for NFL = 396.971039
Average # tweet for SB = 2067.824532
Average # tweet for SB 49 = 1275.555746
Average # tweet for Patriot = 750.632027
Average # tweet for Go Hawks = 292.093264
Average # tweet for Go Patriot = 40.888696


In [25]:
frame = [nfl_df,sb_df,sb49_df,pat_df,hawks_df,go_pat_df]
#aggregated df
df = pd.concat(frame)



In [39]:
min_time = df.set_index('TimeStamp').index.min()
max_time = df.set_index('TimeStamp').index.max()
print (min_time)
print (max_time)

total_hours = (max_time - min_time)
total_hours = total_hours.to_timedelta64().astype('timedelta64[h]') + 1
print(total_hours)

2015-01-14 00:00:04-08:00
2015-02-07 10:55:36-08:00
587 hours


In [42]:
X,Y = get_X_y(df, 1)

In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

param_grid = {"n_estimators": [200, 400, 600, 800, 1000,1200, 1400, 1600, 1800, 2000],
    "max_depth": [10, 20, 40, 60, 80, 100, 200, None],
    "max_features": ['auto', 'sqrt'],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10]
}

model = RandomForestRegressor(random_state=0)

In [49]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(5, shuffle=True),scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

print(grid.best_score_)

-294029187.03
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}


In [None]:
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}

In [50]:
regressor = RandomForestRegressor(random_state=0, n_estimators=1000, max_depth=20, max_features='sqrt', 
                                  min_samples_leaf=2, min_samples_split=5)
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [51]:
regressor.score(X_test, y_test) 

0.83614297785463843

In [52]:
y_pred = regressor.predict(X_test)

In [58]:
from sklearn import metrics
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse) 

MSE: 98875098.0574


In [60]:
from sklearn.ensemble import GradientBoostingRegressor

GBmodel = GradientBoostingRegressor(random_state=0)
GBgrid = GridSearchCV(estimator=GBmodel, param_grid=param_grid, cv=KFold(5, shuffle=True),scoring='neg_mean_squared_error')
GBgrid.fit(X_train, y_train)


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [61]:
print(GBgrid.best_score_)
print(GBgrid.best_params_)

-405275402.696
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}


In [63]:
GBregressor = GradientBoostingRegressor(random_state=0, n_estimators=200, max_depth=20, max_features='sqrt', 
                                  min_samples_leaf=10, min_samples_split=2)
GBregressor.fit(X_train, y_train)
GBregressor.score(X_test, y_test)


0.67733631277709594

In [65]:
y_pred = GBregressor.predict(X_test)

In [66]:
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)

MSE: 194702694.4346
