In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import sys
import json
from datetime import datetime
from tqdm import tqdm
import pytz
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Training and Testing directories
training_dir = os.path.join("Datasets", "Training")
testing_dir = os.path.join("Datasets", "Testing")
if not os.path.isdir(training_dir):
    raise Exception("ERROR: training dataset not found")
if not os.path.isdir(testing_dir):
    raise Exception("ERROR: testing dataset not found")

In [3]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets\Training\tweets_#gohawks.txt
Datasets\Training\tweets_#gopatriots.txt
Datasets\Training\tweets_#nfl.txt
Datasets\Training\tweets_#patriots.txt
Datasets\Training\tweets_#sb49.txt
Datasets\Training\tweets_#superbowl.txt


In [4]:
# Initialize dictionaries and Unix times for Feb 1, 8 am and Feb 1, 8 pm. 
# Dictionary keys: hashtag.
# Dictionary values: [time of tweet (Unix), number of retweets for tweet, number of followers for tweeter]
# Each row in dictionary value is an individual tweet.

hashtag_dict_before = {}
hashtag_dict_during = {}
hashtag_dict_after = {}
start_unix_time = 1422806400 # 8 am, Feb 1, PST
end_unix_time = 1422849600 # 8 pm, Feb 1, PST
pst_tz = pytz.timezone('America/Los_Angeles')


In [5]:
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
        print('Parsing {}...'.format(filename))
        
        hashtag_dict_before[filename] = []
        hashtag_dict_during[filename] = []
        hashtag_dict_after[filename] = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get desired statistics
                citation_date = json_obj['citation_date'] # Unix time
                num_retweets = json_obj['metrics']['citations']['total'] # Number of retweets for this tweet
                num_followers = json_obj['author']['followers'] # Number of followers for tweeter
                
                # Check when tweet was made and add it to corresponding dictionary
                if citation_date < start_unix_time:
                    hashtag_dict_before[filename].append([citation_date, num_retweets, num_followers])
                elif citation_date > end_unix_time:
                    hashtag_dict_after[filename].append([citation_date, num_retweets, num_followers])
                else:
                    hashtag_dict_during[filename].append([citation_date, num_retweets, num_followers])
    print('done')

Parsing gohawks...
Parsing gopatriots...
Parsing nfl...
Parsing patriots...
Parsing sb49...
Parsing superbowl...
done


In [6]:
# Explicitly list hashtags. 
# Convert each value in dictionary to numpy arrays.

hashtags = ['gohawks', 'gopatriots', 'nfl', 'patriots', 'sb49', 'superbowl']

for key in hashtags:
    hashtag_dict_before[key] = np.array(hashtag_dict_before[key])
    hashtag_dict_during[key] = np.array(hashtag_dict_during[key])
    hashtag_dict_after[key] = np.array(hashtag_dict_after[key])

In [7]:
# Find how many time windows there are

ftt = int(np.min([np.min(hashtag_dict_before[key][:,0]) for key in hashtags])) # first tweet time
ltt = int(np.max([np.max(hashtag_dict_after[key][:,0]) for key in hashtags])) # last tweet time

num_windows_before = int(np.max([((start_unix_time - ftt) // 3600) + 1 for key in hashtags]))
num_windows_during = int(np.max([((end_unix_time - start_unix_time) // 3600 * 12) for key in hashtags]))
num_windows_after = int(np.max([((ltt - end_unix_time) // 3600) + 1 for key in hashtags]))


In [8]:
# Organize data into specific time frames.

# Initialize dictionary for each time frame.
data_hashtag_before = {}
data_hashtag_during = {}
data_hashtag_after = {}

# Iterate through each hashtag.
for key in hashtags:
    print(key)
    
    # Rename the dictionary value for readability
    temp_before = hashtag_dict_before[key]
    temp_during = hashtag_dict_during[key]
    temp_after = hashtag_dict_after[key]
    
    data_hashtag_before[key] = np.zeros((num_windows_before, 5)) # Initialize array: rows = time window, columns = feature
    num_followers_before = {} # Initialize dictionary to count # of followers for each tweet
    
    
    
    # Iterate through all elements before start time
    for i in range(np.shape(temp_before)[0]):
        # Get row number
        item_before = int(num_windows_before - 1 - ((start_unix_time - temp_before[i,0] - 1) // 3600))
        # Update first 3 elements (# of tweets, total # retweets, total # followers)
        data_hashtag_before[key][item_before] += np.array([1, int(temp_before[i, 1]), int(temp_before[i, 2]), 0, 0])
        # Get time of day (hour)
        dt_obj_pst = datetime.fromtimestamp(temp_before[i,0], pst_tz)
        data_hashtag_before[key][item_before][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        # Get number of followers
        if item_before not in num_followers_before.keys():
            num_followers_before[item_before] = []
        num_followers_before[item_before].append(temp_before[i,2])
    for i in num_followers_before.keys():
        data_hashtag_before[key][i][3] = np.max(num_followers_before[i])
        
        
    # Iterate through all elements during time
    data_hashtag_during[key] = np.zeros((num_windows_during, 5))
    num_followers_during = {}
    for i in range(np.shape(temp_during)[0]):
        item_during = int(((temp_during[i,0] - start_unix_time) * 12) // 3600)
        data_hashtag_during[key][item_during] += np.array([1, int(temp_during[i, 1]), int(temp_during[i, 2]), 0, 0])
        dt_obj_pst = datetime.fromtimestamp(temp_during[i,0], pst_tz)
        data_hashtag_during[key][item_during][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_during not in num_followers_during.keys():
            num_followers_during[item_during] = []
        num_followers_during[item_during].append(temp_during[i,2])
    for i in num_followers_during.keys():
        data_hashtag_during[key][i][3] = np.max(num_followers_during[i])
        
    # Iterate through all elements after end time
    data_hashtag_after[key] = np.zeros((num_windows_after, 5))
    num_followers_after = {}
    for i in range(np.shape(temp_after)[0]):
        item_after = int((temp_after[i,0] - end_unix_time) // 3600)
        data_hashtag_after[key][item_after] += np.array([1, int(temp_after[i, 1]), int(temp_after[i, 2]), 0, 0])
        dt_obj_pst = datetime.fromtimestamp(temp_after[i,0], pst_tz)
        data_hashtag_after[key][item_after][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_after not in num_followers_after.keys():
            num_followers_after[item_after] = []
        num_followers_after[item_after].append(temp_after[i,2])
    for i in num_followers_after.keys():
        data_hashtag_after[key][i][3] = np.max(num_followers_after[i])
        
print('done')

gohawks
gopatriots
nfl
patriots
sb49
superbowl
done


In [9]:
# Aggregate data

# Initialize aggregated data variables
data_aggregate_before = np.zeros([num_windows_before, 5])
data_aggregate_during = np.zeros([num_windows_during, 5])
data_aggregate_after = np.zeros([num_windows_after, 5])

# Sum the # of tweets, total # of retweets, and # of followers
for key in hashtags:
    data_aggregate_before[:,0:3] += data_hashtag_before[key][:,0:3]
    data_aggregate_during[:,0:3] += data_hashtag_during[key][:,0:3]
    data_aggregate_after[:,0:3] += data_hashtag_after[key][:,0:3]
# Find the max # of followers for each
data_aggregate_before[:,3] = np.amax([data_hashtag_before[key][:,3] for key in hashtags], axis=0)
data_aggregate_during[:,3] = np.amax([data_hashtag_during[key][:,3] for key in hashtags], axis=0)
data_aggregate_after[:,3] = np.amax([data_hashtag_after[key][:,3] for key in hashtags], axis=0)

# Copy over the same time frames
data_aggregate_before[:,4] = data_hashtag_before['superbowl'][:,4]
data_aggregate_during[:,4] = data_hashtag_during['superbowl'][:,4]
data_aggregate_after[:,4] = data_hashtag_after['superbowl'][:,4]

### Nonlinear Regressions: Ensemble methods

#### Question 10

In [15]:
# Perform GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold
import time

param_grid = {
    'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

In [42]:
def show_results(grid_search_cv, top_results=15):
    print("Top {} grid search scores on the basis of mean validation accuracy: ".format(top_results))
    print()
    means = grid_search_cv.cv_results_['mean_test_score']
    stds = grid_search_cv.cv_results_['std_test_score']
    params = grid_search_cv.cv_results_['params']
    
    np_rep = np.array([means, stds, params])
    np_rep = np_rep.T
    
    # sort this array
    sorted_gs = np_rep[(-np_rep[:,0]).argsort()]
    
    for i in range(top_results):
        mean, std, param = sorted_gs[i]
        print("%0.6f (+/-%0.06f) for %r" % (mean, std * 2, param))
    print()

    print("Optimal value of C: ")
    print()
    print(grid_search_cv.best_params_)
    print()

##### Analysis of data aggregated before Feb 1, 8:00 am

In [12]:
# Define train data and targets for BEFORE period
y_before = data_aggregate_before[1:,0] # Number of tweets (except first)
X_before = np.delete(data_aggregate_before, -1, 0) # Delete last row

print('X shape:', X_before.shape)
print('y shape:', y_before.shape)

X shape: (439, 5)
y shape: (439,)


In [18]:
# Random Forest regressor grid search
print('performing RF grid search...')

rf_cv_bef = GridSearchCV(RandomForestRegressor(), param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
rf_cv_bef.fit(X_before, y_before)

performing RF grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 10.3min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 14.2min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 18.9min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 23.8min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 29.4min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 35.4min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 41.9min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed: 42.0min finished


grid search took 2521.66 seconds


In [43]:
show_results(rf_cv_bef)

Top 15 grid search scores on the basis of mean validation accuracy: 

-3796709.438480 (+/-9260728.938884) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'n_estimators': 200, 'min_samples_split': 2}
-3798615.766410 (+/-8101520.055596) for {'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 80, 'n_estimators': 800, 'min_samples_split': 10}
-3801999.676896 (+/-8149097.663559) for {'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 80, 'n_estimators': 600, 'min_samples_split': 10}
-3806065.756227 (+/-9416810.046392) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'n_estimators': 600, 'min_samples_split': 5}
-3807089.707171 (+/-9319740.786582) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 100, 'n_estimators': 1000, 'min_samples_split': 5}
-3812964.958369 (+/-8141324.618744) for {'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': None, 'n_estimators': 1600, 'min_samples_split': 10}
-3814777.875293 (+/-92

In [44]:
# GradientBoostingRegressor regressor grid search
print('performing GB grid search...')

gb_cv_bef = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gb_cv_bef.fit(X_before, y_before)

performing GB grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   18.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   47.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  7.4min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  9.2min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 11.0min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 13.2min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed: 13.2min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [45]:
show_results(gb_cv_bef)

Top 15 grid search scores on the basis of mean validation accuracy: 

-4547351.762900 (+/-8608540.176647) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'n_estimators': 1400, 'min_samples_split': 2}
-4744032.427315 (+/-8974336.689770) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'n_estimators': 400, 'min_samples_split': 2}
-4752388.329574 (+/-9040748.516853) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'n_estimators': 400, 'min_samples_split': 2}
-4768935.994495 (+/-9300204.802032) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'n_estimators': 800, 'min_samples_split': 5}
-4774098.289003 (+/-9241963.989489) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'n_estimators': 1000, 'min_samples_split': 2}
-4792073.917406 (+/-9094682.522391) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'n_estimators': 1600, 'min_samples_split': 10}
-4824671.230657 (+/-88667

##### Analysis of data aggregated between Feb 1, 8:00 am and Feb 1, 8:00 pm

In [46]:
y_during = data_aggregate_during[1:,0]
X_during = np.delete(data_aggregate_during, -1, 0)

print('X shape:', X_during.shape)
print('y shape:', y_during.shape)

X shape: (143, 5)
y shape: (143,)


In [47]:
# Random Forest regressor grid search
print('performing RF grid search...')

rf_cv_dur = GridSearchCV(RandomForestRegressor(), param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
rf_cv_dur.fit(X_during, y_during)

performing RF grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   57.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  6.1min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 12.1min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 15.8min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 20.0min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 24.7min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 30.0min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 35.7min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed: 35.7min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [48]:
show_results(rf_cv_dur)

Top 15 grid search scores on the basis of mean validation accuracy: 

-22375832.235636 (+/-18126868.166056) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'n_estimators': 200, 'min_samples_split': 10}
-22402504.692351 (+/-17809279.100428) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'n_estimators': 800, 'min_samples_split': 10}
-22437809.501365 (+/-18625753.436627) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 60, 'n_estimators': 400, 'min_samples_split': 10}
-22459492.788931 (+/-18167911.195121) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'n_estimators': 1000, 'min_samples_split': 10}
-22495810.533135 (+/-17650250.880146) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 200, 'n_estimators': 200, 'min_samples_split': 10}
-22524647.640517 (+/-18239702.041868) for {'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'n_estimators': 200, 'min_samples_split': 2}
-22533590

In [49]:
# GradientBoostingRegressor regressor grid search
print('performing GB grid search...')

gb_cv_dur = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gb_cv_dur.fit(X_during, y_during)

performing GB grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 170 tasks      | elapsed:    7.8s
[Parallel(n_jobs=4)]: Done 620 tasks      | elapsed:   32.5s
[Parallel(n_jobs=4)]: Done 1344 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1694 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 2144 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 2694 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 3344 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done 4094 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 4944 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 5894 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done 6944 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed:  6.9min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [50]:
show_results(gb_cv_dur)

Top 15 grid search scores on the basis of mean validation accuracy: 

-26016035.436223 (+/-29443759.973838) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'n_estimators': 1600, 'min_samples_split': 5}
-26017401.691761 (+/-29074803.376647) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'n_estimators': 1000, 'min_samples_split': 2}
-26910814.492104 (+/-32117712.372015) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'n_estimators': 800, 'min_samples_split': 2}
-27086756.140145 (+/-26591957.649186) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'n_estimators': 1000, 'min_samples_split': 5}
-27329860.347148 (+/-30427611.225030) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'n_estimators': 600, 'min_samples_split': 2}
-27410617.355700 (+/-32148399.040868) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'n_estimators': 1800, 'min_samples_split': 2}
-27440592.

##### Analysis of data aggregated after Feb 1, 8:00 pm

In [51]:
y_after = data_aggregate_after[1:,0]
X_after = np.delete(data_aggregate_after, -1, 0)

print('X shape:', X_after.shape)
print('y shape:', y_after.shape)

X shape: (134, 5)
y shape: (134,)


In [52]:
# Random Forest regressor grid search
print('performing RF grid search...')

rf_cv_aft = GridSearchCV(RandomForestRegressor(), param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
rf_cv_aft.fit(X_after, y_after)

performing RF grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  6.8min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  9.7min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 13.3min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 17.3min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 21.6min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 26.3min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 31.4min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 37.0min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed: 37.1min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [53]:
show_results(rf_cv_aft)

Top 15 grid search scores on the basis of mean validation accuracy: 

-333882.199840 (+/-458509.946463) for {'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 20, 'n_estimators': 600, 'min_samples_split': 5}
-334992.322405 (+/-458854.509039) for {'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 100, 'n_estimators': 400, 'min_samples_split': 5}
-335075.909955 (+/-458333.138494) for {'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 60, 'n_estimators': 800, 'min_samples_split': 2}
-335243.886419 (+/-450289.921372) for {'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 40, 'n_estimators': 200, 'min_samples_split': 5}
-335596.151715 (+/-446269.424462) for {'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 20, 'n_estimators': 1600, 'min_samples_split': 2}
-335912.733266 (+/-436155.719666) for {'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 80, 'n_estimators': 200, 'min_samples_split': 2}
-336014.206119 (+/-458552.698273) for {'

In [54]:
# GradientBoostingRegressor regressor grid search
print('performing GB grid search...')

gb_cv_aft = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gb_cv_aft.fit(X_after, y_after)

performing GB grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    7.6s
[Parallel(n_jobs=4)]: Done 623 tasks      | elapsed:   30.8s
[Parallel(n_jobs=4)]: Done 1373 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 2423 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 3773 tasks      | elapsed:  3.2min
[Parallel(n_jobs=4)]: Done 5423 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 7193 out of 7200 | elapsed:  6.1min remaining:    0.3s
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed:  6.2min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [55]:
show_results(gb_cv_aft)

Top 15 grid search scores on the basis of mean validation accuracy: 

-358803.781378 (+/-323368.631638) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'n_estimators': 200, 'min_samples_split': 2}
-363906.688643 (+/-305824.223997) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'n_estimators': 600, 'min_samples_split': 10}
-366639.660541 (+/-296311.724936) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'n_estimators': 1200, 'min_samples_split': 5}
-368976.174527 (+/-280540.723398) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'n_estimators': 1600, 'min_samples_split': 2}
-369143.272186 (+/-265105.238344) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'n_estimators': 1200, 'min_samples_split': 10}
-369691.782298 (+/-328128.233905) for {'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'n_estimators': 200, 'min_samples_split': 10}
-369739.566646 (+/-303663.342094