In [65]:
import json
import datetime
import pandas as pd
import numpy as np
import pytz
pst_tz = pytz.timezone('America/Los_Angeles')


import warnings
warnings.filterwarnings("ignore")

In [66]:
def get_X_y(df,window):
    """Build sliding-window hourly features and next-hour tweet counts.

    Tweets are bucketed into consecutive 60-minute bins on ``TimeStamp``.
    Each bin is summarised by five features, in this order:
    number of tweets, sum of ``nReTweet``, sum of ``nFollowers``,
    maximum of ``nFollowers`` and the hour-of-day of the bin label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``TimeStamp`` (datetime-like),
        ``nReTweet`` and ``nFollowers``.
    window : int
        Number of consecutive hourly bins concatenated into one sample.

    Returns
    -------
    X_window : numpy.ndarray, shape (total_hours - window, 5 * window)
        Row ``i`` holds the features of bins ``i .. i + window - 1``.
    y : numpy.ndarray, shape (total_hours - window,)
        Tweet count of bin ``i + window`` (the hour right after the window).

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1 or larger than the number of bins.
    """
    num_features = 5

    if window < 1:
        raise ValueError("window must be at least 1 hour, got %r" % (window,))

    tweets_by_hour = df.set_index('TimeStamp').groupby(pd.Grouper(freq='60Min'))
    total_hours = len(tweets_by_hour)

    if window > total_hours:
        raise ValueError(
            "window (%r) exceeds the number of hourly bins (%d)" % (window, total_hours)
        )

    X = np.zeros((total_hours, num_features))
    for i, (key, val) in enumerate(tweets_by_hour):
        X[i, :] = [len(val), val.nReTweet.sum(), val.nFollowers.sum(), val.nFollowers.max(), key.hour]

    # Bins without tweets yield NaN for the max; treat those as 0.
    X = np.nan_to_num(X)

    # number of tweets is the output as well as the first feature - but have to shift by #window hours
    y = X[window:, 0]

    n_samples = total_hours - window
    X_window = np.zeros((n_samples, num_features * window))
    for i in range(n_samples):
        # Flattening the row slice equals concatenating the window's feature rows.
        X_window[i, :] = X[i:i + window, :].ravel()

    return X_window, y

In [67]:
# Reference instants on 2015-02-01 at 08:00 and 20:00 US Pacific time.
# The naive datetimes are localized with pst_tz directly: calling .timestamp()
# on a naive datetime interprets it in the machine's local time zone, which
# only gives 08:00/20:00 Pacific when the notebook runs on a Pacific-time host.
piece_time_pst = pst_tz.localize(datetime.datetime(2015, 2, 1, 8, 0))
piece_time = piece_time_pst.timestamp()
piece_time_2_pst = pst_tz.localize(datetime.datetime(2015, 2, 1, 20, 0))
piece_time_2 = piece_time_2_pst.timestamp()

In [68]:
def tweet_to_df(tweet_d):
    """Turn a list of tweet rows into a DataFrame ordered by ``TimeStamp``.

    Each row supplies, in order: ID, Title, nFollowers, nReTweet,
    Ranking Score, impressions and TimeStamp. The original row labels are
    kept, so the returned index reflects the pre-sort positions.
    """
    column_names = ['ID', 'Title', 'nFollowers', 'nReTweet','Ranking Score','impressions','TimeStamp']
    unsorted_tweets = pd.DataFrame(tweet_d, columns=column_names)
    return unsorted_tweets.sort_values(by='TimeStamp')

In [69]:
def extract_tweets_into_df(file):
    """Parse a file of line-delimited JSON tweets into a DataFrame.

    Every non-blank line is decoded with ``json.loads`` and reduced to the
    fields: tweet id, title, author follower count, total citations,
    ranking score, impressions and the ``citation_date`` converted to a
    timezone-aware datetime in ``pst_tz``.

    Parameters
    ----------
    file : str or path-like
        Path of the tweet file, one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        The result of ``tweet_to_df`` applied to the extracted rows.

    Raises
    ------
    KeyError
        If a tweet object lacks one of the expected fields.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    rows = []
    # The context manager guarantees the handle is closed even if parsing fails;
    # the encoding is pinned so decoding does not depend on the platform default.
    with open(file, encoding='utf-8') as tweet_file:
        for line in tweet_file:
            # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
            if not line.strip():
                continue
            t = json.loads(line)
            tweet_metrics = t['metrics']
            rows.append([
                t['tweet']['id'],
                t['title'],
                t['author']['followers'],
                tweet_metrics['citations']['total'],
                tweet_metrics['ranking_score'],
                tweet_metrics['impressions'],
                datetime.datetime.fromtimestamp(t['citation_date'], pst_tz),
            ])
    return tweet_to_df(rows)



In [70]:
# Parse data/tweets_#nfl.txt into a DataFrame sorted by TimeStamp.
nfl_df = extract_tweets_into_df("data/tweets_#nfl.txt")

In [71]:
# Parse data/tweets_#superbowl.txt into a DataFrame sorted by TimeStamp.
sb_df = extract_tweets_into_df("data/tweets_#superbowl.txt")

In [72]:
# Parse data/tweets_#sb49.txt into a DataFrame sorted by TimeStamp.
sb49_df = extract_tweets_into_df("data/tweets_#sb49.txt")

In [73]:
# Parse data/tweets_#patriots.txt into a DataFrame sorted by TimeStamp.
pat_df = extract_tweets_into_df("data/tweets_#patriots.txt")

In [74]:
# Parse data/tweets_#gohawks.txt into a DataFrame sorted by TimeStamp.
hawks_df = extract_tweets_into_df("data/tweets_#gohawks.txt")

In [75]:
# Parse data/tweets_#gopatriots.txt into a DataFrame sorted by TimeStamp.
go_pat_df = extract_tweets_into_df("data/tweets_#gopatriots.txt")

In [76]:
# Collect the six per-hashtag DataFrames so they can be stacked in one call.
frame = [nfl_df,sb_df,sb49_df,pat_df,hawks_df,go_pat_df]
# Aggregated df: rows of all hashtags stacked vertically. Each frame keeps its
# own row labels, so index values may repeat in the result.
df = pd.concat(frame)

In [77]:
# Earliest and latest tweet timestamps across the aggregated frame.
min_time = df['TimeStamp'].min()
max_time = df['TimeStamp'].max()
print(min_time)
print(max_time)

# Elapsed span truncated to whole hours, plus one.
total_hours = (max_time - min_time).to_timedelta64().astype('timedelta64[h]') + 1
print(total_hours)

2015-01-14 00:00:04-08:00
2015-02-07 10:55:36-08:00
587 hours


In [78]:
# Build the design matrix with a 1-hour window: each row holds one hour's
# features and Y is the tweet count of the following hour.
X,Y = get_X_y(df, 1)

In [79]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state=0 makes the split repeatable.
# NOTE(review): train_test_split shuffles by default, so test hours are
# interleaved with training hours rather than strictly later — confirm this
# is intended for time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [80]:
from sklearn.neural_network import MLPRegressor
from sklearn  import metrics



In [34]:
def computeMLP(mlp, X_train, y_train,X_test, y_test):
    """Fit ``mlp`` on the training split and return its test-set MSE.

    Parameters
    ----------
    mlp : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and targets passed to ``mlp.fit``.
    X_test, y_test
        Held-out features and targets used for the error estimate.

    Returns
    -------
    float
        ``metrics.mean_squared_error(y_test, mlp.predict(X_test))``.
    """
    mlp.fit(X_train, y_train)
    # Predict exactly once: a discarded score() call and a second predict()
    # would only repeat the same forward pass without affecting the result.
    y_pred = mlp.predict(X_test)
    return metrics.mean_squared_error(y_test, y_pred)

### Default MLP

1 hidden layer with 100 hidden units

In [40]:
# Baseline model: MLPRegressor with every hyper-parameter left at its default.
# The bare `mlp` expression displays the resulting configuration.
# NOTE(review): no random_state is passed, so the fitted weights (and the MSE
# reported further down) can differ from run to run.
mlp = MLPRegressor()
mlp


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [41]:
# Fit the current `mlp` on the training split and report its test-set MSE.
mse = computeMLP(mlp,X_train, y_train,X_test, y_test )
print("MSE Value of arch #1 %.4f"%mse) 

MSE Value of arch #1 40501079369.6721


### 2nd Architecture
`hidden_layer_sizes=(10,10,10)` gives 3 hidden layers with 10 hidden units each.

In [45]:
# Three hidden layers of 10 units each; all other hyper-parameters stay at their defaults.
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10))

In [46]:
# Fit the current `mlp` on the training split and report its test-set MSE.
mse = computeMLP(mlp,X_train, y_train,X_test, y_test )
print("MSE Value of arch #2 %.4f"%mse)

MSE Value of arch #2 3879255768246.4858


### 3rd Architecture
For an architecture 56:25:11:7:5:3:1 (56 inputs and 1 output), the hidden layers are 25:11:7:5:3, so the tuple is hidden_layer_sizes = (25,11,7,5,3,).

In [47]:
# Five hidden layers with 25, 11, 7, 5 and 3 units; fit on the training split
# and report the test-set MSE.
mlp = MLPRegressor(hidden_layer_sizes=(25,11,7,5,3,))
mse = computeMLP(mlp,X_train, y_train,X_test, y_test )
print("MSE Value of arch #3 %.4f"%mse)

MSE Value of arch #3 629122112.8263


### 4th Architecture
For an architecture 3:45:2:11:2 (3 inputs and 2 outputs), the hidden layers are 45:2:11, so the tuple is hidden_layer_sizes = (45,2,11,).

In [48]:
# Three hidden layers with 45, 2 and 11 units; fit on the training split and
# report the test-set MSE.
mlp = MLPRegressor(hidden_layer_sizes=(45,2,11,))
mse = computeMLP(mlp,X_train, y_train,X_test, y_test )
print("MSE Value of arch #4 %.4f"%mse)

MSE Value of arch #4 629108381.9237


### 5th Architecture
3 hidden layers with 30 hidden units each, i.e. hidden_layer_sizes = (30,30,30).



In [58]:
# Three hidden layers of 30 units each; fit on the training split and report
# the test-set MSE.
mlp = MLPRegressor(hidden_layer_sizes=(30,30,30))
mse = computeMLP(mlp,X_train, y_train,X_test, y_test )
print("MSE Value of arch #5 %.4f"%mse)

MSE Value of arch #5 186745293.9044


# Scaling of data on 5th Architecture

In [81]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data, so the scaling statistics are computed
# without looking at the held-out test samples.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [82]:

# Apply the scaling fitted on the training split to both splits.
# NOTE(review): X_train / X_test are overwritten, so running this cell a
# second time would transform already-transformed data.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [83]:
# Refit whatever model `mlp` currently refers to, now on the scaled features,
# and report its test-set MSE.
mse = computeMLP(mlp,X_train, y_train,X_test, y_test )
print("MSE Value of arch #5 %.4f"%mse)

MSE Value of arch #5 154110962.9271


In [84]:
# Predict on the test split with the already-fitted model.
# `predictions` and `y_pred` both hold the result of mlp.predict(X_test).
predictions = mlp.predict(X_test)

y_pred = mlp.predict(X_test)
# Mean squared error between the true test targets and the predictions.
mse = metrics.mean_squared_error(y_test, y_pred)


print("MSE: %.4f" % mse)





MSE: 154110962.9271
