# Q14

### Load and preprocess data 

In [25]:
import json
import os

# Directory holding one "tweets_<hashtag>.txt" file per hashtag; each line of a
# file is one JSON-encoded tweet.
path = "./tweet_data"
files = os.listdir(path)
print(files)

# Maps hashtag (e.g. "#nfl") -> list of decoded tweet dicts, in file order.
all_tweets = {}
for file in files:
    full_path = os.path.join(path, file)
    # Test the joined path: a bare entry name would be resolved against the
    # current working directory, not against `path`.
    if os.path.isdir(full_path):
        continue
    # Strip the "tweets_" prefix and ".txt" suffix: "tweets_#nfl.txt" -> "#nfl".
    key = file[7:-4]
    # A single handle, closed by the context manager even if decoding fails.
    with open(full_path) as f:
        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
        all_tweets[key] = [json.loads(line) for line in f if line.strip()]


['tweets_#gohawks.txt', 'tweets_#gopatriots.txt', 'tweets_#nfl.txt', 'tweets_#patriots.txt', 'tweets_#sb49.txt', 'tweets_#superbowl.txt']


### Feature extraction in each time period

In [None]:
import datetime, time
import pytz

def feature_ext_period(start_time, end_time, tweetfile):
    """Aggregate five basic per-hour features for tweets inside a time window.

    Only tweets whose ``citation_date`` falls inside the hourly buckets that
    start at ``start_time`` are counted; other tweets in ``tweetfile`` are
    ignored.

    Args:
        start_time: Unix timestamp (seconds) of the first hourly bucket.
        end_time: Unix timestamp (seconds) of the last hourly bucket.
        tweetfile: Iterable of tweet dicts providing ``citation_date``,
            ``tweet.retweet_count`` and ``author.followers``.

    Returns:
        A tuple ``(index_timestamp, X_features, y_label)``:

        * ``index_timestamp`` -- start timestamp of every bucket except the
          last one.
        * ``X_features`` -- five lists (tweet count, retweet total, follower
          total, follower maximum, hour of day in US/Pacific), each aligned
          with ``index_timestamp``. The hour of day is ``None`` for buckets
          that contain no tweet.
        * ``y_label`` -- tweet count of the *following* bucket for every entry
          of ``index_timestamp``.
    """
    # Floor division keeps the bucket count an int on Python 3 as well, where
    # `/` would produce a float that range() and list indexing reject.
    total_hours = int((end_time - start_time) // 3600) + 1
    # Equals end_time + 3600 for an hour-aligned window, and never admits a
    # tweet whose bucket index would fall outside the lists below.
    window_end = start_time + total_hours * 3600

    tweets_totalnum_perhour = [0] * total_hours
    retweets_num_perhour = [0] * total_hours
    followers_totalnum_perhour = [0] * total_hours
    followers_maxnum_perhour = [0] * total_hours
    time_of_day_perhour = [None] * total_hours
    timestamp_perhour = [start_time + 3600 * i for i in range(total_hours)]
    pst_tz = pytz.timezone('US/Pacific')

    for tweet in tweetfile:
        tweet_time = tweet['citation_date']
        if not (start_time <= tweet_time < window_end):
            continue
        hour = int((tweet_time - start_time) // 3600)
        followers = tweet['author']['followers']
        tweets_totalnum_perhour[hour] += 1
        retweets_num_perhour[hour] += tweet['tweet']['retweet_count']
        followers_totalnum_perhour[hour] += followers
        followers_maxnum_perhour[hour] = max(followers_maxnum_perhour[hour], followers)
        if time_of_day_perhour[hour] is None:
            # The first tweet seen in a bucket fixes that bucket's hour of day.
            seen_at = datetime.datetime.fromtimestamp(tweet_time, pst_tz)
            time_of_day_perhour[hour] = seen_at.hour

    # The last bucket has no "next hour" to predict, so it is dropped from the
    # features; the labels are the tweet counts shifted by one bucket.
    feat_5 = [tweets_totalnum_perhour[0:-1], retweets_num_perhour[0:-1],
              followers_totalnum_perhour[0:-1], followers_maxnum_perhour[0:-1],
              time_of_day_perhour[0:-1]]
    gt_y = tweets_totalnum_perhour[1:]
    return timestamp_perhour[0:-1], feat_5, gt_y

In [None]:
def extract_features(start_time, end_time, tweetfile):
    """Build the full per-hour feature set (5 basic + 7 extra) for a window.

    The basic features, the timestamp index and the labels come from
    ``feature_ext_period``; this function adds seven more per-hour aggregates.

    Args:
        start_time: Unix timestamp (seconds) of the first hourly bucket.
        end_time: Unix timestamp (seconds) of the last hourly bucket.
        tweetfile: Sequence of tweet dicts. It is traversed twice, so it must
            be re-iterable (e.g. a list).

    Returns:
        ``(index, feat_all, gt_y)`` where ``feat_all`` is a list of 12 feature
        lists: the five from ``feature_ext_period`` followed by original-author
        followers, favourite count, user mentions, URL count, unique author
        count, impressions and mean ranking score.
    """
    # Floor division keeps the bucket count an int on Python 3 as well.
    total_hours = int((end_time - start_time) // 3600) + 1
    # Equals end_time + 3600 for an hour-aligned window.
    window_end = start_time + total_hours * 3600

    ori_author_followers = [0] * total_hours
    favorited_num = [0] * total_hours
    impressions_num = [0] * total_hours
    avg_ranking_score = [0] * total_hours
    user_mentions = [0] * total_hours
    url_count = [0] * total_hours
    unique_author_set = [set() for _ in range(total_hours)]

    # Basic features come from the sibling helper. Calling extract_features
    # here would recurse without ever terminating.
    index, feat_5, gt_y = feature_ext_period(start_time, end_time, tweetfile)

    for tweet in tweetfile:
        tweet_time = tweet['citation_date']
        if not (start_time <= tweet_time < window_end):
            continue
        hour = int((tweet_time - start_time) // 3600)
        entities = tweet['tweet']['entities']
        metrics = tweet['metrics']
        ori_author_followers[hour] += tweet['original_author']['followers']
        favorited_num[hour] += tweet['tweet']['favorite_count']
        user_mentions[hour] += len(entities['user_mentions'])
        url_count[hour] += len(entities['urls'])
        unique_author_set[hour].add(tweet['author']['nick'])
        impressions_num[hour] += metrics['impressions']
        # Accumulated as a sum here, turned into a mean below.
        avg_ranking_score[hour] += metrics['ranking_score']

    # feat_5[0] holds the per-hour tweet counts; empty hours keep a score of 0.
    total_tweets = feat_5[0]
    for i, count in enumerate(total_tweets):
        if count != 0:
            avg_ranking_score[i] = avg_ranking_score[i] / float(count)

    unique_author_count = [len(authors) for authors in unique_author_set]
    # Drop the last bucket so the extra features line up with feat_5.
    feat_extra = [ori_author_followers[0:-1], favorited_num[0:-1],
                  user_mentions[0:-1], url_count[0:-1], unique_author_count[0:-1],
                  impressions_num[0:-1], avg_ranking_score[0:-1]]

    feat_all = feat_5 + feat_extra

    return index, feat_all, gt_y

In [None]:
pst_tz = pytz.timezone('US/Pacific')
# Reference point for turning an aware datetime back into Unix seconds.
_UTC_EPOCH = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)


def round_timestamp_to_hour(t):
    """Floor Unix timestamp ``t`` to the start of its US/Pacific clock hour.

    The result is computed with timezone-aware arithmetic. Going through
    ``time.mktime(...timetuple())`` would reinterpret the Pacific wall-clock
    fields in the machine's local zone and shift the result on any host that
    does not run on Pacific time.

    Returns:
        The floored Unix timestamp as an int.
    """
    pacific_hour = datetime.datetime.fromtimestamp(t, pst_tz).replace(
        minute=0, second=0, microsecond=0)
    return int((pacific_hour - _UTC_EPOCH).total_seconds())


# Per hashtag: hour-floored timestamps of the earliest and latest tweet.
mins = {}
maxs = {}
for key in all_tweets:
    # An empty tweet list makes min()/max() raise ValueError here.
    citation_dates = [tweet['citation_date'] for tweet in all_tweets[key]]
    mins[key] = round_timestamp_to_hour(min(citation_dates))
    maxs[key] = round_timestamp_to_hour(max(citation_dates))

In [None]:
import numpy as np

def features_transform(x):
    """Fill missing hour-of-day values and return features as a sample matrix.

    ``x`` is a list of per-feature lists whose row 4 is the hour of day, with
    ``None`` for hours that had no tweet. Gaps are filled by counting forward
    from the previous hour (mod 24). Leading gaps are filled by counting
    backward from the first known hour, because filling index 0 from
    ``x[4][-1]`` would wrap around to the unrelated last entry. If no hour is
    known at all, the row is left untouched.

    Note: row ``x[4]`` is modified in place.

    Returns:
        ``numpy`` array of shape ``(n_hours, n_features)``.
    """
    hours = x[4]
    first_known = next((i for i, h in enumerate(hours) if h is not None), None)
    if first_known is not None:
        # Walk backward over any leading gap.
        for i in range(first_known - 1, -1, -1):
            hours[i] = (hours[i + 1] - 1) % 24
        # Walk forward over the remaining gaps.
        for i in range(first_known + 1, len(hours)):
            if hours[i] is None:
                hours[i] = (hours[i - 1] + 1) % 24
    # Rows are features on input; transpose so that rows become hourly samples.
    return np.transpose(np.array(x))

In [None]:
import pickle
# One entry per hashtag, in the iteration order of `all_tweets`:
# hourly timestamp index, feature matrix, and next-hour tweet counts.
indexs, Xs, Ys = [], [], []
for key in all_tweets:
    index, X, y = extract_features(mins[key], maxs[key], all_tweets[key])
    X = features_transform(X)
    y = np.array(y)
    indexs.append(index)
    Xs.append(X)
    Ys.append(y)

# Cache the extracted features. Pickle data is bytes, so the files must be
# opened in binary mode (text mode fails on Python 3); `with` closes them.
for cache_name, cached in (("indexs.txt", indexs), ("Xs.txt", Xs), ("Ys.txt", Ys)):
    with open(cache_name, "wb") as cache_file:
        pickle.dump(cached, cache_file)

In [None]:
# Period boundaries: Feb 1 2015, 8:00 am and 8:00 pm US/Pacific, as Unix seconds.
# pytz zones must be attached with localize(); passing one to the datetime
# constructor picks the zone's historical local-mean-time offset. The epoch
# subtraction also avoids time.mktime(), which depends on the machine's zone.
_utc_epoch = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)
time1 = int((pst_tz.localize(datetime.datetime(2015, 2, 1, 8, 0, 0, 0)) - _utc_epoch).total_seconds())
time2 = int((pst_tz.localize(datetime.datetime(2015, 2, 1, 20, 0, 0, 0)) - _utc_epoch).total_seconds())

In [None]:
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import math
from sklearn.model_selection import KFold

# regression models
def regression_models(X, y):
    """Cross-validate three regressors and return their mean per-fold RMSE.

    Runs 10-fold cross-validation on ``(X, y)`` and, in every fold, fits an
    OLS model (with an added constant column), a 10-nearest-neighbours
    regressor and a random forest.

    Returns:
        ``[ols_rmse, knn_rmse, forest_rmse]`` -- each the RMSE averaged over
        the folds.
    """
    def _rmse(actual, predicted):
        return math.sqrt(np.mean((actual - predicted) ** 2))

    X_with_const = sm.add_constant(X)
    folds = KFold(n_splits=10)
    totals = [0.0, 0.0, 0.0]

    for fit_idx, eval_idx in folds.split(X):
        X_fit, X_eval = X[fit_idx], X[eval_idx]
        Xc_fit, Xc_eval = X_with_const[fit_idx], X_with_const[eval_idx]
        y_fit, y_eval = y[fit_idx], y[eval_idx]

        # Ordinary least squares (uses the design matrix with the constant)
        ols_fit = sm.regression.linear_model.OLS(y_fit, Xc_fit).fit()
        totals[0] += _rmse(y_eval, ols_fit.predict(Xc_eval))

        # k-nearest neighbours
        neighbours = KNeighborsRegressor(n_neighbors=10, n_jobs=-1)
        neighbours.fit(X_fit, y_fit)
        totals[1] += _rmse(y_eval, neighbours.predict(X_eval))

        # Random forest
        forest = RandomForestRegressor(n_jobs=-1)
        forest.fit(X_fit, y_fit)
        totals[2] += _rmse(y_eval, forest.predict(X_eval))

    n_folds = folds.get_n_splits()
    return [total / n_folds for total in totals]

In [None]:
# Reload the cached features. Pickle data is bytes, so the files must be
# opened in binary mode (text mode fails on Python 3); `with` closes them.
# NOTE: pickle.load can execute arbitrary code -- only load files this
# notebook wrote itself.
with open("indexs.txt", "rb") as cache_file:
    indexs = pickle.load(cache_file)
with open("Xs.txt", "rb") as cache_file:
    Xs = pickle.load(cache_file)
with open("Ys.txt", "rb") as cache_file:
    Ys = pickle.load(cache_file)

In [None]:
# `indexs`, `Xs` and `Ys` were filled by iterating over `all_tweets`, so its
# key order is what lines up with position i. The directory listing `files`
# may also contain skipped entries and is not guaranteed to match that order.
hashtag_keys = list(all_tweets)

# hashtag -> [before time1, time1..time2, after time2], each entry being the
# [linear, KNN, random forest] RMSE list from regression_models.
rmses = {}
for i in range(len(indexs)):
    key = hashtag_keys[i]
    # Positions of the two period boundaries in this hashtag's hourly index;
    # .index() raises ValueError if a boundary hour is not present.
    idx1, idx2 = indexs[i].index(time1), indexs[i].index(time2)
    rmses[key] = [
        regression_models(Xs[i][:idx1], Ys[i][:idx1]),
        regression_models(Xs[i][idx1:idx2], Ys[i][idx1:idx2]),
        regression_models(Xs[i][idx2:], Ys[i][idx2:]),
    ]

print(rmses)
# Binary mode is required for pickle on Python 3; `with` closes the file.
with open("rmses.txt", "wb") as cache_file:
    pickle.dump(rmses, cache_file)

In [None]:
# Pool the samples of every hashtag, separately for each of the three periods
# (before time1, time1..time2, from time2 on).
vector_X = [[], [], []]
vector_y = [[], [], []]
for tag_pos, hour_index in enumerate(indexs):
    cut1, cut2 = hour_index.index(time1), hour_index.index(time2)
    period_slices = (slice(None, cut1), slice(cut1, cut2), slice(cut2, None))
    for period, rows in enumerate(period_slices):
        vector_X[period].append(Xs[tag_pos][rows])
        vector_y[period].append(Ys[tag_pos][rows])

# One stacked feature matrix / label vector per period.
X_train = [np.concatenate(vector_X[period]) for period in range(3)]
y_train = [np.concatenate(vector_y[period]) for period in range(3)]

In [None]:
# Refit one random forest on each period's pooled data and record the RMSE of
# its predictions on that same data (periods 1, 2 and 3 in order).
regr = RandomForestRegressor(n_jobs=-1)
rmses_all = []
for period in range(3):
    period_X, period_y = X_train[period], y_train[period]
    regr.fit(period_X, period_y)
    prediction = regr.predict(period_X)
    squared_error = (period_y - prediction) ** 2
    rmses_all.append(math.sqrt(np.mean(squared_error)))
print(rmses_all)

### Transform data into 6-hour time windows

In [None]:

# Each sample concatenates the features of 5 consecutive hours; its label is
# the tweet count of the hour right after the window (the 6th hour).
window_hours = 5
features_per_hour = 12
# Feature columns removed from every hour of the window.
dropped_per_hour = [1, 2, 4, 6, 9, 10, 11]

vectors_X = []
vectors_y = []
for i in range(len(Xs)):
    for j in range(len(Xs[i]) - window_hours + 1):
        # Flatten rows j .. j+4 into one vector, hour after hour.
        vectors_X.append(np.concatenate([Xs[i][j + k] for k in range(window_hours)]))
    # Ys[i][h] is already the tweet count of hour h + 1, so the window ending
    # at row j + 4 pairs with Ys[i][j + 4]. Starting the labels at index 5
    # would pair each window with the hour after next, which does not match
    # how the test cell takes its truth value.
    vectors_y.extend(Ys[i][window_hours - 1:])

X_new = np.vstack(vectors_X)
# Same column pattern repeated for each hour block: 1, 2, 4, ... 13, 14, 16, ...
delete_index = [hour * features_per_hour + col
                for hour in range(window_hours)
                for col in dropped_per_hour]
X_new = np.delete(X_new, delete_index, 1)
y_new = np.array(vectors_y)

In [None]:
# Train the final random forest on all windowed samples and report how many
# columns each sample has.
regr = RandomForestRegressor(n_jobs=-1)
regr.fit(X_new, y_new)
first_sample = X_new[0]
print("Feature vector length: %d" % len(first_sample))

### Feature extraction on test data

In [None]:
def get_start_end_timestamp(tweetlist):
    """Return the (start, end) timestamps of the window ending at the last tweet.

    The end is the largest ``citation_date`` in ``tweetlist`` (0 if there is
    none larger) passed through ``round_timestamp_to_hour``; the start lies
    five hours (``3600 * 5`` seconds) before it.
    """
    latest = max([0] + [tweet['citation_date'] for tweet in tweetlist])
    window_end = round_timestamp_to_hour(latest)
    window_start = window_end - 3600 * 5
    return window_start, window_end

In [None]:
# Predict the last hour of every test file from the five hours before it.
file_path = "./test_data"
# NOTE: this rebinds the module-level name `files`.
files = os.listdir(file_path)
for file in files:
    full_path = os.path.join(file_path, file)
    # Test the joined path: a bare entry name would be resolved against the
    # current working directory, not against `file_path`.
    if os.path.isdir(full_path):
        continue
    print("File: %s" % file)
    with open(full_path) as f:
        # One JSON tweet per line; blank lines are not valid JSON, skip them.
        tweetlist = [json.loads(line) for line in f if line.strip()]

    start_time, end_time = get_start_end_timestamp(tweetlist)
    index, X, y = extract_features(start_time, end_time, tweetlist)

    # Row 4 is the hour of day. Rebuild it from the window start so that hours
    # without any tweet (left as None by the extractor) get a value as well.
    first_hour = datetime.datetime.fromtimestamp(start_time, pst_tz).hour
    X[4] = [(first_hour + offset) % 24 for offset in range(len(X[4]))]

    X = np.array(X)
    X = np.transpose(X).astype('int')
    # Remove the same per-hour feature columns that training removes.
    X = np.delete(X, [1, 2, 4, 6, 9, 10, 11], 1)
    y = np.array(y)

    # y[-1] is the tweet count of the hour that follows the five feature hours.
    y_test = y[-1]
    # Flatten the five hourly rows into a single one-row sample matrix.
    X_test = np.concatenate([X[0], X[1], X[2], X[3], X[4]])
    X_test = np.vstack([X_test])

    print("Truth: %d" % y_test)
    print("Predict: %d" % regr.predict(X_test)[0])