In [1]:
# Requirements: scikit-learn

# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

# Prevent codes being displayed when exporting to an HTML file
import IPython.core.display as di
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) {jQuery(".input_area").toggle(); jQuery(".prompt").toggle();} jQuery(".highlight").show();});</script>', raw=True)

# Dynamically add a button to show/hide codes
di.display_html("<script>jQuery(function() {jQuery(function() {var b = jQuery('<input type=\"button\" value=\"Show/Hide codes\"/>'); b.click(function(){jQuery('.input_area').each(function(){jQuery(this).toggle();});}); jQuery('[id^=Project]').parent().append(b);});});</script>", raw=True)

# Project 5: Popularity Prediction on Twitter

### Load data

In [1]:
import json
import os

folder_path = "./tweet_data" 
files= os.listdir(folder_path) # all the file name in the folder
all_tweets = {}
print(files)

['tweets_#gohawks.txt', 'tweets_#gopatriots.txt', 'tweets_#nfl.txt', 'tweets_#patriots.txt', 'tweets_#sb49.txt', 'tweets_#superbowl.txt']


In [1]:
for file in files: 
     if not os.path.isdir(file): # open file if it is not a folder
        f = open(folder_path + "/" + file) # open a file
        templist = []
        key = file[7:-4]
        for line in open(folder_path + "/" + file): 
            line = f.readline()
            tweet = json.loads(line)
            templist.append(tweet)
        all_tweets.setdefault(key, list)
        all_tweets[key] = templist
        f.close()
        print("done!")
print (len(all_tweets))

done!
done!
done!
done!
done!
done!
6


### Feature selection

In [2]:
# statistics per hour
# including 'Total number of retweets', 'Sum of the number of followers', 'Maximum number of followers' per hour
import datetime, time
import pytz

def feature_extract_perhour(start_time, end_time, tweetfile): 
    # only consider a part of the data, not every tweets in the file
    total_hours = (end_time - start_time) / 3600 + 1
    # print(total_hours)
    tweets_totalnum_perhour = [0 for i in range(total_hours)]
    retweets_num_perhour = [0 for i in range(total_hours)]
    followers_totalnum_perhour = [0 for i in range(total_hours)]
    followers_maxnum_perhour = [0 for i in range(total_hours)]
    time_of_day_perhour = [None for i in range(total_hours)]
    timestamp_perhour = []
    ts = start_time
    for i in range(total_hours):
        timestamp_perhour.append(ts)
        ts += 3600
    pst_tz = pytz.timezone('US/Pacific') 
    
    for tweet in tweetfile: 
        tweet_time = tweet['citation_date']
        if (tweet_time >= start_time and tweet_time < end_time + 3600):
            #hour = int((tweet_time - po_time_min)/3600)
            hour = int((tweet_time - start_time) / 3600)
            tweets_totalnum_perhour[hour] += 1
            retweets_num_perhour[hour] += tweet['tweet']['retweet_count']
            followers_totalnum_perhour[hour] += tweet['author']['followers']
            if (tweet['author']['followers'] >= followers_maxnum_perhour[hour]):
                followers_maxnum_perhour[hour] = tweet['author']['followers']
            if (time_of_day_perhour[hour] == None):
                # make the earliest time in each interval as the feature time_of_the_day 
                earliest_date = datetime.datetime.fromtimestamp(tweet['citation_date'], pst_tz)
                time_of_day_perhour[hour] = earliest_date.hour
    feat_5 = [tweets_totalnum_perhour[0:-1], retweets_num_perhour[0:-1], followers_totalnum_perhour[0:-1],\
              followers_maxnum_perhour[0:-1],time_of_day_perhour[0:-1]]
    gt_y = tweets_totalnum_perhour[1:]
    return timestamp_perhour[0:-1], feat_5, gt_y
    # return index_timestamp, X_features, y_label

In [3]:
def extract_features(start_time, end_time, tweetfile): 

    total_hours = (end_time - start_time) / 3600 + 1
    # other features:
    ori_author_followers = [0 for i in range(total_hours)] # total number of followers of the original author
    favorited_num = [0 for i in range(total_hours)] # total number of times of favorited, so many zeros
    impressions_num = [0 for i in range(total_hours)]
    avg_ranking_score = [0 for i in range(total_hours)]
    user_mentions = [0 for i in range(total_hours)]
    url_count = [0 for i in range(total_hours)]
    unique_author_set = [set() for i in range(total_hours)] # save unique author 
    
    # extract five basic features
    index, feat_5, gt_y = feature_extract_perhour(start_time, end_time, tweetfile)

    # extract other features
    for tweet in tweetfile:
        tweet_time = tweet['citation_date']
        if (tweet_time >= start_time and tweet_time < end_time +3600):
            hour = int((tweet_time - start_time) / 3600)
            ori_author_followers[hour] += tweet['original_author']['followers']
            favorited_num[hour] += tweet['tweet']['favorite_count']
            user_mentions[hour] += len(tweet['tweet']['entities']['user_mentions'])
            url_count[hour] += len(tweet['tweet']['entities']['urls'])
            unique_author_set[hour].add(tweet['author']['nick'])
            impressions_num[hour] += tweet['metrics']['impressions']
            avg_ranking_score[hour] += tweet['metrics']['ranking_score'] # need to do average later
            
    total_tweets = feat_5[0]
    for i in range(0, len(total_tweets)):
        if(total_tweets[i] != 0):
            avg_ranking_score[i] = avg_ranking_score[i] / total_tweets[i]
    
    unique_author_count =  [len(val) for val in unique_author_set]  # number of unique authors
    feat_extra = [ori_author_followers[0:-1], favorited_num[0:-1], \
                  user_mentions[0:-1], url_count[0:-1], unique_author_count[0:-1],\
                  impressions_num[0:-1],avg_ranking_score[0:-1]]

    feat_all = feat_5 + feat_extra

    return index, feat_all, gt_y

In [4]:
pst_tz = pytz.timezone('US/Pacific') 
round_timestamp_to_hour = lambda t: int(time.mktime(datetime.datetime.fromtimestamp(t, pst_tz).replace(microsecond=0,second=0,minute=0).timetuple()))

mins = {}
maxs = {}
for key in all_tweets:
    tmp_min = all_tweets[key][0]['citation_date']
    tmp_max = 0
    for tweet in all_tweets[key]:
        tmp_min = min(tmp_min, tweet['citation_date'])
        tmp_max = max(tmp_max, tweet['citation_date'])
    mins[key] = round_timestamp_to_hour(tmp_min)
    maxs[key] = round_timestamp_to_hour(tmp_max)

In [5]:
import numpy as np

def features_transform(x):
    # turn list of list (all features) into 2-D matrix: each row represents one 
    # reocrd(statistics for the one-hour interval), and add "1" to each row as bias
    # deal with the intervals without tweets, set it as the value of previous interval plus 1
    for i in range(len(x[4])):
        if x[4][i] == None:
            x[4][i] = (x[4][i - 1] + 1) % 24
    X = np.array(x)
    X = np.transpose(X)
    return X

In [6]:
import pickle

indexs, Xs, ys = [], [], []
for key in all_tweets:
    index, X, y = extract_features(mins[key], maxs[key], all_tweets[key])
    X = features_transform(X)
    y = np.array(y)
    indexs.append(index)
    Xs.append(X)
    ys.append(y)

pickle.dump(indexs, open("indexs.txt", "w"))
pickle.dump(Xs, open("Xs.txt", "w"))
pickle.dump(ys, open("ys.txt", "w"))

---
## Part 1.4 (1)

In [7]:
time1 = int(time.mktime(datetime.datetime(2015, 2, 1, 8, 0, 0, 0, pst_tz).timetuple()))
time2 = int(time.mktime(datetime.datetime(2015, 2, 1, 20, 0, 0, 0, pst_tz).timetuple()))

In [13]:
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import math
from sklearn.model_selection import KFold

# Three models with 10-fold CV
def three_models(X, y):
    XX = sm.add_constant(X)
    kf = KFold(n_splits=10)
    rmses = [0.0, 0.0, 0.0]
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        XX_train, XX_test = XX[train_index], XX[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # linear model
        model = sm.regression.linear_model.OLS(y_train, XX_train)
        results = model.fit()
        prediction = results.predict(XX_test)
        rmses[0] += math.sqrt(np.mean((y_test - prediction) ** 2))
        # KNN
        knn = KNeighborsRegressor(n_neighbors=10, n_jobs=-1)
        knn.fit(X_train, y_train)
        prediction = knn.predict(X_test)
        rmses[1] += math.sqrt(np.mean((y_test - prediction) ** 2))
        # Random Forest
        regr = RandomForestRegressor(n_jobs=-1)
        regr.fit(X_train, y_train)
        prediction = regr.predict(X_test)
        rmses[2] += math.sqrt(np.mean((y_test - prediction) ** 2))
        
    return [rmse / kf.get_n_splits() for rmse in rmses]

In [9]:
indexs = pickle.load(open("indexs.txt", "r"))
Xs = pickle.load(open("Xs.txt", "r"))
ys = pickle.load(open("ys.txt", "r"))

In [38]:
rmses = {}
for i in range(len(indexs)):
    key = files[i][7:-4]
    rmses[key] = []
    idx1, idx2 = indexs[i].index(time1), indexs[i].index(time2)
    rmses[key].append(three_models(Xs[i][:idx1], ys[i][:idx1]))
    rmses[key].append(three_models(Xs[i][idx1:idx2], ys[i][idx1:idx2]))
    rmses[key].append(three_models(Xs[i][idx2:], ys[i][idx2:]))

print(rmses)
pickle.dump(rmses, open("rmses.txt", "w"))

{'#superbowl': [[619.0927559597669, 708.3330926087743, 601.2628847873727], [3211734.0183889656, 97051.21956256071, 62291.24750562848], [897.6026597678998, 436.3887982133343, 345.958304089594]], '#gohawks': [[1799.157413664255, 460.26660639333005, 439.7710467522843], [56243.01385213109, 3434.841477502307, 2564.205422485475], [11645.93581350966, 44.91700588898525, 37.976714518638296]], '#sb49': [[80.00425575213788, 109.44365859347195, 87.28732759151542], [906625.1719303888, 35314.632388853155, 24039.573214589185], [304.5990076060568, 324.8604096411933, 166.0133866828014]], '#nfl': [[215.3847328445482, 249.6460569616193, 230.3008494629621], [49412.59832187189, 4146.565595663316, 3043.2066758320802], [247.59590266110757, 195.5082185381349, 182.49507161364625]], '#patriots': [[892.275788744394, 512.2530887853854, 515.9094412559979], [331059.905450379, 14867.864908549114, 14958.434509048733], [445.81607184709355, 167.99246822427227, 157.10411794466108]], '#gopatriots': [[36.738081925075036, 

---
## Part 1.4 (2)

In [27]:
# aggregate through all tags
X_train, y_train = [], []
vector_X, vector_y = [[],[],[]], [[],[],[]]
for i in range(len(indexs)):
    idx1, idx2 = indexs[i].index(time1), indexs[i].index(time2)
    vector_X[0].append(Xs[i][:idx1])
    vector_y[0].append(ys[i][:idx1])
    vector_X[1].append(Xs[i][idx1:idx2])
    vector_y[1].append(ys[i][idx1:idx2])
    vector_X[2].append(Xs[i][idx2:])
    vector_y[2].append(ys[i][idx2:])

for i in range(3):
    X_train.append(np.concatenate(vector_X[i]))
    y_train.append(np.concatenate(vector_y[i]))

In [28]:
regr = RandomForestRegressor(n_jobs=-1)
rmses_all = []
# period 1
regr.fit(X_train[0], y_train[0])
prediction = regr.predict(X_train[0])
rmses_all.append(math.sqrt(np.mean((y_train[0] - prediction) ** 2)))
# period 2
regr.fit(X_train[1], y_train[1])
prediction = regr.predict(X_train[1])
rmses_all.append(math.sqrt(np.mean((y_train[1] - prediction) ** 2)))
# period 3
regr.fit(X_train[2], y_train[2])
prediction = regr.predict(X_train[2])
rmses_all.append(math.sqrt(np.mean((y_train[2] - prediction) ** 2)))
print(rmses_all)

[259.80248584566283, 17028.236340002222, 85.73163860168893]


---
## Part 1.5

### transform data to 5-hour time window

In [202]:
# using citation date, so do not split the periods (https://piazza.com/class/jcifzza0hzs2f3?cid=310)
vectors_X = []
vectors_y = []    
for i in range(len(Xs)):
    for j in range(len(Xs[i]) - 5):
        vectors_X.append(np.concatenate([Xs[i][j], Xs[i][j+1], Xs[i][j+2], Xs[i][j+3], Xs[i][j+4]]))
    vectors_y.extend(ys[i][5:])

X_new = np.vstack(vectors_X)
delete_index = [ 1, 2, 4, 6, 9,10,11,
                13,14,16,18,21,22,23,
                25,26,28,30,33,34,35,
                37,38,40,42,45,46,47,
                49,50,52,54,57,58,59]
X_new = np.delete(X_new, delete_index, 1) 
y_new = np.array(vectors_y)

In [206]:
regr = RandomForestRegressor(n_jobs=-1)
regr.fit(X_new, y_new)
print("Feature vector length: %d" % len(X_new[0]))

Feature vector length: 25


### feature extraction on test data (only last 6 hours data based on citation date)

In [135]:
def get_start_end_timestamp(tweetlist):
    tmp_max = 0
    for tweet in tweetlist:
        tmp_max = max(tmp_max, tweet['citation_date'])

    tmp_max = round_timestamp_to_hour(tmp_max)
    return tmp_max - 3600*5, tmp_max

In [208]:
folder_path = "./test_data" 
files= os.listdir(folder_path)
for file in files: 
     if not os.path.isdir(file):
        print("File: %s" % file)
        tweetlist = []
        with open(folder_path + "/" + file) as f: # open a file
            for line in f.readlines(): 
                tweet = json.loads(line)
                tweetlist.append(tweet)

        start_time, end_time = get_start_end_timestamp(tweetlist)
        index, X, y = extract_features(start_time, end_time, tweetlist)

        X[4][0] = datetime.datetime.fromtimestamp(start_time, pst_tz).hour
        for i in range(1, len(X[4])):
            X[4][i] = (X[4][i - 1] + 1) % 24

        X = np.array(X)
        X = np.transpose(X).astype('int')
        X = np.delete(X, [1,2,4,6,9,10,11], 1)
        y = np.array(y)

        y_test = y[-1]
        X_test = np.concatenate([X[0], X[1], X[2], X[3], X[4]])
        X_test = np.vstack([X_test])

        print("Truth: %d" % y_test)
        print("Predict: %d" % regr.predict(X_test)[0])

File: sample10_period3.txt
Truth: 61
Predict: 50
File: sample1_period1.txt
Truth: 1
Predict: 15
File: sample2_period2.txt
Truth: 4
Predict: 12
File: sample3_period3.txt
Truth: 523
Predict: 632
File: sample4_period1.txt
Truth: 201
Predict: 209
File: sample5_period1.txt
Truth: 1
Predict: 2
File: sample6_period2.txt
Truth: 14
Predict: 51
File: sample7_period3.txt
Truth: 120
Predict: 54
File: sample8_period1.txt
Truth: 11
Predict: 84
File: sample9_period2.txt
Truth: 1
Predict: 0
