In [1]:
#importing packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import os
import utils
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stats_api
import pytz
pst_tz = pytz.timezone('America/Los_Angeles')

directory = 'C:/Users/madhu.kolli/Desktop/ECE 219/Project 5 Twitter/'

  from pandas.core import datetools


In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stats_api
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import pytz
pst_tz = pytz.timezone('US/Pacific')

directory = 'C:/Users/madhu.kolli/Desktop/ECE 219/Project 5 Twitter/'

feature_names = ['Number of tweets', 'Total number of retweets', 'Total number of followers',
                 'Maximum number of followers', 'Hour of the day', 'Total tweets by Author',
                 'Author Passivity', 'Impression Count', 'Ranking Score', 'Mention Count',
                 'Graph Density', 'Avg Graph Degree']

tweettags = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]
window = 1
avg_past_features = False

default_feature_count = 5 #number of tweets, retweets, followers, max followers and hour of the day
extra_feature_count = len(feature_names)

start_time = datetime(2015, 2, 1, 8, 0, 0).replace(tzinfo=pst_tz)
end_time = datetime(2015, 2, 1, 20, 0, 0).replace(tzinfo=pst_tz)


class tweets():

    def get_feature_names(self):
        return feature_names
    
    
    def scale_data(self, X):
        return StandardScaler().fit_transform(X)
        

    def cross_validation(self, X, y, model = stats_api.OLS, n_splits = 10):

        test_rmse, test_mae = [], []
        kf = KFold(n_splits = n_splits)

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if model == stats_api.OLS:
                lm = model(y_train, X_train).fit()
            else:
                lm = model().fit(X_train, y_train)

            y_pred = lm.predict(X_test)
            test_mae.append(mean_absolute_error(y_pred, y_test))
            test_rmse.append(mean_squared_error(y_pred, y_test))


        avg_rmse = np.mean(test_rmse)
        print("Average Mean Absolute Error - ", np.mean(test_mae))
        print("Average Mean Squared Error - ", avg_rmse)
        print("Average Root Mean Squared Error - ", np.sqrt(avg_rmse))

    def test_model(self, X_train, y_train, X_test, y_test, model):

        lm = model().fit(X_train, y_train)
        y_pred = lm.predict(X_test)

        print("Actual number of tweets in next hour  - ", y_test)
        print("Predicted number of tweets in next hour - ", y_pred)
        print("Average Mean Absolute Error - ", mean_absolute_error(y_pred, y_test))


    def get_X_y(self, args):
        """
        Takes the data frame with features and converts it to the right input X and output y formats
        Uses Grouper for separating the tweets by time period.

        Takes an argument window - this determines how many past hour features determine current output
        """


        df = args['features'].set_index('time')
        tweets_by_hour = df.groupby(pd.Grouper(freq='60Min'))
        # key as hour (totally 500+ hours), value as the list of all tweet data in that hour

        num_hours = len(tweets_by_hour)
        num_features = extra_feature_count if args['extra_features'] else default_feature_count

        X = np.zeros((num_hours, num_features))

        for i, (key, val) in enumerate(tweets_by_hour):

            features = [len(val), val.retweets.sum(), val.followers.sum(), val.followers.max(), key.hour]

            if args['extra_features']:
                features.extend([val.user_tweet_count.mean(),
                                val.passivity.sum(), val.impression_count.mean(),
                                val.ranking_score.mean(), val.mention_count.sum(),
                                val.density.sum(), val.degree.sum()])

            X[i, :] = features

        window = args['window']
        y = X[:, 0][window:]
        # number of tweets is the output as well as the first feature - but have to shift by #window hours

        X = np.nan_to_num(X)
        X_window = np.zeros((num_hours - window, num_features * window))

        # previous window hours' data is the features for the current hour's number of tweets
        if args.get('avg_past_features',False):
            #average past 'window' hours features
            X_window = [np.mean(X[i:i+window, :], axis = 0) for i in range(num_hours - window)]

        else:
            #concatenate past 'window' hours features
            for i in range(num_hours - window):
                X_window[i, :] = np.concatenate([X[i+k, :] for k in range(window)])

        X_window = np.nan_to_num(X_window)

        #X_window = self.scale_data(X_window)
        #y = self.scale_data(y.reshape(-1,1))

        return X_window, y

    def format_date(self, date_string):

        return datetime.strptime(date_string, '%a %b %d %H:%M:%S %z %Y')


    def get_passivity(self, account_creation_date, tweet_date, tweets_posted):

        days_since_account_creation = (self.format_date(tweet_date) - self.format_date(account_creation_date)).days
        passivity = days_since_account_creation / (1 + tweets_posted)
        return passivity


    def get_features(self, args):
        """
        Gets the hashtag, type of model to fit the data.
        Passes the input to feature extraction methods and get the data required in right format
        """
        citation_date, followers, retweets, favorite_count, mention_count = [], [], [], [], []
        user_tweet_count, passivity, impression_count, ranking_score, user_id = [], [], [], [], []
        density, degree = [], []

        for tag in args['hashtags']:

            file_name = args.get('file_name') if args.get('file_name') else directory + 'tweets_#' + tag + '.txt'

            lines = open(file_name, 'r', encoding='utf8')

            for line in lines:
                tweet = json.loads(line)

                tweet_data = tweet['tweet']
                user_data = tweet_data['user']
                num_retweets = tweet['metrics']['citations']['total']
                mentions = len(tweet_data['entities']['user_mentions'])

                tweet_date = tweet['citation_date'] if args.get("citation_date", True) else tweet["firstpost_date"]

                citation_date.append(datetime.fromtimestamp(tweet_date, pst_tz))
                followers.append(tweet['author']['followers'])
                retweets.append(num_retweets)
                favorite_count.append(tweet_data['favorite_count'])
                mention_count.append(mentions)
                user_tweet_count.append(user_data['statuses_count'])
                impression_count.append(tweet['metrics']['impressions'])
                ranking_score.append(tweet['metrics']['ranking_score'])
                user_id.append(user_data['id'])
                passivity.append(self.get_passivity(user_data['created_at'], tweet_data['created_at'], user_data['statuses_count']))

                density.append(0 if num_retweets <= 1 else mentions / abs(num_retweets * (num_retweets - 1)))
                degree.append(0 if num_retweets <= 0 else (2 * mentions) / num_retweets)



        column_names = ['time', 'followers', 'retweets']
        features = [citation_date, followers, retweets]

        if args['extra_features']:
            column_names.extend(['favorite_count', 'user_tweet_count', 'passivity','impression_count',
                                 'ranking_score','user_id', 'mention_count', 'density', 'degree'])
            features.extend([favorite_count, user_tweet_count, passivity, impression_count,
                             ranking_score, user_id, mention_count, density, degree])

        df = pd.DataFrame(features)
        df = df.transpose()
        df.columns = column_names

        args['features'] = df

        return args


    def time_interval_model(self, features, model, cv = True):

        output = {}

        df = features["features"]

        features["features"] = df[df.time < start_time]
        output['X_bef'], output['y_bef'] = self.get_X_y(features)

        features["features"] = df[(df.time >= start_time) & (df.time <= end_time)]
        output['X_bet'], output['y_bet'] = self.get_X_y(features)

        features["features"] = df[df.time > end_time]
        output['X_aft'], output['y_aft'] = self.get_X_y(features)

        if cv:

            print("\nTime period before Feb 1 8am")
            self.cross_validation(output['X_bef'], output['y_bef'], model)

            print("\nTime period between Feb 1 8am and Feb 1 8pm")
            self.cross_validation(output['X_bet'], output['y_bet'], model)

            print("\nTime period after Feb 1 8pm")
            self.cross_validation(output['X_aft'], output['y_aft'], model)


        return output


    def collect_stats(self, hashtag):

        file_name = 'tweets_#' + hashtag + '.txt'
        citation_date, followers, retweets = [], [], []

        lines = open(directory + file_name, 'r', encoding='utf8')

        for i, line in enumerate(lines):
            tweet = json.loads(line)
            citation_date.append(datetime.fromtimestamp(tweet['citation_date'], pst_tz))
            followers.append(tweet['author']['followers'])
            retweets.append(tweet['metrics']['citations']['total'])

        df = pd.DataFrame(citation_date, columns=['time']).set_index('time')
        tweets_by_hour = df.groupby(pd.Grouper(freq='60Min'))
        count_by_hour = [len(val) for key, val in tweets_by_hour]


        total_hours = len(tweets_by_hour)

        print("Average number of tweets per hour", len(citation_date) / total_hours)
        print("Average number of followers - ", np.mean(followers))
        print("Average number of retweets - ", np.mean(retweets))

        if hashtag in ['superbowl', "nfl"]:

            plt.xlabel('Hours')
            plt.ylabel('Number of tweets')
            plt.title('Number of tweets per hour for ' + hashtag)
            plt.bar(range(len(tweets_by_hour.groups.keys())), count_by_hour)
            plt.show()


In [3]:
utils_obj = tweets()

start_time = datetime(2015, 2, 1, 8, 0, 0).replace(tzinfo=pst_tz)
end_time = datetime(2015, 2, 1, 20, 0, 0).replace(tzinfo=pst_tz)

args = {"extra_features": True, "window": window}

#models = {'OLS': stats_api.OLS, 'Random Forest': RandomForestRegressor, 'Neural Network MLP': MLPRegressor}
models = {'OLS': stats_api.OLS}

for tag in tweettags:
    print("=" * 50)
    print("\nHashtag - ", tag)

    for model_name, model in models.items():
        print("\nModel - ", model_name)

        args["hashtags"] = [tag]
        features = utils_obj.get_features(args)

        utils_obj.time_interval_model(features, model)




Hashtag -  gohawks

Model -  OLS

Time period before Feb 1 8am
Average Mean Absolute Error -  240.39405170543915
Average Mean Squared Error -  990349.0534161292
Average Root Mean Squared Error -  995.1628275896006

Time period between Feb 1 8am and Feb 1 8pm
Average Mean Absolute Error -  11629.124598032251
Average Mean Squared Error -  288825329.56786835
Average Root Mean Squared Error -  16994.861857863638

Time period after Feb 1 8pm
Average Mean Absolute Error -  54.50745422780035
Average Mean Squared Error -  21419.26708749751
Average Root Mean Squared Error -  146.3532271167859

Hashtag -  gopatriots

Model -  OLS

Time period before Feb 1 8am
Average Mean Absolute Error -  14.542643429900016
Average Mean Squared Error -  2287.2266746728174
Average Root Mean Squared Error -  47.824958700168445

Time period between Feb 1 8am and Feb 1 8pm
Average Mean Absolute Error -  710.9847290449928
Average Mean Squared Error -  793693.7160367297
Average Root Mean Squared Error -  890.8948961

In [4]:
print("\nPerforming analysis of aggregate data of all hashtags on best model - stats_api.OLS")

args = {"extra_features" : True, "window" : window}

best_model = stats_api.OLS

args["hashtags"] = tweettags #Aggregate data
features = utils_obj.get_features(args)
utils_obj.time_interval_model(features, best_model)


Performing analysis of aggregate data of all hashtags on best model - stats_api.OLS

Time period before Feb 1 8am
Average Mean Absolute Error -  666.0598869711318
Average Mean Squared Error -  4637293.783975622
Average Root Mean Squared Error -  2153.437666610209

Time period between Feb 1 8am and Feb 1 8pm
Average Mean Absolute Error -  112233.369055015
Average Mean Squared Error -  31484119330.20127
Average Root Mean Squared Error -  177437.6491340022

Time period after Feb 1 8pm
Average Mean Absolute Error -  1138.1620263964342
Average Mean Squared Error -  5804910.929565759
Average Root Mean Squared Error -  2409.33827628371


{'X_aft': array([[9.00000000e+02, 3.64300000e+03, 3.28936300e+07, ...,
         3.75000000e+02, 4.92442433e+01, 5.47485328e+02],
        [1.62710000e+04, 1.25763000e+05, 1.72921942e+08, ...,
         8.48400000e+03, 7.06362300e+02, 1.38876838e+04],
        [6.49200000e+03, 5.28580000e+04, 1.06280297e+08, ...,
         3.68900000e+03, 3.10494140e+02, 5.96790741e+03],
        ...,
        [8.50000000e+01, 2.42000000e+02, 1.35709400e+06, ...,
         9.50000000e+01, 1.29909051e+01, 1.26456588e+02],
        [5.90000000e+01, 4.70000000e+02, 1.48967600e+06, ...,
         3.50000000e+01, 6.66666667e+00, 4.96666667e+01],
        [4.90000000e+01, 1.02000000e+02, 5.47334000e+05, ...,
         5.20000000e+01, 2.29230769e+00, 8.80820513e+01]]),
 'X_bef': array([[1.11000000e+02, 7.14000000e+02, 2.23405000e+05, ...,
         4.50000000e+01, 3.23335114e+00, 6.87145985e+01],
        [8.90000000e+01, 6.63000000e+02, 1.87317000e+05, ...,
         2.50000000e+01, 4.38377660e+00, 3.41083333e+01],
       

In [5]:
print("\nPerforming analysis of aggregate data of all hashtags on best model - Random Forest")

args = {"extra_features" : True, "window" : window}

best_model = RandomForestRegressor

args["hashtags"] = tweettags #Aggregate data
features = utils_obj.get_features(args)
utils_obj.time_interval_model(features, best_model)



Performing analysis of aggregate data of all hashtags on best model - Random Forest

Time period before Feb 1 8am
Average Mean Absolute Error -  633.1772727272728
Average Mean Squared Error -  5009141.60504757
Average Root Mean Squared Error -  2238.1111690547386

Time period between Feb 1 8am and Feb 1 8pm
Average Mean Absolute Error -  82196.12999999999
Average Mean Squared Error -  9396890997.934002
Average Root Mean Squared Error -  96937.56236843385

Time period after Feb 1 8pm
Average Mean Absolute Error -  1301.733736263736
Average Mean Squared Error -  9645266.9499011
Average Root Mean Squared Error -  3105.683008599091


{'X_aft': array([[9.00000000e+02, 3.64300000e+03, 3.28936300e+07, ...,
         3.75000000e+02, 4.92442433e+01, 5.47485328e+02],
        [1.62710000e+04, 1.25763000e+05, 1.72921942e+08, ...,
         8.48400000e+03, 7.06362300e+02, 1.38876838e+04],
        [6.49200000e+03, 5.28580000e+04, 1.06280297e+08, ...,
         3.68900000e+03, 3.10494140e+02, 5.96790741e+03],
        ...,
        [8.50000000e+01, 2.42000000e+02, 1.35709400e+06, ...,
         9.50000000e+01, 1.29909051e+01, 1.26456588e+02],
        [5.90000000e+01, 4.70000000e+02, 1.48967600e+06, ...,
         3.50000000e+01, 6.66666667e+00, 4.96666667e+01],
        [4.90000000e+01, 1.02000000e+02, 5.47334000e+05, ...,
         5.20000000e+01, 2.29230769e+00, 8.80820513e+01]]),
 'X_bef': array([[1.11000000e+02, 7.14000000e+02, 2.23405000e+05, ...,
         4.50000000e+01, 3.23335114e+00, 6.87145985e+01],
        [8.90000000e+01, 6.63000000e+02, 1.87317000e+05, ...,
         2.50000000e+01, 4.38377660e+00, 3.41083333e+01],
       

In [6]:
print("\nPerforming analysis of aggregate data of all hashtags on best model - MLPRegressor")

args = {"extra_features" : True, "window" : window}

best_model = MLPRegressor

args["hashtags"] = tweettags #Aggregate data
features = utils_obj.get_features(args)
utils_obj.time_interval_model(features, best_model)


Performing analysis of aggregate data of all hashtags on best model - MLPRegressor

Time period before Feb 1 8am
Average Mean Absolute Error -  335281.9492754818
Average Mean Squared Error -  415683601196.3041
Average Root Mean Squared Error -  644735.2954479102

Time period between Feb 1 8am and Feb 1 8pm
Average Mean Absolute Error -  20750676.443706386
Average Mean Squared Error -  944394276747457.4
Average Root Mean Squared Error -  30730998.629192922

Time period after Feb 1 8pm
Average Mean Absolute Error -  738260.871572734
Average Mean Squared Error -  1311078597983.7805
Average Root Mean Squared Error -  1145023.4049938808


{'X_aft': array([[9.00000000e+02, 3.64300000e+03, 3.28936300e+07, ...,
         3.75000000e+02, 4.92442433e+01, 5.47485328e+02],
        [1.62710000e+04, 1.25763000e+05, 1.72921942e+08, ...,
         8.48400000e+03, 7.06362300e+02, 1.38876838e+04],
        [6.49200000e+03, 5.28580000e+04, 1.06280297e+08, ...,
         3.68900000e+03, 3.10494140e+02, 5.96790741e+03],
        ...,
        [8.50000000e+01, 2.42000000e+02, 1.35709400e+06, ...,
         9.50000000e+01, 1.29909051e+01, 1.26456588e+02],
        [5.90000000e+01, 4.70000000e+02, 1.48967600e+06, ...,
         3.50000000e+01, 6.66666667e+00, 4.96666667e+01],
        [4.90000000e+01, 1.02000000e+02, 5.47334000e+05, ...,
         5.20000000e+01, 2.29230769e+00, 8.80820513e+01]]),
 'X_bef': array([[1.11000000e+02, 7.14000000e+02, 2.23405000e+05, ...,
         4.50000000e+01, 3.23335114e+00, 6.87145985e+01],
        [8.90000000e+01, 6.63000000e+02, 1.87317000e+05, ...,
         2.50000000e+01, 4.38377660e+00, 3.41083333e+01],
       