In [1]:
############ Importing Packages First #################
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import os
import utils
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stats_api
import pytz
pst_tz = pytz.timezone('America/Los_Angeles')
############ Importing Packages First #################

# Set up the path for files #
# The data directory defaults to the original author's local folder, but can be
# overridden without editing the notebook by setting the TWEET_DATA_DIR
# environment variable before starting the kernel.
directory = os.environ.get('TWEET_DATA_DIR') or 'C:/Users/madhu.kolli/Desktop/ECE 219/Project 5 Twitter/'
# File names are built by plain string concatenation (directory + 'tweets_#...'),
# so make sure the directory always ends with a path separator.
if not directory.endswith(('/', '\\')):
    directory += '/'

# Human-readable labels for the per-hour features.
tweetfeatures = ['Number of tweets', 'Total number of retweets', 'Total number of followers',
                 'Maximum number of followers', 'Hour of the day', 'Total tweets by Author',
                 'Author Passivity', 'Impression Count', 'Ranking Score', 'Mention Count',
                 'Graph Density', 'Avg Graph Degree']

# Hashtags to analyse; each one maps to a file named 'tweets_#<tag>.txt' in `directory`.
tweettags = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]

  from pandas.core import datetools


In [2]:
class tweets():
    """Helpers that turn raw tweet JSON files into hourly regression data.

    Typical use is ``get_features(args)`` followed by ``get_X_y(args)``:
    the first reads the tweet files into a per-tweet DataFrame stored under
    ``args['features']``, the second aggregates that frame per hour and builds
    the windowed design matrix and the target vector.
    """

    # Width of the per-hour feature row built in get_X_y:
    # tweet count, retweet sum, follower sum, follower max, hour of day.
    _BASE_FEATURE_COUNT = 5
    # Number of additional per-hour columns appended when args['extra_features'] is truthy.
    _EXTRA_FEATURE_COUNT = 9

    def get_X_y(self, args):
        """Aggregate per-tweet features per hour and build windowed X / y.

        Parameters
        ----------
        args : dict
            'features'          : DataFrame with a 'time' column plus 'followers'
                                  and 'retweets' (and the extra columns produced
                                  by get_features when 'extra_features' is set).
            'extra_features'    : truthy to append the 9 extra hourly features.
            'window'            : number of past hours used to predict the next hour.
            'avg_past_features' : optional; if truthy the past ``window`` hours are
                                  averaged, otherwise they are concatenated.

        Returns
        -------
        (X_window, y)
            X_window has one row per predicted hour; y is the tweet count of
            the hour that follows each window.
        """
        df = args['features'].set_index('time')
        # One group per 60-minute bin; the key is the bin start, the value is
        # the frame of all tweets that fall into that bin.
        tweets_by_hour = df.groupby(pd.Grouper(freq='60Min'))

        use_extra = bool(args['extra_features'])
        num_hours = len(tweets_by_hour)
        # Derive the row width from what this method actually builds instead of
        # relying on notebook-level globals defined in other cells.
        num_features = self._BASE_FEATURE_COUNT + (self._EXTRA_FEATURE_COUNT if use_extra else 0)

        X = np.zeros((num_hours, num_features))

        for i, (hour_start, hour_df) in enumerate(tweets_by_hour):
            row = [len(hour_df), hour_df.retweets.sum(), hour_df.followers.sum(),
                   hour_df.followers.max(), hour_start.hour]

            if use_extra:
                row.extend([hour_df.user_tweet_count.mean(),
                            hour_df.passivity.sum(), hour_df.impression_count.mean(),
                            hour_df.ranking_score.mean(), hour_df.mention_count.sum(),
                            hour_df.density.sum(), hour_df.degree.sum(),
                            hour_df.url_count.sum(), hour_df.hashtag_count.sum()])

            X[i, :] = row

        window = args['window']
        # Target: tweet count (column 0) of the hour right after each window.
        y = X[window:, 0]
        # Hours without tweets produce NaN for max/mean; treat them as 0.
        X = np.nan_to_num(X)

        if args.get('avg_past_features', False):
            # Average the features of the past 'window' hours.
            X_window = [np.mean(X[i:i + window, :], axis=0) for i in range(num_hours - window)]
        else:
            # Concatenate the features of the past 'window' hours.
            X_window = np.zeros((num_hours - window, num_features * window))
            for i in range(num_hours - window):
                X_window[i, :] = np.concatenate([X[i + k, :] for k in range(window)])

        X_window = np.nan_to_num(X_window)

        #X_window = self.scale_data(X_window)
        #y = self.scale_data(y.reshape(-1,1))

        return X_window, y

    def format_date(self, date_string):
        """Parse a timestamp such as 'Sun Feb 01 12:00:00 +0000 2015' into a datetime."""
        return datetime.strptime(date_string, '%a %b %d %H:%M:%S %z %Y')

    def get_passivity(self, account_creation_date, tweet_date, tweets_posted):
        """Return account age in whole days divided by (1 + tweets_posted).

        Both dates are strings in the format accepted by ``format_date``.
        The ``1 +`` avoids a division by zero for accounts with no tweets.
        """
        account_age = self.format_date(tweet_date) - self.format_date(account_creation_date)
        return account_age.days / (1 + tweets_posted)

    def get_features(self, args):
        """Read the tweet files and store a per-tweet DataFrame in ``args['features']``.

        Parameters
        ----------
        args : dict
            'tweettags'      : iterable of hashtags; each maps to the file
                               ``directory + 'tweets_#' + tag + '.txt'``.
            'file_name'      : optional; if truthy this file is read instead
                               (once per tag).
            'citation_date'  : optional, default True; use the tweet's
                               'citation_date' timestamp, else 'firstpost_date'.
            'extra_features' : truthy to include the extra columns in the frame.

        Each non-blank line of a file is parsed as one JSON tweet record.

        Returns
        -------
        dict
            The same ``args`` object, with ``args['features']`` set to the DataFrame.
        """
        base_columns = ['time', 'followers', 'retweets']
        extra_columns = ['favorite_count', 'user_tweet_count', 'passivity', 'impression_count',
                         'ranking_score', 'user_id', 'mention_count', 'density', 'degree',
                         'url_count', 'hashtag_count']
        records = {name: [] for name in base_columns + extra_columns}

        for tag in args['tweettags']:

            file_name = args.get('file_name') or directory + 'tweets_#' + tag + '.txt'

            # 'with' guarantees the file handle is closed even if a line fails to parse.
            with open(file_name, 'r', encoding='utf8') as tweet_file:
                for line in tweet_file:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline at end of file).
                        continue

                    tweet = json.loads(line)

                    tweet_data = tweet['tweet']
                    user_data = tweet_data['user']
                    num_retweets = tweet['metrics']['citations']['total']
                    mentions = len(tweet_data['entities']['user_mentions'])

                    timestamp = tweet['citation_date'] if args.get("citation_date", True) else tweet["firstpost_date"]

                    records['time'].append(datetime.fromtimestamp(timestamp, pst_tz))
                    records['followers'].append(tweet['author']['followers'])
                    records['retweets'].append(num_retweets)
                    records['favorite_count'].append(tweet_data['favorite_count'])
                    records['mention_count'].append(mentions)
                    records['user_tweet_count'].append(user_data['statuses_count'])
                    records['impression_count'].append(tweet['metrics']['impressions'])
                    records['ranking_score'].append(tweet['metrics']['ranking_score'])
                    records['user_id'].append(user_data['id'])
                    records['passivity'].append(
                        self.get_passivity(user_data['created_at'], tweet_data['created_at'],
                                           user_data['statuses_count']))

                    # Guard the denominators: density needs at least 2 retweets, degree at least 1.
                    records['density'].append(
                        0 if num_retweets <= 1 else mentions / abs(num_retweets * (num_retweets - 1)))
                    records['degree'].append(0 if num_retweets <= 0 else (2 * mentions) / num_retweets)
                    records['url_count'].append(len(tweet_data['entities']['urls']))
                    records['hashtag_count'].append(tweet['title'].count('#'))

        column_names = list(base_columns)
        if args['extra_features']:
            column_names.extend(extra_columns)

        # Build the frame column-wise so each column gets a proper dtype; the
        # explicit `columns=` pins the column order.
        df = pd.DataFrame({name: records[name] for name in column_names}, columns=column_names)

        args['features'] = df

        return args

In [3]:
# Question 3: fit one OLS model per hashtag on the five basic hourly features
# and report in-sample error metrics together with the statsmodels summary.
window = 1
avg_past_features = False
default_feature_count = 5 #number of tweets, retweets, followers, max followers and hour of the day

args = dict(extra_features=False, window=window,
            avg_past_features=avg_past_features, scale_input=False)
utils_obj = tweets()

for tag in tweettags:
    # Header for this hashtag's report.
    print("=" * 50)
    print("\nLinear Regression for ", tag)
    print("\nWindow = ", window)

    # Load the tweets of this hashtag only, then build the windowed design matrix.
    args["tweettags"] = [tag]
    features = utils_obj.get_features(args)
    X, y = utils_obj.get_X_y(features)

    # Fit on the full data set; predict() without arguments returns the in-sample fit.
    predictor_model = stats_api.OLS(y, X).fit()
    in_sample_pred = predictor_model.predict()
    mse = mean_squared_error(y, in_sample_pred)

    print("MSE - ", mse)
    print("RMSE - ", np.sqrt(mse))
    print("MAE- ", mean_absolute_error(y, in_sample_pred))
    print(predictor_model.summary())


Linear Regression for  gohawks

Window =  1
MSE -  760963.8885815816
RMSE -  872.3324415505717
MAE-  201.128491188395
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.504
Model:                            OLS   Adj. R-squared:                  0.500
Method:                 Least Squares   F-statistic:                     116.5
Date:                Mon, 11 Jun 2018   Prob (F-statistic):           7.10e-85
Time:                        10:38:35   Log-Likelihood:                -4733.9
No. Observations:                 578   AIC:                             9478.
Df Residuals:                     573   BIC:                             9500.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------

MSE -  16199530.452681413
RMSE -  4024.8640290923386
MAE-  562.0363584604992
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.808
Model:                            OLS   Adj. R-squared:                  0.807
Method:                 Least Squares   F-statistic:                     486.4
Date:                Mon, 11 Jun 2018   Prob (F-statistic):          3.24e-204
Time:                        10:43:48   Log-Likelihood:                -5656.6
No. Observations:                 582   AIC:                         1.132e+04
Df Residuals:                     577   BIC:                         1.134e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------