# Basic Regression Models

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # to split data
import statsmodels.formula.api as smf # quantile regression
from sklearn.ensemble import GradientBoostingRegressor # gradient boosting regression

## Import pre-processed data

In [4]:
# Locations of the pre-processed pickles, relative to the notebook's working directory.
# When running from inside the repository layout instead, the same files are at
# "../Preprocessing/Data/train_processed.pkl" and "../Preprocessing/Data/eval_processed.pkl".
train_pickle_path = "Preprocessing_Data_train_processed.pkl"
eval_pickle_path = "Preprocessing_Data_eval_processed.pkl"

# NOTE(review): unpickling executes arbitrary code; only load files produced by the
# project's own preprocessing step.
train_data = pd.read_pickle(train_pickle_path)
eval_data = pd.read_pickle(eval_pickle_path)

In [8]:
# Stratified 70/30 train/test split on the continuous target 'retweet_count'.
# The whole train_data frame is passed as the feature set, so X_train / X_test
# presumably still carry the 'retweet_count' column (verify against scsplit's output);
# modelling code should therefore select its feature columns explicitly.
# random_state pins the shuffle so that Restart & Run All reproduces the same partition;
# metrics will differ from figures recorded with an unseeded split.
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweet_count'], stratify=train_data['retweet_count'],
                                           train_size=0.7, test_size=0.3, random_state=42)

## Constant null prediction

In [11]:
# Baseline: predict zero retweets for every tweet in the test split.
y_pred = np.zeros(len(y_test))

# Boolean mask over X_test: True where the 'urls' field is not null.
urltrue_test = X_test['urls'].notna()

print("We input a value of 0 for all #retweets")

# Report the error separately on each sub-population, then on the whole test split.
mae_with_url = mean_absolute_error(y_true=y_test[urltrue_test], y_pred=y_pred[urltrue_test])
print("MAE on tweets with url:", mae_with_url)

mae_without_url = mean_absolute_error(y_true=y_test[~urltrue_test], y_pred=y_pred[~urltrue_test])
print("MAE on tweets without url:", mae_without_url)

mae_overall = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Overall MAE:", mae_overall)

We input a value of 0 for all #retweets
MAE on tweets with url: 272.4794352267956
MAE on tweets without url: 85.13883699464648
Overall MAE: 145.45749346631018


## Quantile Regression

In [24]:
# Start from the all-zero prediction; only rows with a non-null 'urls' field are
# overwritten below, so the remaining rows keep a prediction of 0.
y_pred = np.zeros(len(y_test))

# Boolean masks: True where the 'urls' field is not null.
urltrue_train = X_train['urls'].notna()
urltrue_test = X_test['urls'].notna()

# Regressors used on the right-hand side of the formula below.
quantreg_features = ['user_followers_count', 'user_verified', 'url_count', 'text_length']

# Design frame for the formula API: selected regressors plus the target,
# restricted to training rows with a non-null 'urls' field.
reg_data = pd.concat(
    [X_train.loc[urltrue_train, quantreg_features], y_train[urltrue_train]],
    axis=1,
)

# Quantile regression at q = 0.5, i.e. a fit of the conditional median.
mod = smf.quantreg('retweet_count ~ user_followers_count + user_verified + url_count + text_length', reg_data)
res = mod.fit(q=0.5, max_iter=100)

# Fill in predictions only for the test rows selected by the mask.
y_pred[urltrue_test] = res.predict(X_test.loc[urltrue_test, quantreg_features])

mae_with_url = mean_absolute_error(y_true=y_test[urltrue_test], y_pred=y_pred[urltrue_test])
print("MAE on tweets with url:", mae_with_url)

mae_without_url = mean_absolute_error(y_true=y_test[~urltrue_test], y_pred=y_pred[~urltrue_test])
print("MAE on tweets without url:", mae_without_url)

mae_overall = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Overall MAE:", mae_overall)

MAE on tweets with url: 267.7827013405926
MAE on tweets without url: 85.13883699464648
Overall MAE: 143.94527091287495




## Gradient Boosting Regression

In [25]:
# Start from the all-zero prediction; only rows with a non-null 'urls' field are
# overwritten below, so the remaining rows keep a prediction of 0.
y_pred = np.zeros(len(y_test))

# Boolean masks: True where the 'urls' field is not null.
urltrue_train = pd.notna(X_train['urls'])
urltrue_test = pd.notna(X_test['urls'])

# Single definition of the feature columns so that fit and predict cannot drift apart.
gb_features = ['user_followers_count', 'user_verified', 'url_count', 'text_length']

# Gradient boosting trained with the absolute-error loss, matching the MAE metric reported below.
# The former spelling loss='lad' was deprecated in scikit-learn 1.0 and removed in 1.2;
# 'absolute_error' is the same loss under its current name (requires scikit-learn >= 1.0).
# random_state makes the fitted model reproducible across kernel restarts.
reg = GradientBoostingRegressor(loss='absolute_error', random_state=42)

reg.fit(X_train.loc[urltrue_train, gb_features], y_train[urltrue_train])

# Fill in predictions only for the test rows selected by the mask.
y_pred[urltrue_test] = reg.predict(X_test.loc[urltrue_test, gb_features])

print("MAE on tweets with url:", mean_absolute_error(y_true=y_test[urltrue_test], y_pred=y_pred[urltrue_test]))
print("MAE on tweets without url:", mean_absolute_error(y_true=y_test[~urltrue_test], y_pred=y_pred[~urltrue_test]))
print("Overall MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

MAE on tweets with url: 257.7954116419464
MAE on tweets without url: 85.13883699464648
Overall MAE: 140.72963104570042
