In [194]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import GammaRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from datetime import datetime
import statsmodels.discrete.count_model as reg_models
import statsmodels

In [222]:
# Load the training data
tweets = pd.read_csv("data/train.csv")

In [233]:
X = tweets[tweets.retweet_count > 10].copy()
X = X[X.retweet_count < 1000]
X.shape

(86488, 11)

In [234]:
y = X.retweet_count.copy()
X = X[['user_verified', 'user_statuses_count', 'user_followers_count']]
# adding extra features : size of the text
#X.insert(0, 'size_text', tweets.text.apply(lambda x: len(x)), True)

# adding extra features : number of hashtags
def count_hashtags_in_text(text):
    return text.count('#')
X.insert(0, 'hashtag_count', tweets.text.apply(count_hashtags_in_text), True)

# Converting timestamp in hour
def timestamp_13_digits_to_hour(t):
    dt = datetime.fromtimestamp(t / 1000)
    return dt.hour
#X.timestamp = X.timestamp.apply(timestamp_13_digits_to_hour)

# Converting the True / False values of user_verified into 1 / 0
X['user_verified'] = (X['user_verified']).astype(int)

X.head(3)

Unnamed: 0,hashtag_count,user_verified,user_statuses_count,user_followers_count
17,0,0,15848,1257
24,0,0,10424,3083
43,7,0,24,146


In [235]:
X.user_statuses_count = np.log(1 + X.user_statuses_count)
X.user_followers_count = np.log(1 + X.user_followers_count)

In [236]:
# Center and normalize the datas
X = X - X.mean()
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X.head(3)

Unnamed: 0,hashtag_count,user_verified,user_statuses_count,user_followers_count
0,0.0,0.0,0.602709,0.385741
1,0.0,0.0,0.574586,0.434204
2,0.241379,0.0,0.169562,0.269712


In [237]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [238]:
def cv_worst(estimators):
    mae_test, mae_train = 0, 0
    sgd = None
    for esti in estimators:
        y_test_sgd = esti.predict(X_test)
        y_test_sgd = y_test_sgd.astype(int)
        y_train_sgd = esti.predict(X_train)
        y_train_sgd = y_train_sgd.astype(int)
        cur_mae_test = mean_absolute_error(y_true=y_test, y_pred=y_test_sgd)
        cur_mae_train = mean_absolute_error(y_true=y_train, y_pred=y_train_sgd)
        if cur_mae_test > mae_test:
            mae_test = cur_mae_test
            mae_train = cur_mae_train
            sgd = esti
    print("Prediction error on test set :", mae_test)
    print("Prediction error on train set :", mae_train)
    
    return sgd

Trying SGD regression

In [239]:
sgd = SGDRegressor(max_iter=1000, tol=1e-4)
sgd_estimators = cross_validate(sgd, X, y, return_estimator=True)['estimator']
sgd = cv_worst(sgd_estimators)

Prediction error on test set : 122.35591783250472
Prediction error on train set : 122.14713995474142


Trying linear regression

In [242]:
lr.predict(X_test)

array([ 74.7800411 ,  24.21144994,  54.7927342 , ..., 146.17335787,
       152.69062832, 170.22497593])

In [240]:
lr = LinearRegression()
lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
lr = cv_worst(sgd_estimators)

Prediction error on test set : 122.35591783250472
Prediction error on train set : 122.14713995474142


In [27]:
%%time
for k in range(1, len(corr_features1) + 1):
    to_remove1 = corr_features1[k:]
    to_remove2 = corr_features2[k:]
    X1_train, X2_train = X_train.copy(), X_train.copy()
    X1_test, X2_test = X_test.copy(), X_test.copy()
    for f, _ in to_remove1:
        X1_train.drop(f, axis=1, inplace=True)
        X1_test.drop(f, axis=1, inplace=True)
    for f, _ in to_remove2:
        X2_train.drop(f, axis=1, inplace=True)
        X2_test.drop(f, axis=1, inplace=True)
    print("X1 features :", X1_train.columns)
    print("X2 features :", X2_train.columns)
    # Spearman k features
    
    sgd = SGDRegressor(max_iter=1000, tol=1e-4)
    sgd.fit(X1_train, y_train)
    y_pred = sgd.predict(X1_test)
    y_pred = map_y_inv(y_pred, mu, theta)
    print("Prediction error spearman for top", k, "features with SGD :", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    
    lin_reg = LinearRegression()
    lin_reg.fit(X1_train, y_train)
    y_pred = lin_reg.predict(X1_test)
    y_pred = map_y_inv(y_pred, mu, theta)
    print("Prediction error spearman for top", k, "features with LinReg :", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    
    #rf = RandomForestRegressor(max_depth=3, random_state=1)
    #rf.fit(X1_train, y_train)
    #y_pred = rf.predict(X1_test)
    #y_pred = map_y_inv(y_pred, mu, theta)
    #print("Prediction error spearman for top", k, "features with RF :", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    
    # Pearson k features
    
    sgd = SGDRegressor(max_iter=1000, tol=1e-4)
    sgd.fit(X2_train, y_train)
    y_pred = sgd.predict(X2_test)
    y_pred = map_y_inv(y_pred, mu, theta)
    print("Prediction error pearson  for top", k, "features with SGD :", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    
    lin_reg = LinearRegression()
    lin_reg.fit(X2_train, y_train)
    y_pred = lin_reg.predict(X2_test)
    y_pred = map_y_inv(y_pred, mu, theta)
    print("Prediction error pearson  for top", k, "features with LinReg :", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    
    #rf = RandomForestRegressor(max_depth=3, random_state=1)
    #rf.fit(X2_train, y_train)
    #y_pred = rf.predict(X2_test)
    #y_pred = map_y_inv(y_pred, mu, theta)
    #print("Prediction error pearson  for top", k, "features with RF :", mean_absolute_error(y_true=y_test, y_pred=y_pred),'\n')

X1 features : Index(['size_text'], dtype='object')
X2 features : Index(['size_text'], dtype='object')
Prediction error spearman for top 1 features with SGD : 151.09559714420178
Prediction error spearman for top 1 features with LinReg : 151.070403636837
Prediction error pearson  for top 1 features with SGD : 151.06950744490172
Prediction error pearson  for top 1 features with LinReg : 151.070403636837
X1 features : Index(['hashtag_count', 'size_text'], dtype='object')
X2 features : Index(['size_text', 'user_followers_count'], dtype='object')
Prediction error spearman for top 2 features with SGD : 151.04894509697897
Prediction error spearman for top 2 features with LinReg : 151.04478456346942
Prediction error pearson  for top 2 features with SGD : 151.33287772737742
Prediction error pearson  for top 2 features with LinReg : 156.447955781189
X1 features : Index(['hashtag_count', 'size_text', 'user_followers_count'], dtype='object')
X2 features : Index(['hashtag_count', 'size_text', 'user_

In [30]:
# Spearman correlation : for non-linear correlations
corr_features1 = [(f, spearmanr(X[f], y).correlation) for f in X.columns]
corr_features1.sort(key=lambda x: abs(x[1]))
corr_features1.reverse()
for f, c in corr_features1:
    print(f, c)

user_followers_count 0.3221642112952114
hashtag_count -0.16490851466947581
user_verified 0.15683094634737596
user_statuses_count 0.083429547724099
size_text -0.0510814980325843
user_friends_count 0.015914010304418535
timestamp 0.014855101256478074


In [31]:
# Pearson correlation : for linear correlations
corr_features2 = [(f, X[f].corr(y)) for f in X.columns]
corr_features2.sort(key=lambda x: abs(x[1]))
corr_features2.reverse()
for f, c in corr_features2:
    print(f, c)

size_text 0.01298010552309922
user_followers_count 0.002728900490642274
user_statuses_count 0.0027003617908016803
user_friends_count 0.0010311370315126398
timestamp -0.0009214990207110241
user_verified 0.00047234255320145367
hashtag_count -0.0003838755670459721


In [22]:
def write_prediction_file(eval_data, y_pred):
    with open("gbr_predictions.txt", 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        for index, prediction in enumerate(y_pred):
            writer.writerow([str(eval_data['id'].iloc[index]) , str(int(prediction))])