In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [2]:
# Load the training data
tweets = pd.read_csv("data/train.csv")

In [3]:
ransta = 0
X = tweets[tweets.retweet_count > 10].copy()
X = X[X.retweet_count < 1000]
X.shape

(86488, 11)

In [4]:
y = X.retweet_count.copy()
X = X[['user_verified', 'user_statuses_count', 'user_followers_count']]
# adding extra features : size of the text
#X.insert(0, 'size_text', tweets.text.apply(lambda x: len(x)), True)

# adding extra features : number of hashtags
def count_hashtags_in_text(text):
    return text.count('#')
#X.insert(0, 'hashtag_count', tweets.text.apply(count_hashtags_in_text), True)

# Converting timestamp in hour
def timestamp_13_digits_to_hour(t):
    dt = datetime.fromtimestamp(t / 1000)
    return dt.hour
#X.timestamp = X.timestamp.apply(timestamp_13_digits_to_hour)

# Converting the True / False values of user_verified into 1 / 0
X['user_verified'] = (X['user_verified']).astype(int)

X.head(3)

Unnamed: 0,user_verified,user_statuses_count,user_followers_count
17,0,15848,1257
24,0,10424,3083
43,0,24,146


In [5]:
X.user_statuses_count = np.log(1 + X.user_statuses_count)
X.user_followers_count = np.log(1 + X.user_followers_count)

In [6]:
#X.insert(3, 'stat_sqr', X.user_statuses_count ** 2, True)
#X.insert(4, 'followers_sqr', X.user_followers_count ** 2, True)
#X.insert(5, 'verified_exp', np.exp(X.user_verified), True)

In [7]:
# Center and normalize the datas
X = X - X.mean()
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X.head(3)

Unnamed: 0,user_verified,user_statuses_count,user_followers_count
0,0.0,0.602709,0.385741
1,0.0,0.574586,0.434204
2,0.0,0.169562,0.269712


In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
def cv_worst(estimators):
    mae_test, mae_train = 0, 0
    sgd = None
    for esti in estimators:
        y_test_sgd = esti.predict(X_test)
        y_test_sgd = y_test_sgd.astype(int)
        y_train_sgd = esti.predict(X_train)
        y_train_sgd = y_train_sgd.astype(int)
        cur_mae_test = mean_absolute_error(y_true=y_test, y_pred=y_test_sgd)
        cur_mae_train = mean_absolute_error(y_true=y_train, y_pred=y_train_sgd)
        if cur_mae_test > mae_test:
            mae_test = cur_mae_test
            mae_train = cur_mae_train
            sgd = esti
    print("Prediction error on test set :", mae_test)
    print("Prediction error on train set :", mae_train)
    
    return sgd

Trying SGD regression with huber loss

In [10]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=1000, tol=1e-4, random_state=ransta, loss='huber')
sgd_estimators = cross_validate(sgd, X, y, return_estimator=True)['estimator']
sgd = cv_worst(sgd_estimators)

Prediction error on test set : 102.80922650017344
Prediction error on train set : 102.50742472043738


Trying linear regression

In [11]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False)
lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
lr = cv_worst(lr_estimators)

Prediction error on test set : 120.77373106717539
Prediction error on train set : 120.55385606448523


Trying random forest regression with mae - poor implementation in sklearn with complexity in O(n^2)

In [12]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1, max_depth=1, random_state=ransta, criterion='mae')
rf_estimators = cross_validate(rf, X, y, return_estimator=True)['estimator']
rf = cv_worst(rf_estimators)

Prediction error on test set : 101.72239565267661
Prediction error on train set : 101.34295766505343


Trying ridge regression : it does not improve the linear regression

In [13]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.5, fit_intercept=False, random_state=ransta)
ridge_estimators = cross_validate(ridge, X, y, return_estimator=True)['estimator']
ridge = cv_worst(ridge_estimators)

Prediction error on test set : 120.77430916869002
Prediction error on train set : 120.55370740489916


Trying bayesian ridge

In [14]:
from sklearn.linear_model import BayesianRidge
bay = BayesianRidge(fit_intercept=False)
bay_estimators = cross_validate(bay, X, y, return_estimator=True)['estimator']
bay = cv_worst(bay_estimators)

Prediction error on test set : 120.77384668747833
Prediction error on train set : 120.55362481624023


Trying generalized linear regression Power. Distribution
0 : Normal.
1 : Poisson.
2 : Gamma.
3 : Inverse Gaussian.

In [15]:
from sklearn.linear_model import TweedieRegressor
gen = TweedieRegressor(power=2, alpha=1, link='log', fit_intercept=False)
gen_estimators = cross_validate(gen, X, y, return_estimator=True)['estimator']
gen = cv_worst(gen_estimators)

Prediction error on test set : 108.22684703433923
Prediction error on train set : 107.76744685419798


Trying passive aggressive regression

In [16]:
from sklearn.linear_model import PassiveAggressiveRegressor
par = PassiveAggressiveRegressor(max_iter=100, random_state=ransta, tol=1e-3, fit_intercept=False)
par_estimators = cross_validate(par, X, y, return_estimator=True)['estimator']
par = cv_worst(par_estimators)

Prediction error on test set : 102.68593671715419
Prediction error on train set : 102.19758510761302


Trying gamma regression

In [17]:
from sklearn.linear_model import GammaRegressor
gam = GammaRegressor(alpha=0.001)
gam_estimators = cross_validate(gam, X, y, return_estimator=True)['estimator']
gam = cv_worst(gam_estimators)

Prediction error on test set : 120.2298146221143
Prediction error on train set : 120.08248955253464


Trying poisson regression

In [18]:
from sklearn.linear_model import PoissonRegressor
poi = PoissonRegressor(alpha=0.001)
poi_estimators = cross_validate(poi, X, y, return_estimator=True)['estimator']
poi = cv_worst(poi_estimators)

Prediction error on test set : 120.07962384861449
Prediction error on train set : 119.89253563700633


Trying k-nn regression : why does it fool the cross validation ???

In [19]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=1)
knn_estimators = cross_validate(knn, X, y, return_estimator=True)['estimator']
knn = cv_worst(knn_estimators)
knn.fit(X_train, y_train)
p = knn.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))

Prediction error on test set : 28.960342236096658
Prediction error on train set : 27.83893559736377

Prediction error on a train / test split : 129.66204185454967


Trying logistic regression - I am not sure about what it does and how it works

In [20]:
from sklearn.linear_model import LogisticRegression
sample = 100
logi = LogisticRegression()#penalty='l1', solver='saga', random_state=0, fit_intercept=False)
logi.fit(X_train[:sample], y_train[:sample])
p = logi.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))


Prediction error on a train / test split : 104.29571819478167


Trying neural network (MLP regression)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
mlp = Sequential()
input_shape = X.iloc[0].shape
mlp.add(Dense(16, input_shape=input_shape, activation='relu'))
mlp.add(Dense(8, activation='relu'))
mlp.add(Dense(1, activation='linear'))
mlp.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])
mlp.fit(X_test, y_test, epochs=10, batch_size=10, verbose=1, validation_split=0.2)
p = mlp.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Prediction error on a train / test split : 101.47427448259914


Trying SVM regression (too long, but seems to have nice results)

In [23]:
from sklearn import svm
sample = 10000
svmr = svm.SVR(kernel='poly')
svmr_estimators = cross_validate(svmr, X[:sample], y[:sample], return_estimator=True)['estimator']
svmr = cv_worst(svmr_estimators)

Prediction error on test set : 99.85828804871468
Prediction error on train set : 99.49754711682992
