In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [2]:
# Load the training data
tweets = pd.read_csv("data/train.csv")

In [3]:
def select_features(lower_bound, upper_bound, tweets, ransta=0):
    assert lower_bound < upper_bound, "wrong bounds"
    keep_time = False
    X = tweets[tweets.retweet_count >= lower_bound].copy()
    X_big = X.copy()
    X = X[X.retweet_count < upper_bound]
    X_low = tweets[tweets.retweet_count < lower_bound].copy()
    y = X.retweet_count.copy()
    y_big = X_big.retweet_count.copy()
    y_low = X_low.retweet_count.copy()
    X = X[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']]
    X_big = X_big[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']]
    X_low = X_low[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']]
    # adding extra features : size of the text
    #X.insert(0, 'size_text', X.text.apply(lambda x: len(x)), True)
    #X_big.insert(0, 'size_text', X_big.text.apply(lambda x: len(x)), True)
    #X_low.insert(0, 'size_text', X_low.text.apply(lambda x: len(x)), True)

    # adding extra features : number of hashtags
    def count_hashtags_in_text(text):
        return text.count('#')
    #X.insert(0, 'hashtag_count', X.text.apply(count_hashtags_in_text), True)
    #X_big.insert(0, 'hashtag_count', X_big.text.apply(count_hashtags_in_text), True)
    #X_low.insert(0, 'hashtag_count', X_low.text.apply(count_hashtags_in_text), True)

    # Converting timestamp in hour
    def timestamp_13_digits_to_hour(t):
        dt = datetime.fromtimestamp(t / 1000)
        return dt.hour
    #X.timestamp = X.timestamp.apply(timestamp_13_digits_to_hour)
    #X_big.timestamp = X_big.timestamp.apply(timestamp_13_digits_to_hour)
    #X_low.timestamp = X_low.timestamp.apply(timestamp_13_digits_to_hour)
    #keep_time = True

    # Converting the True / False values of user_verified into 1 / 0
    X['user_verified'] = (X['user_verified']).astype(int)
    X_big['user_verified'] = (X_big['user_verified']).astype(int)
    X_low['user_verified'] = (X_low['user_verified']).astype(int)

    # Remove text feature
    X.drop('text', axis=1, inplace=True)
    X_big.drop('text', axis=1, inplace=True)
    X_low.drop('text', axis=1, inplace=True)
    # Eventually remove timestamp
    if not keep_time:
        X.drop('timestamp', axis=1, inplace=True)
        X_big.drop('timestamp', axis=1, inplace=True)
        X_low.drop('timestamp', axis=1, inplace=True)
    # Mapping extreme datas to a smaller interval
    if 'hashtag_count' in X.columns:
        X.loc[X.hashtag_count > 5, 'hashtag_count'] = 5
        X_big.loc[X_big.hashtag_count > 5, 'hashtag_count'] = 5
    if 'size_text' in X.columns:
        X.loc[X.size_text > 400, 'size_text'] = 400
        X_big.loc[X_big.size_text > 400, 'size_text'] = 400
    # Taking the log of some datas
    X.user_statuses_count = np.log(1 + X.user_statuses_count)
    X.user_followers_count = np.log(1 + X.user_followers_count)
    X_big.user_statuses_count = np.log(1 + X_big.user_statuses_count)
    X_big.user_followers_count = np.log(1 + X_big.user_followers_count)
    
    return X, X_big, X_low, y, y_big, y_low

In [4]:
def center_normalize(X, X_big):
    X = X - X.mean()
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    X_big = X_big - X_big.mean()
    scaler = MinMaxScaler()
    X_big = pd.DataFrame(scaler.fit_transform(X_big), columns=X_big.columns)
    
    return X, X_big

def prepare_data(lower_bound, upper_bound, tweets, ransta=0):
    X, X_big, X_low, y, y_big, y_low = select_features(lower_bound, upper_bound, tweets, ransta)
    X, X_big = center_normalize(X, X_big)
    
    return X, X_big, X_low, y, y_big, y_low

def cv_worst(estimators, X, y, verbose=True):
    mae_worst = 0
    esti_worst = None
    for esti in estimators:
        y_pred = esti.predict(X).astype(int)
        cur_mae = mean_absolute_error(y_true=y, y_pred=y_pred)
        if cur_mae > mae_worst:
            mae_worst = cur_mae
            esti_worst = esti
    if verbose:
        print("Prediction error :", mae_worst)
    
    return esti_worst

def error_true(predictor, X_big, y_big, X_low, y_low):
    y_low_pred = y_low * 0
    y_big_pred = predictor.predict(X_big).astype(int)
    y_pred = np.append(y_low_pred, y_big_pred)
    y_true = np.append(y_low, y_big)
    mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
    print("Prediction error on true datas :", mae)

Goal : beat the constant 0 prediction !

In [5]:
print("Constant 0 prediction error :", mean_absolute_error(tweets.retweet_count, tweets.retweet_count*0))

Constant 0 prediction error : 147.6873983330755


Trying to run the linear regression for different bounds

In [6]:
%%time
for lower_bound in range(290, 500, 10):
    for upper_bound in range(1000, 4002, 200):
        X, X_big, X_low, y, y_big, y_low = prepare_data(lower_bound, upper_bound, tweets)
        # Applying linear regression
        from sklearn.linear_model import LinearRegression
        lr = LinearRegression(fit_intercept=False)
        lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
        lr = cv_worst(lr_estimators, X, y, verbose=False)
        print('[', lower_bound, ',', upper_bound, "] :", end='')
        error_true(lr, X_big, y_big, X_low, y_low)
    print('')

[ 290 , 1000 ] :Prediction error on true datas : 133.055533609602
[ 290 , 1200 ] :Prediction error on true datas : 132.40877801426004
[ 290 , 1400 ] :Prediction error on true datas : 131.95592968816885
[ 290 , 1600 ] :Prediction error on true datas : 131.62538657838886
[ 290 , 1800 ] :Prediction error on true datas : 131.3451486008078
[ 290 , 2000 ] :Prediction error on true datas : 131.16585883862012
[ 290 , 2200 ] :Prediction error on true datas : 131.0211977884487
[ 290 , 2400 ] :Prediction error on true datas : 130.9324773910784
[ 290 , 2600 ] :Prediction error on true datas : 130.8701847615643
[ 290 , 2800 ] :Prediction error on true datas : 130.81501163302428
[ 290 , 3000 ] :Prediction error on true datas : 130.78782234892464
[ 290 , 3200 ] :Prediction error on true datas : 130.7825127632826
[ 290 , 3400 ] :Prediction error on true datas : 130.80162576958952
[ 290 , 3600 ] :Prediction error on true datas : 130.82281454601164
[ 290 , 3800 ] :Prediction error on true datas : 130.87

KeyboardInterrupt: 

In [7]:
ransta = 0
lower_bound = 10
upper_bound = 1000
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=ransta)

Trying SGD regression with huber loss

In [8]:
%%time
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=1000, tol=1e-4, random_state=ransta, loss='huber')
sgd_estimators = cross_validate(sgd, X, y, return_estimator=True)['estimator']
sgd = cv_worst(sgd_estimators, X, y)
error_true(sgd, X_big, y_big, X_low, y_low)

Prediction error : 99.73031627040454
Prediction error on true datas : 144.09929000250835
Wall time: 5.42 s


Trying linear regression

In [9]:
%%time
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False)
lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
lr = cv_worst(lr_estimators, X, y)
error_true(lr, X_big, y_big, X_low, y_low)

Prediction error : 117.89967397090135
Prediction error on true datas : 144.8003280377664
Wall time: 178 ms


Trying ridge regression : it does not improve the linear regression

In [10]:
%%time
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.5, fit_intercept=False, random_state=ransta)
ridge_estimators = cross_validate(ridge, X, y, return_estimator=True)['estimator']
ridge = cv_worst(ridge_estimators, X, y)
error_true(ridge, X_big, y_big, X_low, y_low)

Prediction error : 117.90016190560681
Prediction error on true datas : 144.80072156292573
Wall time: 173 ms


Trying bayesian ridge

In [11]:
%%time
from sklearn.linear_model import BayesianRidge
bay = BayesianRidge(fit_intercept=False)
bay_estimators = cross_validate(bay, X, y, return_estimator=True)['estimator']
bay = cv_worst(bay_estimators, X, y)
error_true(bay, X_big, y_big, X_low, y_low)

Prediction error : 117.89957416607523
Prediction error on true datas : 144.80068851882837
Wall time: 203 ms


Trying generalized linear regression Power. Distribution
0 : Normal.
1 : Poisson.
2 : Gamma.
3 : Inverse Gaussian.

In [12]:
%%time
from sklearn.linear_model import TweedieRegressor
gen = TweedieRegressor(power=2, alpha=1, link='log', fit_intercept=False)
gen_estimators = cross_validate(gen, X, y, return_estimator=True)['estimator']
gen = cv_worst(gen_estimators, X, y)
error_true(gen, X_big, y_big, X_low, y_low)

Prediction error : 104.27812278211498
Prediction error on true datas : 144.67818203392426
Wall time: 1.07 s


Trying passive aggressive regression

In [13]:
%%time
from sklearn.linear_model import PassiveAggressiveRegressor
par = PassiveAggressiveRegressor(max_iter=100, random_state=ransta, tol=1e-3, fit_intercept=False)
par_estimators = cross_validate(par, X, y, return_estimator=True)['estimator']
par = cv_worst(par_estimators, X, y)
error_true(par, X_big, y_big, X_low, y_low)

Prediction error : 98.82284643364088
Prediction error on true datas : 143.9007730816775
Wall time: 797 ms


Trying gamma regression

In [14]:
%%time
from sklearn.linear_model import GammaRegressor
gam = GammaRegressor(alpha=0.001)
gam_estimators = cross_validate(gam, X, y, return_estimator=True)['estimator']
gam = cv_worst(gam_estimators, X, y)
error_true(gam, X_big, y_big, X_low, y_low)

Prediction error : 117.18571460255501
Prediction error on true datas : 144.71702537035674
Wall time: 1.09 s


Trying poisson regression

In [15]:
%%time
from sklearn.linear_model import PoissonRegressor
poi = PoissonRegressor(alpha=0.001)
poi_estimators = cross_validate(poi, X, y, return_estimator=True)['estimator']
poi = cv_worst(poi_estimators, X, y)
error_true(poi, X_big, y_big, X_low, y_low)

Prediction error : 116.97436124911285
Prediction error on true datas : 144.53743520728412
Wall time: 910 ms


Trying k-nn regression : why does it fool the cross validation ??? Very sensitive to feature change

In [16]:
%%time
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=1)
knn_estimators = cross_validate(knn, X, y, return_estimator=True)['estimator']
knn = cv_worst(knn_estimators, X, y)
error_true(knn, X_big, y_big, X_low, y_low)
knn.fit(X_train, y_train)
p = knn.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))

Prediction error : 27.86419889992903
Prediction error on true datas : 144.75071532960737

Prediction error on a train / test split : 126.74427974716298
Wall time: 34 s


Trying logistic regression - I am not sure about what it does and how it works

In [17]:
%%time
from sklearn.linear_model import LogisticRegression
sample = 100
logi = LogisticRegression() # penalty='l1', solver='saga', random_state=0, fit_intercept=False)
logi.fit(X_train[:sample], y_train[:sample])
p = logi.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))
error_true(logi, X_big, y_big, X_low, y_low)


Prediction error on a train / test split : 111.02007171108565
Prediction error on true datas : 146.0392008134856
Wall time: 194 ms


Trying gradient boosting regression

In [18]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(loss='lad', criterion='friedman_mse', random_state=ransta)
gbr_estimators = cross_validate(gbr, X, y, return_estimator=True)['estimator']
gbr = cv_worst(gbr_estimators, X, y)
error_true(gbr, X_big, y_big, X_low, y_low)

Prediction error : 94.91043071327182
Prediction error on true datas : 142.69297527550515
Wall time: 1min 9s


Trying neural network (MLP regression)

In [19]:
%%time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
mlp = Sequential()
input_shape = X.iloc[0].shape
mlp.add(Dense(16, input_shape=input_shape, activation='relu'))
mlp.add(Dense(8, activation='relu'))
mlp.add(Dense(1, activation='linear'))
mlp.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])
mlp.fit(X_test, y_test, epochs=10, batch_size=10, verbose=0, validation_split=0.2)
p = mlp.predict(X_test).astype(int)
print("Prediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))
error_true(mlp, X_big, y_big, X_low, y_low)

Prediction error on a train / test split : 98.47406941928807
Prediction error on true datas : 143.59846915709014
Wall time: 41.8 s


Trying SVM regression (too long, but seems to have nice results)

In [20]:
%%time
from sklearn import svm
sample = 5000
svmr = svm.SVR(kernel='poly')
svmr_estimators = cross_validate(svmr, X[:sample], y[:sample], return_estimator=True)['estimator']
svmr = cv_worst(svmr_estimators, X, y)
error_true(svmr, X_big, y_big, X_low, y_low)

Prediction error : 97.06281050390348
Prediction error on true datas : 143.40786930158296
Wall time: 39.7 s


Trying random forest regression with mse

In [21]:
%%time
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=ransta, criterion='mse')
rf_estimators = cross_validate(rf, X, y, return_estimator=True)['estimator']
rf = cv_worst(rf_estimators, X, y)
rf_pred = (rf.predict(X_test)).astype(int)
error_true(rf, X_big, y_big, X_low, y_low)

Prediction error : 117.40592840667139
Prediction error on true datas : 144.85139018019547
Wall time: 31.8 s


Trying random forest regression with mae - poor implementation in sklearn with complexity in O(n^2)

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1, max_depth=1, random_state=ransta, criterion='mae')
rf_estimators = cross_validate(rf, X, y, return_estimator=True)['estimator']
rf = cv_worst(rf_estimators, X, y)
rf_pred = (rf.predict(X_test)).astype(int)
val, occ = np.unique(rf_pred, return_counts=True)
print('\n', val, "with respecting occurences", occ)
error_true(rf, X_big, y_big, X_low, y_low)

In [106]:
#X.insert(3, 'stat_sqr', X.user_statuses_count ** 2, True)
#X.insert(4, 'followers_sqr', X.user_followers_count ** 2, True)
#X.insert(5, 'verified_exp', np.exp(X.user_verified), True)

#X_big.insert(3, 'stat_sqr', X_big.user_statuses_count ** 2, True)
#X_big.insert(4, 'followers_sqr', X_big.user_followers_count ** 2, True)
#X_big.insert(5, 'verified_exp', np.exp(X_big.user_verified), True)