In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [389]:
# Load the training data
tweets = pd.read_csv("data/train.csv")

In [369]:
def select_features(lower_bound, upper_bound, tweets, ransta=0):
    assert lower_bound < upper_bound, "wrong bounds"
    keep_time = False
    X = tweets[tweets.retweet_count >= lower_bound].copy()
    X_big = X.copy()
    X = X[X.retweet_count < upper_bound]
    X_low = tweets[tweets.retweet_count < lower_bound].copy()
    y = X.retweet_count.copy()
    y_big = X_big.retweet_count.copy()
    y_low = X_low.retweet_count.copy()
    X = X[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']]
    X_big = X_big[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']]
    X_low = X_low[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']]
    # adding extra features : size of the text
    #X.insert(0, 'size_text', X.text.apply(lambda x: len(x)), True)
    #X_big.insert(0, 'size_text', X_big.text.apply(lambda x: len(x)), True)
    #X_low.insert(0, 'size_text', X_low.text.apply(lambda x: len(x)), True)

    # adding extra features : number of hashtags
    def count_hashtags_in_text(text):
        return text.count('#')
    #X.insert(0, 'hashtag_count', X.text.apply(count_hashtags_in_text), True)
    #X_big.insert(0, 'hashtag_count', X_big.text.apply(count_hashtags_in_text), True)
    #X_low.insert(0, 'hashtag_count', X_low.text.apply(count_hashtags_in_text), True)

    # Converting timestamp in hour
    def timestamp_13_digits_to_hour(t):
        dt = datetime.fromtimestamp(t / 1000)
        return dt.hour
    #X.timestamp = X.timestamp.apply(timestamp_13_digits_to_hour)
    #X_big.timestamp = X_big.timestamp.apply(timestamp_13_digits_to_hour)
    #X_low.timestamp = X_low.timestamp.apply(timestamp_13_digits_to_hour)
    #keep_time = True

    # Converting the True / False values of user_verified into 1 / 0
    X['user_verified'] = (X['user_verified']).astype(int)
    X_big['user_verified'] = (X_big['user_verified']).astype(int)
    X_low['user_verified'] = (X_low['user_verified']).astype(int)

    # Remove text feature
    X.drop('text', axis=1, inplace=True)
    X_big.drop('text', axis=1, inplace=True)
    X_low.drop('text', axis=1, inplace=True)
    # Eventually remove timestamp
    if not keep_time:
        X.drop('timestamp', axis=1, inplace=True)
        X_big.drop('timestamp', axis=1, inplace=True)
        X_low.drop('timestamp', axis=1, inplace=True)
    # Mapping extreme datas to a smaller interval
    if 'hashtag_count' in X.columns:
        X.loc[X.hashtag_count > 5, 'hashtag_count'] = 5
        X_big.loc[X_big.hashtag_count > 5, 'hashtag_count'] = 5
    if 'size_text' in X.columns:
        X.loc[X.size_text > 400, 'size_text'] = 400
        X_big.loc[X_big.size_text > 400, 'size_text'] = 400
    # Taking the log of some datas
    X.user_statuses_count = np.log(1 + X.user_statuses_count)
    X.user_followers_count = np.log(1 + X.user_followers_count)
    X_big.user_statuses_count = np.log(1 + X_big.user_statuses_count)
    X_big.user_followers_count = np.log(1 + X_big.user_followers_count)
    
    return X, X_big, X_low, y, y_big, y_low

In [370]:
def center_normalize(X, X_big):
    X = X - X.mean()
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    X_big = X_big - X_big.mean()
    scaler = MinMaxScaler()
    X_big = pd.DataFrame(scaler.fit_transform(X_big), columns=X_big.columns)
    
    return X, X_big

def prepare_data(lower_bound, upper_bound, tweets, ransta=0):
    X, X_big, X_low, y, y_big, y_low = select_features(lower_bound, upper_bound, tweets, ransta)
    X, X_big = center_normalize(X, X_big)
    
    return X, X_big, X_low, y, y_big, y_low

def cv_worst(estimators, X, y, verbose=True):
    mae_worst = 0
    esti_worst = None
    for esti in estimators:
        y_pred = esti.predict(X).astype(int)
        cur_mae = mean_absolute_error(y_true=y, y_pred=y_pred)
        if cur_mae > mae_worst:
            mae_worst = cur_mae
            esti_worst = esti
    if verbose:
        print("Prediction error :", mae_worst)
    
    return esti_worst

def error_true(predictor, X_big, y_big, X_low, y_low):
    y_low_pred = y_low * 0
    y_big_pred = predictor.predict(X_big).astype(int)
    y_pred = np.append(y_low_pred, y_big_pred)
    y_true = np.append(y_low, y_big)
    mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
    print("Prediction error on true datas :", mae)

Goal : beat the constant 0 prediction !

In [140]:
print("Constant 0 prediction error :", mean_absolute_error(tweets.retweet_count, tweets.retweet_count*0))

Constant 0 prediction error : 147.6873983330755


In [466]:
from sklearn.ensemble import RandomForestClassifier

def cluster(dataset, threshold):
    """
    Returns a copy of the parameter dataset, with an additional feature 'classes' (0, 1)
    """
    modified = pd.read_csv("data/train.csv")
    modified.user_verified = (modified.user_verified).astype(int)
    modified.user_mentions = modified.user_mentions.fillna(0)
    modified.user_mentions = modified.user_mentions.apply(lambda x: 1 if type(x) is str and x!=0 else 0) 
    modified.urls = modified.urls.fillna(0)

    modified.urls = modified.urls.apply(lambda x: 1 if type(x) is str and x!=0 else 0) 
    modified.hashtags = modified.hashtags.fillna(0)

    modified.hashtags = modified.hashtags.apply(lambda x: 1 if type(x) is str and x!=0 else 0) 
    modified.text = modified.text.fillna(0)

    modified.text = modified.text.apply(lambda x: len(x) if type(x) is str and x!=0 else 0) 

    x_less_300 = modified[modified.retweet_count<=threshold]
    x_greater_300 = modified[modified.retweet_count>threshold]

    x_sampled = pd.concat([x_less_300.sample(n=21112, random_state=1), x_greater_300], axis=0)

    y_sampled = x_sampled.retweet_count
    y_sampled = y_sampled.apply(lambda x: 1 if x>threshold else 0)

    x_sampled = x_sampled.drop('retweet_count', 1)

    #based on my test 2 was good
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(x_sampled, y_sampled)

    # test
    #validation_data = pd.read_csv("data/evaluation.csv")
    validation_data = dataset.copy()

    if 'retweet_count' in dataset.columns:
        validation_data = validation_data.drop('retweet_count', axis=1)

    validation_data.user_verified = (validation_data.user_verified).astype(int)
    validation_data.user_mentions = validation_data.user_mentions.fillna(0)
    validation_data.user_mentions = validation_data.user_mentions.apply(lambda x: 1 if type(x) is str and x!=0 else 0) 
    validation_data.urls = validation_data.urls.fillna(0)

    validation_data.urls = validation_data.urls.apply(lambda x: 1 if type(x) is str and x!=0 else 0) 
    validation_data.hashtags = validation_data.hashtags.fillna(0)

    validation_data.hashtags = validation_data.hashtags.apply(lambda x: 1 if type(x) is str and x!=0 else 0) 
    validation_data.text = validation_data.text.fillna(0)

    validation_data.text = validation_data.text.apply(lambda x: len(x) if type(x) is str and x!=0 else 0)
    
    y_classes = clf.predict(validation_data) 
    #eval_dataset['classes'] = y_classes.tolist()
    tmp = dataset.copy()
    tmp['classes'] = y_classes.tolist()
    
    return tmp

In [467]:
def transform(X_val_big_ori):
    X_val_big = X_val_big_ori[['text', 'timestamp', 'user_verified', 'user_statuses_count', 'user_followers_count']].copy()
    # adding extra features : size of the text
    #X_val_big.insert(0, 'size_text', X_val_big.text.apply(lambda x: len(x)), True)
    # adding extra features : number of hashtags
    def count_hashtags_in_text(text):
        return text.count('#')
    #X_val_big.insert(0, 'hashtag_count', X_val_big.text.apply(count_hashtags_in_text), True)
    # Converting timestamp in hour
    def timestamp_13_digits_to_hour(t):
        dt = datetime.fromtimestamp(t / 1000)
        return dt.hour
    #X_val_big.timestamp = X_va_big.timestamp.apply(timestamp_13_digits_to_hour)
    # Converting the True / False values of user_verified into 1 / 0
    X_val_big['user_verified'] = (X_val_big['user_verified']).astype(int)
    # Remove text feature
    X_val_big.drop('text', axis=1, inplace=True)
    # Eventually remove timestamp
    X_val_big.drop('timestamp', axis=1, inplace=True)
    # Mapping extreme datas to a smaller interval
    if 'hashtag_count' in X.columns:
        X_val_big.loc[X_val_big.hashtag_count > 5, 'hashtag_count'] = 5
    if 'size_text' in X.columns:
        X_val_big.loc[X_val_big.size_text > 400, 'size_text'] = 400
    # Taking the log of some datas
    X_val_big.user_statuses_count = np.log(1 + X_val_big.user_statuses_count)
    X_val_big.user_followers_count = np.log(1 + X_val_big.user_followers_count)
    
    return X_val_big

def center_normalize_X(X_ori):
    X = X_ori.copy()
    X = X - X.mean()
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    return X

In [521]:
X, X_big, X_low, y, y_big, y_low = prepare_data(20, 300, tweets)
# Applying linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False)
lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
lr = cv_worst(lr_estimators, X, y, verbose=False)
print('[', lower_bound, ',', upper_bound, "] :", end='')
error_true(lr, X_big, y_big, X_low, y_low)

[ 50 , 2000 ] :Prediction error on true datas : 142.28858311416585


In [506]:
ds = pd.read_csv("data/train.csv")
tweets2 = cluster(ds, 3000)

In [513]:
tweets[tweets.retweet_count > 200].shape

(26308, 11)

In [522]:
X_val_big = tweets2[tweets2.classes == 1].copy()
X_val_low = tweets2[tweets2.classes == 0].copy()
print("class 0 :", X_val_low.shape)
print("class 1 :", X_val_big.shape)
y_true_big = tweets2[tweets2.classes == 1].retweet_count.copy()
y_true_low = tweets2[tweets2.classes == 0].retweet_count.copy()
X_id_big = X_val_big.id.copy()
X_val_big = transform(X_val_big)
X_val_big = center_normalize_X(X_val_big)

model = lr
y_pred_low = X_val_low.id * 0 # constant zero prediction
y_pred_big = model.predict(X_val_big)
y_true = np.append(y_true_low, y_true_big)
y_pred = np.append(y_pred_low, y_pred_big)
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print("Prediction error on true datas :", mae)

class 0 : (636793, 12)
class 1 : (28984, 12)
Prediction error on true datas : 146.76942036006326


In [460]:
X_val_big = tweets2[tweets2.classes == 1].copy()
X_val_low = tweets2[tweets2.classes == 0].copy()
print("class 0 :", X_val_low.shape)
print("class 1 :", X_val_big.shape)
X_id_big = X_val_big.id.copy()
X_val_big = transform(X_val_big)
X_val_big = center_normalize_X(X_val_big)

model = gbr
y_pred_low = X_val_low.id * 0 # constant zero prediction
y_pred_big = model.predict(X_val_big)

with open("predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(y_pred_low):
        writer.writerow([str(X_val_low['id'].iloc[index]) , str(int(prediction))])
    for index, prediction in enumerate(y_pred_big):
        writer.writerow([str(X_id_big.iloc[index]) , str(int(prediction))])

class 0 : (274972, 11)
class 1 : (10362, 11)


In [269]:
model = lr
# X_val_big = eval_data with more than split retweets
# X_val_low = eval_data with less than split retweets
y_pred_low = X_val_low.id * 0 # constant zero prediction

y_pred_big = model.predict(X_val_big)

with open("predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(y_pred_low):
        writer.writerow([str(X_val_low['id'].iloc[index]) , str(int(prediction))])
    for index, prediction in enumerate(y_pred_big):
        writer.writerow([str(X_id_big.iloc[index]) , str(int(prediction))])

In [46]:
%%time
for lower_bound in range(4500, 6001, 100):
    for upper_bound in range(25000, 35002, 2000):
        X, X_big, X_low, y, y_big, y_low = prepare_data(lower_bound, upper_bound, tweets)
        # Applying linear regression
        from sklearn.linear_model import LinearRegression
        lr = LinearRegression(fit_intercept=False)
        lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
        lr = cv_worst(lr_estimators, X, y, verbose=False)
        print('[', lower_bound, ',', upper_bound, "] :", end='')
        error_true(lr, X_big, y_big, X_low, y_low)
    print('')

[ 4500 , 25000 ] :Prediction error on true datas : 113.04612505388441
[ 4500 , 27000 ] :Prediction error on true datas : 113.02099501785132
[ 4500 , 29000 ] :Prediction error on true datas : 112.95723342800967
[ 4500 , 31000 ] :Prediction error on true datas : 112.9805805847904
[ 4500 , 33000 ] :Prediction error on true datas : 113.0552737628365
[ 4500 , 35000 ] :Prediction error on true datas : 113.1783645274619

[ 4600 , 25000 ] :Prediction error on true datas : 112.97626983209092
[ 4600 , 27000 ] :Prediction error on true datas : 112.93824208406117
[ 4600 , 29000 ] :Prediction error on true datas : 112.8706969450732
[ 4600 , 31000 ] :Prediction error on true datas : 112.8860639523444
[ 4600 , 33000 ] :Prediction error on true datas : 112.9545208080183
[ 4600 , 35000 ] :Prediction error on true datas : 113.06285287716457

[ 4700 , 25000 ] :Prediction error on true datas : 112.90294648208034
[ 4700 , 27000 ] :Prediction error on true datas : 112.85180923943003
[ 4700 , 29000 ] :Predic

In [428]:
ransta = 0
lower_bound = 50
upper_bound = 2000
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=ransta)

Trying SGD regression with huber loss

In [429]:
%%time
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=1000, tol=1e-4, random_state=ransta, loss='huber')
sgd_estimators = cross_validate(sgd, X, y, return_estimator=True)['estimator']
sgd = cv_worst(sgd_estimators, X, y)
error_true(sgd, X_big, y_big, X_low, y_low)

Prediction error : 243.8984762540318
Prediction error on true datas : 139.82123894336993
Wall time: 5.93 s


Trying linear regression

In [430]:
%%time
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False)
lr_estimators = cross_validate(lr, X, y, return_estimator=True)['estimator']
lr = cv_worst(lr_estimators, X, y)
error_true(lr, X_big, y_big, X_low, y_low)

Prediction error : 276.52121009898786
Prediction error on true datas : 139.95671974249638
Wall time: 46.9 ms


Trying ridge regression : it does not improve the linear regression

In [431]:
%%time
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.5, fit_intercept=False, random_state=ransta)
ridge_estimators = cross_validate(ridge, X, y, return_estimator=True)['estimator']
ridge = cv_worst(ridge_estimators, X, y)
error_true(ridge, X_big, y_big, X_low, y_low)

Prediction error : 276.5223000778556
Prediction error on true datas : 139.9572649701026
Wall time: 63.3 ms


Trying bayesian ridge

In [432]:
%%time
from sklearn.linear_model import BayesianRidge
bay = BayesianRidge(fit_intercept=False)
bay_estimators = cross_validate(bay, X, y, return_estimator=True)['estimator']
bay = cv_worst(bay_estimators, X, y)
error_true(bay, X_big, y_big, X_low, y_low)

Prediction error : 276.52405739072407
Prediction error on true datas : 139.95749778078846
Wall time: 61.3 ms


Trying generalized linear regression Power. Distribution
0 : Normal.
1 : Poisson.
2 : Gamma.
3 : Inverse Gaussian.

In [433]:
%%time
from sklearn.linear_model import TweedieRegressor
gen = TweedieRegressor(power=2, alpha=1, link='log', fit_intercept=False)
gen_estimators = cross_validate(gen, X, y, return_estimator=True)['estimator']
gen = cv_worst(gen_estimators, X, y)
error_true(gen, X_big, y_big, X_low, y_low)

Prediction error : 279.8728061394728
Prediction error on true datas : 142.5287446096816
Wall time: 222 ms


Trying passive aggressive regression

In [434]:
%%time
from sklearn.linear_model import PassiveAggressiveRegressor
par = PassiveAggressiveRegressor(max_iter=100, random_state=ransta, tol=1e-3, fit_intercept=False)
par_estimators = cross_validate(par, X, y, return_estimator=True)['estimator']
par = cv_worst(par_estimators, X, y)
error_true(par, X_big, y_big, X_low, y_low)

Prediction error : 241.97146034923813
Prediction error on true datas : 139.22334204996568
Wall time: 141 ms


Trying gamma regression

In [435]:
%%time
from sklearn.linear_model import GammaRegressor
gam = GammaRegressor(alpha=0.001)
gam_estimators = cross_validate(gam, X, y, return_estimator=True)['estimator']
gam = cv_worst(gam_estimators, X, y)
error_true(gam, X_big, y_big, X_low, y_low)

Prediction error : 278.37459681904124
Prediction error on true datas : 140.09430785383094
Wall time: 191 ms


Trying poisson regression

In [436]:
%%time
from sklearn.linear_model import PoissonRegressor
poi = PoissonRegressor(alpha=0.001)
poi_estimators = cross_validate(poi, X, y, return_estimator=True)['estimator']
poi = cv_worst(poi_estimators, X, y)
error_true(poi, X_big, y_big, X_low, y_low)

Prediction error : 278.06869091313536
Prediction error on true datas : 139.99452669587563
Wall time: 222 ms


Trying k-nn regression : why does it fool the cross validation ??? Very sensitive to feature change

In [16]:
%%time
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=1)
knn_estimators = cross_validate(knn, X, y, return_estimator=True)['estimator']
knn = cv_worst(knn_estimators, X, y)
error_true(knn, X_big, y_big, X_low, y_low)
knn.fit(X_train, y_train)
p = knn.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))

Prediction error : 27.86419889992903
Prediction error on true datas : 144.75071532960737

Prediction error on a train / test split : 126.74427974716298
Wall time: 34 s


Trying logistic regression - I am not sure about what it does and how it works

In [17]:
%%time
from sklearn.linear_model import LogisticRegression
sample = 100
logi = LogisticRegression() # penalty='l1', solver='saga', random_state=0, fit_intercept=False)
logi.fit(X_train[:sample], y_train[:sample])
p = logi.predict(X_test).astype(int)
print("\nPrediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))
error_true(logi, X_big, y_big, X_low, y_low)


Prediction error on a train / test split : 111.02007171108565
Prediction error on true datas : 146.0392008134856
Wall time: 194 ms


Trying gradient boosting regression

In [437]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(loss='lad', criterion='friedman_mse', random_state=ransta)
gbr_estimators = cross_validate(gbr, X, y, return_estimator=True)['estimator']
gbr = cv_worst(gbr_estimators, X, y)
error_true(gbr, X_big, y_big, X_low, y_low)

Prediction error : 235.76716716716717
Prediction error on true datas : 138.66486526269307
Wall time: 12.4 s


Trying neural network (MLP regression)

In [19]:
%%time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
mlp = Sequential()
input_shape = X.iloc[0].shape
mlp.add(Dense(16, input_shape=input_shape, activation='relu'))
mlp.add(Dense(8, activation='relu'))
mlp.add(Dense(1, activation='linear'))
mlp.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])
mlp.fit(X_test, y_test, epochs=10, batch_size=10, verbose=0, validation_split=0.2)
p = mlp.predict(X_test).astype(int)
print("Prediction error on a train / test split :", mean_absolute_error(y_true=y_test, y_pred=p))
error_true(mlp, X_big, y_big, X_low, y_low)

Prediction error on a train / test split : 98.47406941928807
Prediction error on true datas : 143.59846915709014
Wall time: 41.8 s


Trying SVM regression (too long, but seems to have nice results)

In [20]:
%%time
from sklearn import svm
sample = 5000
svmr = svm.SVR(kernel='poly')
svmr_estimators = cross_validate(svmr, X[:sample], y[:sample], return_estimator=True)['estimator']
svmr = cv_worst(svmr_estimators, X, y)
error_true(svmr, X_big, y_big, X_low, y_low)

Prediction error : 97.06281050390348
Prediction error on true datas : 143.40786930158296
Wall time: 39.7 s


Trying random forest regression with mse

In [21]:
%%time
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=ransta, criterion='mse')
rf_estimators = cross_validate(rf, X, y, return_estimator=True)['estimator']
rf = cv_worst(rf_estimators, X, y)
rf_pred = (rf.predict(X_test)).astype(int)
error_true(rf, X_big, y_big, X_low, y_low)

Prediction error : 117.40592840667139
Prediction error on true datas : 144.85139018019547
Wall time: 31.8 s


Trying random forest regression with mae - poor implementation in sklearn with complexity in O(n^2)

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1, max_depth=1, random_state=ransta, criterion='mae')
rf_estimators = cross_validate(rf, X, y, return_estimator=True)['estimator']
rf = cv_worst(rf_estimators, X, y)
rf_pred = (rf.predict(X_test)).astype(int)
val, occ = np.unique(rf_pred, return_counts=True)
print('\n', val, "with respecting occurences", occ)
error_true(rf, X_big, y_big, X_low, y_low)

In [106]:
#X.insert(3, 'stat_sqr', X.user_statuses_count ** 2, True)
#X.insert(4, 'followers_sqr', X.user_followers_count ** 2, True)
#X.insert(5, 'verified_exp', np.exp(X.user_verified), True)

#X_big.insert(3, 'stat_sqr', X_big.user_statuses_count ** 2, True)
#X_big.insert(4, 'followers_sqr', X_big.user_followers_count ** 2, True)
#X_big.insert(5, 'verified_exp', np.exp(X_big.user_verified), True)