In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from datetime import datetime

In [2]:
# Load the training data
tweet_dataset = pd.read_csv("data/train.csv")

In [3]:
tweet_dataset.head(1)

Unnamed: 0,id,timestamp,retweet_count,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags,text
0,0,1588696955143,0,False,68460,1101,1226,,,,Smh I give up


In [4]:
X = tweet_dataset[['timestamp', 'user_verified', 'user_followers_count', 'user_friends_count']].copy()
y = tweet_dataset['retweet_count'].copy()#

# Converting the True / False values of user_verified into 1 / 0
X['user_verified'] = (X['user_verified']).astype(int)

# Converting timestamp in hour
def timestamp_13_digits_to_mins(t):
    dt = datetime.fromtimestamp(t / 1000)
    return dt.hour * 60 + dt.minute

X.timestamp = X.timestamp.apply(timestamp_13_digits_to_mins)

# Adding extra features

# size of the text
X.insert(0, 'size_text', tweet_dataset.text.apply(lambda x: len(x)), True)

# number of hashtags
def count_hashtags_in_text(text):
    return text.count('#')

def count_hashtags(hashtags):
    if type(hashtags) is str:
        return len(hashtags.split(','))
    return 0

X.insert(0, 'hashtag_number', tweet_dataset.text.apply(count_hashtags_in_text, True))

# number of mentions
def count_mentions_in_text(text):
    return text.count('@')

X.insert(0, 'mention_number', tweet_dataset.text.apply(count_mentions_in_text, True))

X.head(3)

Unnamed: 0,mention_number,hashtag_number,size_text,timestamp,user_verified,user_followers_count,user_friends_count
0,0,0,13,1122,0,1101,1226
1,0,0,69,135,0,51,202
2,0,0,99,84,0,1675,2325


In [5]:
corr_features1 = [(f, spearmanr(X[f], tweet_dataset.retweet_count).correlation) for f in X.columns]
corr_features1.sort(key=lambda x: abs(x[1]))
corr_features1.reverse()
for f, c in corr_features1:
    print(f, c)

size_text 0.718833034104086
hashtag_number 0.5610912442719814
user_followers_count 0.5571158977128619
user_verified 0.48795218772539445
mention_number 0.4531966426675053
user_friends_count 0.14457479227322048
timestamp -0.045878260587928786


In [6]:
corr_features2 = [(f, X[f].corr(tweet_dataset.retweet_count)) for f in X.columns]
corr_features2.sort(key=lambda x: abs(x[1]))
corr_features2.reverse()
for f, c in corr_features2:
    print(f, c)

user_followers_count 0.13285282331306936
user_verified 0.05792252371025388
size_text 0.03288040359219733
user_friends_count 0.025327227979688026
timestamp -0.0065666916824623734
mention_number -0.0036770093961248163
hashtag_number -0.003628897027931208


In [7]:
# Center and normalize the datas
X = X - X.mean()
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X.head(1)

Unnamed: 0,mention_number,hashtag_number,size_text,timestamp,user_verified,user_followers_count,user_friends_count
0,0.0,0.0,0.01217,0.779708,0.0,9e-06,0.000279


In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Trying SGD regression

In [9]:
reg = SGDRegressor(max_iter=1000, tol=1e-4)
%time reg.fit(X_train, y_train)

Wall time: 6.85 s


SGDRegressor(tol=0.0001)

In [10]:
y_pred = reg.predict(X_test)
y_pred = y_pred.astype(int)
print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
reg.score(X_train, y_train)

Prediction error: 258.07053881662614


0.018111863469909673

Trying linear regression

In [11]:
lin_reg = LinearRegression()
%time lin_reg.fit(X_train, y_train)

Wall time: 63.7 ms


LinearRegression()

In [12]:
y_pred = lin_reg.predict(X_test)
y_pred = y_pred.astype(int)
print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
lin_reg.score(X_train, y_train)

Prediction error: 246.60315719907476


0.019040868316518855

In [13]:
%%time
for k in range(1, len(corr_features1) + 1):
    to_remove1 = corr_features1[k:]
    to_remove2 = corr_features2[k:]
    X1, X2 = X.copy(), X.copy()
    for f, _ in to_remove1:
        X1.drop(f, axis=1, inplace=True)
    for f, _ in to_remove2:
        X2.drop(f, axis=1, inplace=True)
    # Center and normalize the datas
    X1 = X1 - X1.mean()
    scaler = MinMaxScaler()
    X1 = pd.DataFrame(scaler.fit_transform(X1), columns=X1.columns)
    
    X2 = X2 - X2.mean()
    scaler = MinMaxScaler()
    X2 = pd.DataFrame(scaler.fit_transform(X2), columns=X2.columns)
    
    # Train test split
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.3, random_state=1)
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.3, random_state=1)
    
    reg = SGDRegressor(max_iter=1000, tol=1e-4)
    reg.fit(X1_train, y1_train)
    y_pred = reg.predict(X1_test)
    y_pred = y_pred.astype(int)
    print("Prediction error spearman for top", k, "features with SGD :", mean_absolute_error(y_true=y1_test, y_pred=y_pred))
    
    lin_reg = LinearRegression()
    lin_reg.fit(X1_train, y1_train)
    y_pred = lin_reg.predict(X1_test)
    y_pred = y_pred.astype(int)
    print("Prediction error spearman for top", k, "features with LinReg :", mean_absolute_error(y_true=y1_test, y_pred=y_pred))
    
    reg = SGDRegressor(max_iter=1000, tol=1e-4)
    reg.fit(X2_train, y2_train)
    y_pred = reg.predict(X2_test)
    y_pred = y_pred.astype(int)
    print("Prediction error pearson for top", k, "features with SGD :", mean_absolute_error(y_true=y2_test, y_pred=y_pred))
    
    lin_reg = LinearRegression()
    lin_reg.fit(X2_train, y2_train)
    y_pred = lin_reg.predict(X2_test)
    y_pred = y_pred.astype(int)
    print("Prediction error pearson for top", k, "features with LinReh", mean_absolute_error(y_true=y2_test, y_pred=y_pred), '\n')

Prediction error spearman for top 1 features with SGD : 268.293840808275
Prediction error spearman for top 1 features with LinReg : 263.4367308520332
Prediction error pearson for top 1 features with SGD : 258.1407071404969
Prediction error pearson for top 1 features with LinReh 252.6057306217269 

Prediction error spearman for top 2 features with SGD : 249.87802277028447
Prediction error spearman for top 2 features with LinReg : 265.5086915597745
Prediction error pearson for top 2 features with SGD : 241.3662020487248
Prediction error pearson for top 2 features with LinReh 247.2537725174482 

Prediction error spearman for top 3 features with SGD : 261.18100573763104
Prediction error spearman for top 3 features with LinReg : 249.2994382528763
Prediction error pearson for top 3 features with SGD : 249.51422892446953
Prediction error pearson for top 3 features with LinReh 244.917124775952 

Prediction error spearman for top 4 features with SGD : 218.06041535241872
Prediction error spearma

In [14]:
def write_prediction_file(eval_data, y_pred):
    with open("gbr_predictions.txt", 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        for index, prediction in enumerate(y_pred):
            writer.writerow([str(eval_data['id'].iloc[index]) , str(int(prediction))])