In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from datetime import datetime

In [2]:
# tweets: training data (its retweet_count column is used later as the target).
# eval_ds: evaluation data, predicted at the end to build the submission file.
tweets, eval_ds = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/evaluation.csv')
)

Cleaning the data

In [3]:
def day(t, tz=None):
    """Return the day of the month (1-31) for a timestamp in milliseconds.

    Parameters
    ----------
    t : int or float
        Milliseconds since the Unix epoch.
    tz : datetime.tzinfo, optional
        Time zone used for the conversion. ``None`` (the default) converts to
        the local time of the machine running the code, so the result can
        differ from one machine to another; pass ``datetime.timezone.utc``
        (or any fixed zone) to get a machine-independent value.

    Returns
    -------
    int
        Day of the month of the converted date.
    """
    # fromtimestamp expects seconds, hence the division by 1000.
    return datetime.fromtimestamp(t / 1000, tz).day

def clean(df):
    """Return a cleaned copy of ``df``; the frame passed in is not modified.

    Column by column:

    - ``timestamp`` is replaced by its day of the month (see ``day``).
    - ``user_verified`` becomes 0/1, missing values counting as 0.
    - ``user_statuses_count``, ``user_followers_count`` and
      ``user_friends_count`` get 0 in place of missing values.
    - ``user_mentions`` and ``hashtags`` become 0/1 "is non-empty" flags.
    - ``text`` gets the empty string in place of missing values.
    - ``urls`` becomes 1 when the urls field is non-empty or the text
      contains 'http', and 0 otherwise.
    - ``id`` is removed.
    """
    X = df.copy()
    X['timestamp'] = X['timestamp'].apply(day)
    X['user_verified'] = X['user_verified'].fillna(False).astype(int)
    for count_column in ('user_statuses_count', 'user_followers_count', 'user_friends_count'):
        X[count_column] = X[count_column].fillna(0)
    X['user_mentions'] = X['user_mentions'].fillna('').apply(lambda mentions: int(bool(mentions)))
    X['hashtags'] = X['hashtags'].fillna('').astype(bool).astype(int)
    X['text'] = X['text'].fillna('')
    X['urls'] = X['urls'].fillna('')
    # Add up the two URL signals per row, then cap the total at 1 so the
    # column stays a plain 0/1 flag.
    url_signals = (
        X['urls'].apply(lambda urls: int(bool(urls)))
        + X['text'].apply(lambda text: int('http' in text))
    )
    X['urls'] = url_signals.apply(lambda total: 1 if total == 2 else total)
    return X.drop('id', axis=1)

Insert one 0/1 column for each category of words that get a high number of retweets

In [304]:
def insert_or(words, title, df):
    """Add a 0/1 column to ``df`` flagging rows whose text contains any word.

    The column is inserted in place at position 0. A row gets 1 when at least
    one entry of ``words`` occurs as a substring of its lower-cased ``text``
    value, and 0 otherwise. Rows whose text is not a string (for example NaN)
    get 0. If ``df`` already has a column named ``title``, nothing is changed.

    Parameters
    ----------
    words : sequence of str
        Substrings to look for. They are compared as given against the
        lower-cased text, so they should be lower case themselves.
    title : str
        Name of the column to insert.
    df : pandas.DataFrame
        Frame with a ``text`` column; modified in place.
    """
    if title in df.columns:
        return

    def mentions_any(text):
        if not isinstance(text, str):
            return 0
        lowered = text.lower()  # lower-case once per row, not once per word
        # any() stops at the first hit instead of building a throw-away list
        return int(any(word in lowered for word in words))

    df.insert(0, title, df.text.apply(mentions_any))
        
def insert_cat(df):
    """Return a copy of ``df`` with one 0/1 keyword column per word category.

    Each category is handed to ``insert_or`` in the order listed below. Since
    every new column is inserted at position 0, the columns end up in the
    reverse of that order at the front of the frame.
    """
    categories = (
        ('trump', ['president', 'trump', 'donald', 'dt']),
        ('virus', ['virus', 'corona', 'covid', 'sick', 'flu']),
        ('doctor', ['doctor', 'nurse', 'kid', 'hospital', 'administration', 'medical', 'save']),
        ('america', ['america', 'democrat']),
        ('china', ['chin', 'wuhan']),
        ('handwash', ['hand', 'wash']),
    )
    X = df.copy()
    for title, keywords in categories:
        insert_or(keywords, title, X)
    return X

def insert_big_cat(df):
    """Return a copy of ``df`` with a single 0/1 ``words`` column.

    The column is produced by ``insert_or`` from one combined keyword list
    (all the keyword groups below put together) instead of one column per
    group.
    """
    keywords = (
        ['president', 'trump', 'donald', 'dt']
        + ['virus', 'corona', 'covid', 'sick', 'flu']
        + ['doctor', 'nurse', 'kid', 'hospital', 'administration', 'medical', 'save']
        + ['america', 'democrat']
        + ['chin', 'wuhan']
        + ['hand', 'wash']
    )
    X = df.copy()
    insert_or(keywords, 'words', X)
    return X

def transform(df):
    """Build the feature frame from a raw tweets frame.

    Runs ``clean`` and then ``insert_cat`` on a copy of ``df``. If the result
    still has a ``text`` column, it is swapped for ``words_count``: the number
    of pieces obtained by splitting the text on single spaces, capped at 140,
    inserted at position 0.
    """
    features = insert_cat(clean(df.copy()))
    if 'text' not in features.columns:
        return features
    piece_counts = features.text.apply(lambda text: min(140, len(text.split(' '))))
    features.insert(0, 'words_count', piece_counts)
    return features.drop('text', axis=1)

In [305]:
# Split the target off the training data, then run both data sets through the
# same feature pipeline. tweets and eval_ds themselves are left as loaded.
y = tweets['retweet_count'].copy()
X = transform(tweets.drop(columns='retweet_count'))
Z = transform(eval_ds.copy())
X.head()

Unnamed: 0,words_count,handwash,china,america,doctor,virus,trump,timestamp,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags
0,4,0,0,0,0,0,0,5,0,68460,1101,1226,0,0,0
1,14,0,0,0,0,0,0,3,0,309,51,202,0,0,0
2,21,0,0,0,0,0,1,5,0,3241,1675,2325,0,0,0
3,16,0,0,0,0,0,0,2,0,32327,667,304,0,0,0
4,15,0,0,0,0,0,0,4,0,581,42,127,0,0,0


In [306]:
# Submission setup: fit on every training tweet and predict the evaluation set.
# For a local validation run, build the frames from a hold-out split instead
# (this is the only place y_test gets defined):
#   X_train, X_test, y_train, y_test = train_test_split(X.copy(), y.copy(), test_size=0.3, random_state=3)
X_train, y_train = X.copy(), y.copy()
X_test = Z.copy()

# Take the timestamp (day of month) column out of both feature frames and keep
# it on the side.
X_train_day = X_train.pop('timestamp')
X_test_day = X_test.pop('timestamp')

# (A disabled experiment also dropped 'user_mentions' from both frames here.)

# Inclusive upper bounds of retweet classes 0, 1 and 2; counts above t3 are class 3.
t1, t2, t3 = 10, 600, 3000


def _retweet_class(count):
    """Map a retweet count to its class index (0-3) using t1, t2 and t3."""
    if count <= t1:
        return 0
    if count <= t2:
        return 1
    if count <= t3:
        return 2
    return 3


y_train_class = y_train.apply(_retweet_class)

In [307]:
%%time
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=2, n_estimators=300)
rfc.fit(X_train, y_train_class)
X_test_class = rfc.predict(X_test)

Wall time: 5min 41s


In [308]:
# The regressor is trained only on tweets whose retweet count lies in
# [lower_bound, upper_bound).
lower_bound, upper_bound = 1, 20000
in_reg_range = (y_train >= lower_bound) & (y_train < upper_bound)
X_train_reg = X_train[in_reg_range]
y_train_reg = y_train[in_reg_range]

In [309]:
%%time
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=2, n_estimators=300)
rfr.fit(X_train_reg, y_train_reg)
y_pred = rfr.predict(X_test)
tmp = y_pred.copy()

Wall time: 6min 34s


In [310]:
# Start again from the raw regressor output so this cell can be re-run safely.
y_pred = tmp.copy()

# Rows predicted as class 0 get a prediction of exactly 0.
y_pred[X_test_class == 0] = 0

# For the other classes the raw prediction is rescaled as raw ** exponent / divisor.
# class label -> (exponent, divisor)
class_rescaling = {1: (0.94, 4.1), 2: (1.16, 6.2), 3: (1.26, 10.1)}
for class_label, (exponent, divisor) in class_rescaling.items():
    in_class = X_test_class == class_label
    y_pred[in_class] = (tmp[in_class] ** exponent) / divisor

In [303]:
# Score the predictions against held-out labels.
# y_test only exists when the train/test split is used instead of the
# submission setup, so the scoring is skipped (rather than failing with a
# NameError) when there are no matching labels to compare with. The length
# check also guards against a stale y_test left over from an earlier run.
if 'y_test' in globals() and len(y_test) == len(y_pred):
    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
    print("Prediction error on true datas :", mae)
    # Baseline: predict zero retweets for everything. zeros_like (instead of
    # y_pred*0) stays exactly 0 even if a prediction is inf or NaN.
    zero = mean_absolute_error(y_true=y_test, y_pred=np.zeros_like(y_pred))
    print("Constant zero prediction :", zero)
    print(zero - mae)  # gain over the zero baseline (an earlier run recorded 8.54137)
else:
    print("No held-out y_test matching y_pred: evaluation skipped.")

Prediction error on true datas : 142.6138791265264
Constant zero prediction : 151.46727647771536
8.85339735118896


Recorded score 8.573968836720809, obtained with: class thresholds 10 / 600 / 3000; regression upper / lower bounds 20 000 / 1; divisors 6.1 / 1.8 / 1.07; n_estimators = 120

In [296]:
# Grid search for the class-3 rescaling (raw ** i / j) that maximises the gain
# over the constant-zero baseline on the held-out labels.
# Each candidate is built in its own array: y_pred is NOT modified here, so
# the predictions written to the submission file are never replaced by the
# last grid point.
# Needs y_test, which only exists when the train/test split is used; the
# search is skipped otherwise instead of failing with a NameError.
if 'y_test' in globals() and len(y_test) == len(tmp):
    zero = mean_absolute_error(y_true=y_test, y_pred=np.zeros_like(tmp))

    # Classes 0-2 use fixed settings, so that part is computed once.
    base_pred = tmp.copy()
    base_pred[X_test_class == 0] = 0
    base_pred[X_test_class == 1] = (tmp[X_test_class == 1]**0.94) / 4.1
    base_pred[X_test_class == 2] = tmp[X_test_class == 2]**1.16 / 6.2
    class3 = X_test_class == 3
    raw_class3 = tmp[class3]

    score = 0
    best_i, best_j = 0, 0
    for i in np.linspace(0.1, 2.1, 101):
        powered = raw_class3**i  # depends on the exponent only
        for j in np.linspace(0.1, 10.1, 101):
            candidate = base_pred.copy()
            candidate[class3] = powered / j
            mae = mean_absolute_error(y_true=y_test, y_pred=candidate)
            new_score = zero - mae
            if new_score > score:
                score, best_i, best_j = new_score, i, j
    print("best exponent:", best_i, "best divisor:", best_j, "score:", score)
else:
    print("No held-out y_test matching the predictions: grid search skipped.")

In [315]:
def write_prediction_file(eval_data, y_pred, path="random_forests.txt"):
    """Write the predictions as a two-column CSV submission file.

    The file has a ``TweetID,NoRetweets`` header followed by one row per
    tweet: the value of ``eval_data['id']`` and the matching prediction
    truncated to an integer with ``int()``.

    Parameters
    ----------
    eval_data : pandas.DataFrame
        Frame with an ``id`` column, in the same row order as ``y_pred``.
    y_pred : iterable of numbers
        One prediction per row of ``eval_data``.
    path : str, optional
        Output file; defaults to ``"random_forests.txt"``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids. Nothing
        is written in that case, so no truncated file is left behind.
    """
    tweet_ids = list(eval_data['id'])
    predictions = list(y_pred)
    if len(tweet_ids) != len(predictions):
        raise ValueError(
            "got %d predictions for %d tweets" % (len(predictions), len(tweet_ids))
        )
    # newline='' is required by the csv module; without it every row ends in
    # '\r\r\n' on Windows, which shows up as blank lines between rows.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        for tweet_id, prediction in zip(tweet_ids, predictions):
            writer.writerow([str(tweet_id), str(int(prediction))])

In [316]:
# Save the current y_pred for the evaluation tweets as the submission file.
write_prediction_file(eval_data=eval_ds, y_pred=y_pred)