In [2]:
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

### 1. Import Pre-processed data

In [3]:
n_features = 20000
n_components = 800
n_clusters = 7

In [4]:
train_title = f'train_text_{n_features:0}_{n_components:0}_{n_clusters:0}.pkl'
train_title

'train_text_20000_800_7.pkl'

In [5]:
eval_title = f'eval_text_{n_features:0}_{n_components:0}_{n_clusters:0}.pkl'
eval_title

'eval_text_20000_800_7.pkl'

In [7]:
clusters_train = pd.read_pickle("Data/"+train_title)
clusters_eval = pd.read_pickle("Data/"+eval_title)

In [10]:
clusterised_train_data = pd.read_pickle("../Preprocessing/Data/train_processed.pkl")
clusterised_eval_data = pd.read_pickle("../Preprocessing/Data/eval_processed.pkl")

In [12]:
clusterised_train_data['text_cluster'] = clusters_train
clusterised_eval_data['text_cluster'] = clusters_eval

### 2. Clusterised training and test

In [35]:
for i in range(n_clusters):
    X_i = clusterised_train_data[clusterised_train_data['text_cluster'] == i]
    #X_i = X_i[X_i['url_count']>0]
    
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_i, X_i['retweet_count'], train_size=0.7, test_size=0.3)
    
    # one only keeps the most relevant feature for regression
    X_train_i = X_train_i[['user_followers_count', 'user_friends_count', 'hashtag_count', 'text_length']]
    X_test_i = X_test_i[['user_followers_count', 'user_friends_count', 'hashtag_count', 'text_length']]
    
    # train regressor
    reg = GradientBoostingRegressor()
    reg.fit(X_train_i, y_train_i)
    
    # make a prediction    
    y_pred_i = reg.predict(X_test_i)
    print("Prediction error for cluster",i,":", mean_absolute_error(y_true=y_test_i, y_pred=y_pred_i))   

Prediction error for cluster 0 : 202.5680549542123
Prediction error for cluster 1 : 169.31596950713924
Prediction error for cluster 2 : 233.4264398566784
Prediction error for cluster 3 : 247.9731750414504
Prediction error for cluster 4 : 27.202168951233038
Prediction error for cluster 5 : 181.03776218820607


### 3. Train on whole data set

In [37]:
models = []
for i in range(n_clusters):
    X_i = clusterised_train_data[clusterised_train_data['text_cluster'] == i]
    y_i = X_i['retweet_count']
    
    # one only keeps the most relevant feature for regression
    X_i = X_i[['user_followers_count', 'user_friends_count', 'hashtag_count', 'text_length']]
    
    
    # train regressor
    models.append(GradientBoostingRegressor())
    models[i].fit(X_i, y_i)