# Simon-NN

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor # gradient boosting regression
from verstack.stratified_continuous_split import scsplit
from sklearn import preprocessing

import csv
import numpy as np

import matplotlib.pylab as plt
import os # to set the right working directory
from tqdm import tqdm # to use convenient progress bars

from datetime import datetime

## Import pre-processed data

In [4]:
# Load the pre-processed training and evaluation frames from their pickles.
# NOTE(review): read_pickle can execute arbitrary code; only load files that
# were produced by this project's own preprocessing step.
train_data, eval_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../Preprocessing/Data/train_processed.pkl",
        "../Preprocessing/Data/eval_processed.pkl",
    )
)

## Scaling features

In [6]:
# Standardise the count/length features to zero mean and unit variance.
# The scaler is fitted on the training frame only, and the *same* mean/std are
# then applied to the evaluation frame. Scaling each frame with its own
# statistics would map identical raw values to different model inputs in the
# two sets.
SCALED_COLUMNS = [
    'user_followers_count',
    'user_friends_count',
    'user_statuses_count',
    'text_length',
]

feature_scaler = preprocessing.StandardScaler()
train_data[SCALED_COLUMNS] = feature_scaler.fit_transform(train_data[SCALED_COLUMNS])
eval_data[SCALED_COLUMNS] = feature_scaler.transform(eval_data[SCALED_COLUMNS])

In [8]:
# Stratified 70/30 hold-out split on the continuous retweet_count target.
# A fixed random_state makes the split, and every metric computed on it,
# repeatable across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = scsplit(
    train_data,
    train_data['retweet_count'],
    stratify=train_data['retweet_count'],
    train_size=0.7,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

## Selecting relevant features

In [9]:
# Model inputs. The column list lives in one place so the train and test
# frames are guaranteed to carry the same features in the same order.
FEATURE_COLUMNS = [
    'user_statuses_count', 'hashtag_count', 'user_mentions_count',
    'user_followers_count', 'user_friends_count', 'user_verified',
    'text_length', 'hour', 'week_day', 'day',
]
X_train = X_train[FEATURE_COLUMNS]
X_test = X_test[FEATURE_COLUMNS]

## Defining and training model on X_train

In [10]:
def build_simon(learning_rate=0.001):
    """Build and compile the "Simon" feed-forward regression network.

    The network stacks three ReLU hidden layers (64, 64 and 32 units) followed
    by a single linear output unit. It is compiled with the RMSprop optimizer
    and mean absolute error as the loss. MAE and MSE are also registered as
    metrics, which is why callers unpack ``evaluate`` as ``loss, mae, mse``.

    Parameters
    ----------
    learning_rate : float, optional
        Learning rate handed to RMSprop. Defaults to 0.001, the value this
        notebook has always used.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model. No input shape is declared here.
    """
    model = tf.keras.Sequential([
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),  # linear output for the regression target
    ])

    optimizer = tf.keras.optimizers.RMSprop(learning_rate)

    model.compile(loss='mae',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

# Seed TensorFlow's global random generator so that repeated runs of this cell
# start from the same weight initialisation.
tf.random.set_seed(42)

simon = build_simon()

EPOCHS = 100

# Stop once val_loss has not improved for `patience` consecutive epochs.
# restore_best_weights rolls the model back to its best epoch when stopping
# triggers; without it the model keeps the weights of the last epoch it ran.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                              restore_best_weights=True)

early_history = simon.fit(X_train, y_train,
                          epochs=EPOCHS, validation_split=0.2, verbose=2,
                          callbacks=[early_stop], batch_size=1000)

# evaluate() returns the loss followed by the compiled metrics (mae, mse).
loss, mae, mse = simon.evaluate(X_test, y_test, verbose=2)

# The target is retweet_count, so the error is expressed in retweets.
print("Testing set Mean Abs Error: {:5.2f} retweets".format(mae))

Epoch 1/100


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

373/373 - 1s - loss: 144.5935 - mae: 144.5935 - mse: 7907016.5000 - val_loss: 149.8457 - val_mae: 149.8457 - val_mse: 8758844.0000
Epoch 2/100
373/373 - 0s - loss: 143.7017 - mae: 143.7017 - mse: 7881909.0000 - val_loss: 149.2675 - val_mae: 149.2675 - val_mse: 8743669.0000
Epoch 3/100
373/373 - 0s - loss: 143.0706 - mae: 143.0706 - mse: 7861635.0000 - val_loss: 148.6145 - val_mae: 148.6145 - val_mse: 8714088.0000
Epoch 4/100
373/373 - 0s - loss: 142.4017 - mae: 142.4017 - mse: 7840991.5000 - val_loss: 148.1813 - val_mae: 148.1813 - val_mse: 8693237.0000
Epoch 5/100
373/373 - 0s - loss: 141.7308 - mae: 141.7308 - mse: 7816236.5000 - val_loss: 147.5374 - val_mae: 147.5374 - val_ms

373/373 - 1s - loss: 139.1365 - mae: 139.1365 - mse: 7692858.5000 - val_loss: 145.0128 - val_mae: 145.0128 - val_mse: 8516425.0000
Epoch 54/100
373/373 - 1s - loss: 139.0830 - mae: 139.0830 - mse: 7693213.5000 - val_loss: 145.0228 - val_mae: 145.0228 - val_mse: 8534826.0000
Epoch 55/100
373/373 - 1s - loss: 139.0788 - mae: 139.0788 - mse: 7691302.0000 - val_loss: 145.0635 - val_mae: 145.0635 - val_mse: 8506381.0000
Epoch 56/100
373/373 - 1s - loss: 139.0619 - mae: 139.0619 - mse: 7688167.5000 - val_loss: 144.9254 - val_mae: 144.9254 - val_mse: 8509805.0000
Epoch 57/100
373/373 - 1s - loss: 139.0565 - mae: 139.0565 - mse: 7687860.0000 - val_loss: 144.8907 - val_mae: 144.8907 - val_mse: 8509905.0000
Epoch 58/100
373/373 - 1s - loss: 139.0062 - mae: 139.0062 - mse: 7691004.5000 - val_loss: 144.9101 - val_mae: 144.9101 - val_mse: 8498605.0000
Epoch 59/100
373/373 - 1s - loss: 138.9944 - mae: 138.9944 - mse: 7689881.5000 - val_loss: 144.8981 - val_mae: 144.8981 - val_mse: 8489138.0000
Epoch

## Training Simon on the whole train_data

In [14]:
# Model inputs. The column list lives in one place so the training and
# evaluation frames are guaranteed to carry the same features in the same order.
FEATURE_COLUMNS = [
    'user_statuses_count', 'hashtag_count', 'user_mentions_count',
    'user_followers_count', 'user_friends_count', 'user_verified',
    'text_length', 'hour', 'week_day', 'day',
]
train_data_NN = train_data[FEATURE_COLUMNS]
y_train_data_NN = train_data['retweet_count']
eval_data_NN = eval_data[FEATURE_COLUMNS]

In [15]:
# build_simon() is already defined in the "Defining and training model on
# X_train" section earlier in this notebook. It is reused here instead of being
# redefined, so the hold-out model and the final model cannot drift apart.

# Seed TensorFlow's global random generator so that repeated runs of this cell
# start from the same weight initialisation.
tf.random.set_seed(42)

simon = build_simon()

EPOCHS = 100

# Stop once val_loss has not improved for `patience` consecutive epochs.
# restore_best_weights rolls the model back to its best epoch when stopping
# triggers; without it the model used for the final predictions keeps the
# weights of the last epoch it ran.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                              restore_best_weights=True)

# NOTE(review): validation_split=0.2 still holds 20% of the rows out of the
# gradient updates to drive early stopping, so the fit uses the other 80%.
early_history = simon.fit(train_data_NN, y_train_data_NN,
                          epochs=EPOCHS, validation_split=0.2, verbose=2,
                          callbacks=[early_stop], batch_size=1000)

Epoch 1/100


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

533/533 - 1s - loss: 143.5820 - mae: 143.5820 - mse: 6972210.0000 - val_loss: 156.3913 - val_mae: 156.3913 - val_mse: 16270606.0000
Epoch 2/100
533/533 - 1s - loss: 142.4876 - mae: 142.4876 - mse: 6941578.5000 - val_loss: 155.3657 - val_mae: 155.3657 - val_mse: 16232848.0000
Epoch 3/100
533/533 - 1s - loss: 141.4897 - mae: 141.4897 - mse: 6907235.0000 - val_loss: 154.3292 - val_mae: 154.3292 - val_mse: 16206430.0000
Epoch 4/100
533/533 - 1s - loss: 140.5910 - mae: 140.5910 - mse: 6875303.0000 - val_loss: 153.4673 - val_mae: 153.4673 - val_mse: 16125968.0000
Epoch 5/100
533/533 - 1s - loss: 140.0446 - mae: 140.0446 - mse: 6841524.0000 - val_loss: 153.0930 - val_mae: 153.0930 - va

Epoch 53/100
533/533 - 1s - loss: 138.4366 - mae: 138.4366 - mse: 6775774.0000 - val_loss: 151.9141 - val_mae: 151.9141 - val_mse: 16052038.0000
Epoch 54/100
533/533 - 1s - loss: 138.4596 - mae: 138.4596 - mse: 6774290.5000 - val_loss: 151.9391 - val_mae: 151.9391 - val_mse: 16055634.0000
Epoch 55/100
533/533 - 1s - loss: 138.4153 - mae: 138.4153 - mse: 6774765.0000 - val_loss: 151.9732 - val_mae: 151.9732 - val_mse: 16072205.0000
Epoch 56/100
533/533 - 1s - loss: 138.3929 - mae: 138.3929 - mse: 6769508.0000 - val_loss: 151.9838 - val_mae: 151.9838 - val_mse: 16065477.0000
Epoch 57/100
533/533 - 1s - loss: 138.4044 - mae: 138.4044 - mse: 6770286.0000 - val_loss: 152.0426 - val_mae: 152.0426 - val_mse: 16039242.0000
Epoch 58/100
533/533 - 1s - loss: 138.3874 - mae: 138.3874 - mse: 6771039.5000 - val_loss: 151.8826 - val_mae: 151.8826 - val_mse: 16059638.0000
Epoch 59/100
533/533 - 1s - loss: 138.3820 - mae: 138.3820 - mse: 6773200.0000 - val_loss: 152.0148 - val_mae: 152.0148 - val_mse:

## Predicting #retweets for evaluation set

In [17]:
# The network ends in an unconstrained linear unit, so it can emit negative
# values. A retweet count cannot be negative, so the predictions are floored
# at 0. np.clip keeps the array shape returned by predict().
y_pred = np.clip(simon.predict(eval_data_NN), 0, None)

## Exporting to .txt

In [206]:
# Export is currently disabled: the code below is kept commented out.
# When enabled it writes a CSV with a "TweetID,NoRetweets" header followed by
# one row per entry of y_pred: the tweet id read from eval_data['id'] at the
# same position, and the prediction truncated to an integer with int().
# NOTE(review): "FILE_NAME.txt" looks like a placeholder; set the real output
# name before uncommenting.
#with open("FILE_NAME.txt", 'w') as f:
#    writer = csv.writer(f)
#    writer.writerow(["TweetID", "NoRetweets"])
#    for index, prediction in enumerate(y_pred):
#        writer.writerow([str(eval_data['id'].iloc[index]) , str(int(prediction))])