In [1]:
import pandas as pd
import tensorflow.compat.v1 as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
"""
TwitterSEISMIC Dataset
  <relative_time_second>: relative post time of the tweet/retweet (in second)
  <number_of_followers>: number of followers of the user who tweets/retweets
"""
data = pd.read_csv("data.csv")
data

Unnamed: 0,relative_time_second,number_of_followers
0,0.0,33.0
1,84833.0,46828.0
2,84878.0,208.0
3,84883.0,37.0
4,84900.0,137.0
...,...,...
34784483,103297.0,110.0
34784484,108205.0,24.0
34784485,109345.0,151.0
34784486,158677.0,391.0


In [3]:
"""
TwitterSEISMIC Index
  <tweet_id>: id of the original tweet
  <post_time_day>: post time (UTC) of the original tweet (in day)
  <start_ind>: the first row in data.csv of this tweet
  <end_ind>: the last row in data.csv of this tweet 
"""
from datetime import datetime
index = pd.read_csv("index.csv")
index['post_date'] = pd.to_datetime(index['post_time_day'], unit='D', origin=pd.Timestamp('2011-10-07'))
index['total_retweets_over_15_days'] = index["end_ind"] - index["start_ind"]
index.shape

(166076, 6)

In [9]:
pd.options.display.float_format = lambda x : '{:.5f}'.format(x)
data.describe()

Unnamed: 0,relative_time_second,number_of_followers
count,34784488.0,34784488.0
mean,39158.58904,2774.20607
std,89030.8985,64880.14996
min,0.0,0.0
25%,334.0,50.0
50%,3209.0,120.0
75%,29312.0,286.0
max,604799.0,14755952.0


In [5]:
"""
Identify cascades in the network
    The top 2% of tweets, ranked by their final retweet count, were selected as outbreak tweets, and the
    minimum final retweet count among those outbreak tweets was taken as the outbreak threshold.
    This gives 3131 outbreak tweets with an outbreak threshold of 1000.
"""
index['outbreak'] = index['total_retweets_over_15_days'] > 1000
total_tweets_s =  index # index.sort_values(by='total_retweets_over_15_days', ascending = False)
total_tweets_s.groupby('outbreak').count()

Unnamed: 0_level_0,tweet_id,post_time_day,start_ind,end_ind,post_date,total_retweets_over_15_days
outbreak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,162945,162945,162945,162945,162945,162945
True,3131,3131,3131,3131,3131,3131


In [6]:

total_tweets_s.describe()

Unnamed: 0,tweet_id,post_time_day,start_ind,end_ind,total_retweets_over_15_days
count,166076.0,166076.0,166076.0,166076.0,166076.0
mean,1.250288e+17,8.085091,17457850.0,17458050.0,208.449216
std,1492892000000000.0,4.119661,10111260.0,10111260.0,396.576751
min,1.22206e+17,0.294514,1.0,175.0,49.0
25%,1.23759e+17,4.580842,8740974.0,8741077.0,70.0
50%,1.25e+17,8.006973,17283150.0,17283240.0,110.0
75%,1.26374e+17,11.797674,26146000.0,26146470.0,219.0
max,1.27535e+17,14.999873,34784410.0,34784490.0,33484.0


In [7]:
total_tweets_dates =  index
total_tweets_dates.groupby('post_time_day').count()

Unnamed: 0_level_0,tweet_id,start_ind,end_ind,post_date,total_retweets_over_15_days,outbreak
post_time_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.294514,1,1,1,1,1,1
0.295231,1,1,1,1,1,1
0.297766,1,1,1,1,1,1
0.298090,1,1,1,1,1,1
0.300127,1,1,1,1,1,1
...,...,...,...,...,...,...
14.999780,1,1,1,1,1,1
14.999792,1,1,1,1,1,1
14.999803,1,1,1,1,1,1
14.999861,1,1,1,1,1,1


In [None]:
# `.values.tolist()` yields a plain Python list, which has no `.head()`;
# slice it to preview the first five [start_ind, end_ind] pairs.
start_stop = index[['start_ind', 'end_ind']].values.tolist()
start_stop[:5]

In [8]:
index

Unnamed: 0,tweet_id,post_time_day,start_ind,end_ind,post_date,total_retweets_over_15_days,outbreak
0,1.224350e+17,0.926644,1,175,2011-10-07 22:14:22.000041600,174,False
1,1.224500e+17,0.968160,176,369,2011-10-07 23:14:08.999980800,193,False
2,1.224500e+17,0.969560,370,703,2011-10-07 23:16:09.999984000,333,False
3,1.224430e+17,0.949734,704,827,2011-10-07 22:47:36.999974400,123,False
4,1.224570e+17,0.987373,828,941,2011-10-07 23:41:48.999984000,113,False
...,...,...,...,...,...,...,...
166071,1.250010e+17,8.007164,34784039,34784103,2011-10-15 00:10:19.000012800,64,False
166072,1.250010e+17,8.007407,34784104,34784155,2011-10-15 00:10:39.999964800,51,False
166073,1.259250e+17,10.559387,34784156,34784320,2011-10-17 13:25:30.999648000,164,False
166074,1.255490e+17,9.519977,34784321,34784412,2011-10-16 12:28:46.000012800,91,False


In [None]:
import numpy as np

def merge_data(data, index):
    """Tag every row of ``data`` with the cascade it belongs to.

    Each row of ``index`` describes one cascade: ``start_ind`` and ``end_ind``
    are the 1-based, inclusive first and last rows of that cascade in ``data``.
    The cascade's ``tweet_id``, ``post_date`` and ``outbreak`` are copied onto
    all of those rows in one vectorised pass instead of one ``.loc`` slice
    assignment per cascade.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-retweet rows. Rows are addressed by position, so the frame is
        expected to be in file order with unique index labels (the default
        index produced by ``read_csv`` satisfies this).
    index : pandas.DataFrame
        One row per cascade with columns ``start_ind``, ``end_ind``,
        ``tweet_id``, ``post_date`` and ``outbreak``. Its own index labels are
        not used, so a filtered or re-ordered frame works too.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, modified in place (later cells read the new columns
        straight from ``data``). Rows not covered by any cascade range hold
        missing values in the three added columns.
    """
    df = data  # deliberately the same object, see "Returns"
    tag_columns = ['tweet_id', 'post_date', 'outbreak']
    n_rows = len(df)

    # Convert the 1-based inclusive ranges to 0-based positions and clip them to
    # the frame, which is what label slicing on a default index did implicitly.
    first = np.clip(index['start_ind'].to_numpy(dtype=np.int64) - 1, 0, None)
    last = np.clip(index['end_ind'].to_numpy(dtype=np.int64) - 1, None, n_rows - 1)
    lengths = np.clip(last - first + 1, 0, None)

    # Expand every range into explicit row positions without a Python loop:
    # cascade k contributes first[k], first[k] + 1, ..., last[k].
    cascade_no = np.repeat(np.arange(len(index)), lengths)
    range_start = np.cumsum(lengths) - lengths
    rows = np.repeat(first - range_start, lengths) + np.arange(lengths.sum())

    # owner[i] is the positional cascade number of data row i, or -1 if none.
    # If ranges overlap, the later cascade wins, as in a sequential fill.
    owner = np.full(n_rows, -1, dtype=np.int64)
    owner[rows] = cascade_no
    covered = owner >= 0

    for column in tag_columns:
        tagged = index[column].iloc[owner[covered]]
        tagged.index = df.index[covered]
        # Index alignment leaves uncovered rows as NaN / NaT.
        df[column] = tagged

    return df

data_next = merge_data(data, index)
data_next['post_date'].unique()

In [None]:
data_next.to_csv("merged_data.csv")

In [None]:
data_next.groupby('post_date').count()

In [None]:
# """
#     Combine the data sets
# """
# def merge(data):
#     idx = index.to_dict('records')
#     flatten = lambda t: [item for sublist in t for item in sublist]
#     tweet_id = [( 1 + row['end_ind'] - row['start_ind']) * [str(row['tweet_id'])] for row in idx]
#     outbreak = [( 1 + row['end_ind'] - row['start_ind']) * [str(row['outbreak'])] for row in idx]
#     post_date = [( 1 + row['end_ind'] - row['start_ind']) * [str(row['post_date'].strftime('%Y-%m-%d'))] for row in idx]
#     total_retweets_over_15_days = [( 1 + row['end_ind'] - row['start_ind']) * [str(row['total_retweets_over_15_days'])] for row in idx]

#     data['tweet_id'] = flatten(tweet_id)
#     data['outbreak'] = flatten(outbreak)
#     data['post_date'] = flatten(post_date)
#     data['total_retweets_over_15_days'] = flatten(total_retweets_over_15_days)

#     """
#         Convert data types
#     """
#     data['tweet_id'] = pd.to_numeric(data['tweet_id'])
#     data['total_retweets_over_15_days'] = pd.to_numeric(data['total_retweets_over_15_days'])
#     data['post_date'] = pd.to_datetime(data['post_date'])
#     data['outbreak'] = data['outbreak'].apply(lambda x: True if x == 'True' else False)
#     return data

# data_index = merge(data)
# data_index

In [None]:
#data.to_csv("check_outbreaks.csv")
total_tweets_grouped =  data
total_tweets_grouped.groupby('outbreak').count()

In [None]:
pd.options.display.float_format = lambda x : '{:.5f}'.format(x)
vb = data[data['tweet_id'] == 125000000000000000]
vb['post_date'].unique()

In [None]:
"""
 Break into time windows
"""

import warnings
warnings.filterwarnings('ignore')

def gen_groups(window_length_hrs = 1, window_size_secs = 10, source = None):
    """Yield one summary record per (tweet, time window).

    Retweets are grouped by ``tweet_id``; only those posted less than
    ``window_length_hrs`` hours after the original tweet are kept, and they are
    bucketed into consecutive windows of ``window_size_secs`` seconds.

    Parameters
    ----------
    window_length_hrs : number
        Observation horizon in hours (rows with ``relative_time_second`` at or
        beyond it are ignored).
    window_size_secs : number
        Width of each window in seconds.
    source : pandas.DataFrame, optional
        Frame with columns ``tweet_id``, ``relative_time_second``,
        ``number_of_followers``, ``outbreak`` and ``post_date``. Defaults to
        the notebook-level ``data`` frame.

    Yields
    ------
    dict
        Keys ``tweet_id``, ``median_followers``, ``count_retweets``,
        ``sum_followers``, ``outbreak``, ``window`` and ``post_date``.
        Windows without any retweet produce no record.

    NOTE(review): the index preview shows the same ``tweet_id`` (1.2245e+17)
    on more than one cascade, so the ids may be rounded in the CSV and
    grouping by them may merge distinct cascades; confirm against the raw file.
    """
    frame = data if source is None else source
    horizon_secs = 3600 * window_length_hrs

    for name, group in frame.groupby('tweet_id'):
        observed = group[group['relative_time_second'] < horizon_secs]
        # assign() builds a new frame, so nothing is written into a slice of
        # the source; floor division is vectorised over the whole column.
        windowed = observed.assign(
            window=observed['relative_time_second'] // window_size_secs
        )
        for n, sg in windowed.groupby('window'):
            nf = sg['number_of_followers']
            yield {'tweet_id': name, 'median_followers': nf.median(),
                   'count_retweets': nf.count(), 'sum_followers': nf.sum(),
                   'outbreak': sg['outbreak'].max(),
                   'window': n,
                   'post_date': sg['post_date'].max()}

g = gen_groups()
df = pd.DataFrame(list(g))
df.head()


In [None]:
#HELLO

In [None]:
"""
    Save the grouped tweets to a new file
    uncomment to resave
"""
df.to_csv('grouped_cascades_tagged_2020_v2.csv')

In [None]:
"""
    START HERE AFTER PREPROCESSING
"""
grouped_cascades = pd.read_csv("grouped_cascades_tagged_2020_v2.csv")
grouped_cascades['window'] = grouped_cascades['window'].astype('int')
# The CSV was written with its index, which comes back as 'Unnamed: 0'.
# Use the keyword form: pandas >= 2.0 rejects the positional `axis` argument.
grouped_cascades = grouped_cascades.drop(columns=['Unnamed: 0'])

total_tweets_group = grouped_cascades
total_tweets_group.groupby('outbreak').count()

In [None]:
grouped_cascades['post_date'].unique()

In [None]:
vb = grouped_cascades[grouped_cascades['tweet_id'] == 125000000000000000]
vb[vb['window']>=310]


In [None]:
grouped_cascades.head()

In [None]:
pd.options.display.float_format = lambda x : '{:.5f}'.format(x)
vb = grouped_cascades[grouped_cascades['tweet_id'] == 125000000000000000]
vb[vb['window']>=310]

In [None]:
"""
    Backfill tweets that don't have any retweets in certain windows
"""
def backfill_missing(data, n_windows=360):
    """Expand every tweet to a dense sequence of ``n_windows`` windows.

    Windows for which ``data`` has no row get zero for the numeric features.
    ``post_date`` and ``outbreak`` are filled from the same tweet's observed
    windows only, so a tweet with nothing in its first window(s) can no longer
    inherit these values from whichever tweet precedes it in the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per (``tweet_id``, ``window``) with columns
        ``median_followers``, ``count_retweets``, ``sum_followers``,
        ``outbreak`` and ``post_date``. Not modified.
    n_windows : int
        Number of windows per tweet (windows ``0 .. n_windows - 1``). The
        default of 360 corresponds to one hour of 10-second windows.

    Returns
    -------
    pandas.DataFrame
        ``len(unique tweets) * n_windows`` rows, with ``tweet_id`` renamed to
        ``tweet_index`` and the remaining columns in their original order.
        ``post_date`` is converted to datetime.
    """
    feature_cols = ['median_followers', 'count_retweets', 'sum_followers']
    meta_cols = ['post_date', 'outbreak']

    # Dense (tweet, window) grid, tweets in order of first appearance.
    grid = pd.MultiIndex.from_product(
        [data['tweet_id'].unique(), range(n_windows)],
        names=['tweet_index', 'window'],
    ).to_frame(index=False)

    observed = data.rename(columns={'tweet_id': 'tweet_index'})
    cascades_merged = grid.merge(observed, how='left', on=['tweet_index', 'window'])

    # Fill with zeros if there is no data for a window
    cascades_merged[feature_cols] = cascades_merged[feature_cols].fillna(value=0)
    cascades_merged['window'] = cascades_merged['window'].astype('int64')

    # Propagate per-tweet metadata within each tweet only: forward first, then
    # backward to cover windows that precede the tweet's first observed window.
    cascades_merged[meta_cols] = (
        cascades_merged.groupby('tweet_index', sort=False)[meta_cols].ffill()
    )
    cascades_merged[meta_cols] = (
        cascades_merged.groupby('tweet_index', sort=False)[meta_cols].bfill()
    )
    cascades_merged['post_date'] = pd.to_datetime(cascades_merged['post_date'])

    # Downstream code turns rows into arrays by position, so keep the columns
    # in the input order (with tweet_index leading).
    ordered = ['tweet_index'] + [c for c in data.columns if c != 'tweet_id']
    return cascades_merged[ordered]

cascades_merged = backfill_missing(grouped_cascades)
cascades_merged

In [None]:
"""
Training vs Testing Set
    The 15 days were divided into two parts, the first 7 days were used for training and the next 8 days were used for test.
    94,254 tweets in the test set.
    Test: 2856 unique tweets (55%)  Train: 2377 unique tweets (45%)
"""
pd.options.display.float_format = lambda x : '{:.5f}'.format(x)
def test_train_split(data):
    train = data[data.post_date < '2011-10-14']
    test = data[data.post_date > '2011-10-14']
    
#     row = train.loc[977351]
#     print(row)
#     train.drop(index=977351, inplace=True)
    count_test = len(test.tweet_index.unique())
    count_train = len(train.tweet_index.unique())
    #total = len(df.tweet_index.unique())
    total = len(data.tweet_index.unique())
    print("Unique cascades ", total)
    print("tot ", (count_test + count_train))
    print("Test: {} unique tweets ({:.0%})  Train: {} unique tweets ({:.0%})".format(count_test, float(count_test/total), count_train, float(count_train/total)))
    return train, test

print("Total tweet and retweets: {}".format(grouped_cascades.shape[0]))
train, test = test_train_split(cascades_merged)


In [None]:
train.groupby('tweet_index').describe()

In [None]:
test.groupby('tweet_index').describe()

In [None]:
test.groupby('tweet_index')['window'].count().unique()

In [None]:
vb = test[test['tweet_index'] == 125000000000000000]
vb[vb['window']>=310]

In [None]:
#train.iloc[-5:-1]
test.iloc[-362]

In [None]:
train.iloc[-1]

In [None]:
"""
    Create Numpy array for each tweet
"""
import numpy as np
def get_numpy_arrays(data, x=True):
    """Collect one entry per tweet, either its features or its label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tweet_index`` column, an ``outbreak`` column, a
        ``post_date`` column and the per-window feature columns. Not modified.
    x : bool
        ``True`` returns, for each tweet, its rows as a nested list with
        ``tweet_index``, ``post_date`` and ``outbreak`` removed (remaining
        columns keep their order). ``False`` returns, for each tweet, the
        maximum of its ``outbreak`` column.

    Returns
    -------
    list
        One element per tweet, in ascending ``tweet_index`` order (the
        default ``groupby`` ordering).
    """
    values = []
    for _, group in data.groupby('tweet_index'):
        if x:
            # drop(columns=...) returns a new frame: no in-place edits on a
            # groupby slice and no positional `axis` (rejected by pandas >= 2.0).
            features = group.drop(columns=['tweet_index', 'post_date', 'outbreak'])
            values.append(features.values.tolist())
        else:
            # Select by label; integer positions on a label-indexed Series are deprecated.
            values.append(group['outbreak'].max())
    return values

# Build the feature tensors for both splits (one entry per tweet).
# Column order per window: median_followers, count_retweets, sum_followers, window
train_input_x = np.stack(np.array(get_numpy_arrays(train, True)))
test_input_x = np.stack(np.array(np.array(get_numpy_arrays(test, True))))
# Labels are not built here (the two lines below are disabled); a later cell
# creates train_input_y / test_input_y.
# train_input_y = np.array(get_numpy_arrays(train, False))
# test_input_y = np.array(get_numpy_arrays(test, False))
train_input_x[0]

In [None]:
train_input_y[0]

In [None]:
np.stack(np.array(train_input_x)).shape

In [None]:
# sorted(train_input_x, key=len)[0]
# gb = train.groupby('tweet_index')
# for name, group in gb:
    
#     group.drop(['tweet_index', 'post_date'], 1, inplace=True)
#     group.drop(['outbreak'], 1, inplace=True)
#     d = group.values
#     break
# d
# d.tolist()[0]

In [None]:

# median_followers 	count_retweets 	sum_followers, window
train_input_x = np.array(get_numpy_arrays(train, True))
train_input_y = np.array(get_numpy_arrays(train, False))
test_input_x = np.array(get_numpy_arrays(test, True))
test_input_y = get_numpy_arrays(test, False)
train_input_x[:3]

In [None]:
train_input_y.shape

In [None]:
# Each tweet contributes 360 windows with several features per window (the model
# below is built with input_dim = 4), so a trailing axis of length 1 cannot hold
# the data. Let numpy infer the feature axis instead.
train_input_x = train_input_x.reshape((train_input_x.shape[0], 360, -1))

In [None]:
print(train_input_x.shape, " " ,  train_input_x[1].shape)  # 360 windows and 4 columns (with median_followers, count_retweets, sum_followers and window number)
print(train_input_y.shape, " " ,  train_input_y[1])  # 1 output per cascade
# Output recorded from an earlier run:
# (2377,)   (360, 4)
# (2377,)   True

In [None]:
train_input_x

In [None]:
# TEST WHAT IS GOING ON
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()


In [None]:
print(x_train.shape, " " ,  x_train[0].shape)  #28 rows and 28 columns
print(y_train.shape, " " ,  y_train[0])

In [None]:
y_train.shape

In [None]:
x_train.shape

In [None]:
x_train[:1]

In [None]:
# """
#     Clean up training set
# """
# train_X = train.drop(['post_date'], axis=1)
# train_X.head()
# train_X = train.drop(['outbreak'], axis=1)
# train_Y = train['outbreak']
# train_X


In [None]:
batch_size = 64
input_dim = 4  # features per time window
units = 64
output_size = 2  # one logit per class; the label is the boolean `outbreak` flag

# Build the RNN model
def build_model():
    """Assemble the classifier: LSTM encoder -> batch normalisation -> dense logits.

    Reads the notebook-level ``units``, ``input_dim`` and ``output_size``.
    The sequence length is left open (``None``); the final dense layer has no
    activation, so the model outputs raw logits.
    """
    # Wrapping a LSTMCell in a RNN layer
    recurrent_cell = keras.layers.LSTMCell(units)
    encoder = keras.layers.RNN(recurrent_cell, input_shape=(None, input_dim))
    normaliser = keras.layers.BatchNormalization()
    logits_head = keras.layers.Dense(output_size)
    return keras.models.Sequential([encoder, normaliser, logits_head])

model = build_model()
model.summary()

In [None]:
# A zero-length axis cannot hold any labels, and the sample count should not be
# hard-coded: reshape to one column and let numpy infer the number of rows.
train_input_y.reshape((-1, 1)).shape


In [None]:
# Compile and train for a single epoch, validating on the test split.
# from_logits=True tells the loss to expect raw scores; this matches a model
# whose last Dense layer has no activation, as in build_model().
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="sgd",
    metrics=["accuracy"],
)

model.fit(
    train_input_x, train_input_y, validation_data=(test_input_x, test_input_y), batch_size=batch_size, epochs=1
)


In [None]:
#Based on dengue fever group
model = keras.Sequential()

# Add a LSTM layer with 128 internal units.
# `train` is a 2-D DataFrame, so `train.shape[2]` raises IndexError; take the
# (timesteps, features) shape from the 3-D feature tensor instead.
model.add(layers.LSTM(128, input_shape=train_input_x.shape[1:]))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(1))
model.compile(loss='mae', optimizer='adam')  # mean absolute error to evaluate its performance
# model.summary()

In [None]:
# Train without shuffling, so samples are fed in their stored order.
# NOTE(review): `train_y`, `epoch_count`, `test_X` and `test_y` are not defined
# in any visible cell of this notebook, and `train` is the DataFrame split, not
# a feature tensor. Confirm the intended inputs before running this cell.
history = model.fit(train, train_y, epochs=epoch_count, batch_size=72, 
                        validation_data=(test_X, test_y), verbose=0, shuffle=False)

In [None]:
"""
    Model
"""
# Only `keras` and `layers` are imported in this notebook, so the bare names
# `models`, `LSTM` and `Dense` are undefined; go through the imported modules.
model = keras.Sequential()
# `train` is a 2-D DataFrame (no `shape[2]`); the (timesteps, features) input
# shape comes from the 3-D feature tensor.
model.add(layers.LSTM(103, input_shape=train_input_x.shape[1:]))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(1))
model.compile(loss='mae', optimizer='adam')

In [None]:

#print("Test ", count_train, " Test ", count_test, " :")
#test.s

# df =  data.sort_values(by='outbreak')
# df

In [None]:
df = data.sort_values(by='post_date', ascending = False)
df.head()

In [None]:
#Training
#The 15 days were divided into two parts, the first 7 days were used for training and the next 8 days were used for test

In [None]:
# B. Retweet count feature extractor
# B1. We can represent a cascade as a sequence of time windows after monitoring each tweet up to Time_o.
#     Each of these time windows has zero or more retweets.
# B2. Use a one-hot encoder to represent the total number of retweets in each time window as a vector.
# B3. This gives a sequence of vectors for the sequence of time windows corresponding to each tweet.
# B4. The sequence is fed through single GRU units to extract latent features.
# B5. Deploy an attention layer.
# A tweet is viral if retweet_total > N, where N is some predefined threshold.


In [None]:
 """
Our model is going to be a Long Short-Term Memory (LSTM) RNN. 
The recurrent layer is followed by a few dense layers, 
activated with Scaled Exponential Linear Unit (SELU) and 
regularized by batch normalization.
"""

model = keras.Sequential()
model.add(layers.Embedding(input_dim=1000, output_dim=64))

# The output of GRU will be a 3D tensor of shape (batch_size, timesteps, 256)
model.add(layers.GRU(256, return_sequences=True))

# The output of SimpleRNN will be a 2D tensor of shape (batch_size, 128)
model.add(layers.SimpleRNN(128))

model.add(layers.Dense(10))

model.summary()


In [None]:
# https://www.guru99.com/rnn-tutorial.html
import numpy as np
n_inputs = 4
n_neurons = 6
n_timesteps = 2

#The data is a sequence of a number from 0 to 9 and divided into three batches of data.
## Data 
X_batch = np.array([
        [[0, 1, 2, 5], [9, 8, 7, 4]], # Batch 1
        [[3, 4, 5, 2], [0, 0, 0, 0]], # Batch 2
        [[6, 7, 8, 5], [6, 5, 4, 2]], # Batch 3
    ])

In [None]:
# Placeholder
#     None: Unknown and will take the size of the batch
#     n_timesteps: Number of times the network will send the output back to the neuron
#     n_inputs: Number of input per batch

X = tf.placeholder(tf.float32, [None, n_timesteps, n_inputs])

In [None]:
# The higher the loss function, the dumber the model is. 
"""
A traditional NN produces its output by multiplying the input by the weights and applying the activation function.
With an RNN, this output is sent back to the network a number of times. We call a timestep each time
the output becomes the input of the next matrix multiplication.
"""

basic_cell = tf.keras.layers.SimpleRNNCell(units=n_neurons)
#basic_cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(num_units=n_neurons)

In [None]:
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)
#outputs, states = tf.keras.layers.RNN(basic_cell, X, dtype=tf.float32)

In [None]:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # Fetch both tensors in one run while the session is still open.
    # Tensor.eval() needs a default session, so calling it after the `with`
    # block has exited fails.
    outputs_val, states_val = sess.run([outputs, states], feed_dict={X: X_batch})
print(states_val)