# RNN Model

## Install Packages

In [None]:
# Install your required packages here
!pip install pandas numpy matplotlib sklearn fsspec gcsfs tqdm

## Imports & Constants

In [None]:
# Import the libraries for the recurrent (GRU) model
import numpy as np
import pandas as pd
import sklearn
from tqdm import tqdm

from google.cloud import storage
from glob import glob
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.utils.vis_utils import plot_model
import math

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython magic: point the Google Cloud client libraries at the service-account credentials file.
%env GOOGLE_APPLICATION_CREDENTIALS=/content/drive/My Drive/CS/AI/Credentials/ai-project-2020-f4dfbc25326c.json

In [None]:
# Name of the Google Cloud Storage bucket that holds the session logs.
bucket_name = "ai-project-2020-spotify"
# Create a storage client (uses the credentials configured via the environment)
# and fetch a handle to the bucket.
client = storage.Client()
bucket = client.get_bucket(bucket_name)

## Import Session Logs

In [None]:
# Load a single training-set log file straight from the GCS bucket.
logs = pd.read_csv(f"gs://{bucket_name}/training_set/log_0_20180715_000000000000.csv.gz")

# The 'date' column is not used by the model.
logs.drop(columns=['date'], inplace=True)

# Track-level feature columns; create_matrix uses this list as the input
# features for the second half of each session.
track_num_columns_old = ['duration', 'release_year', 'us_popularity_estimate',
                     'acousticness', 'beat_strength', 'bounciness', 'danceability',
                     'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
                     'liveness', 'loudness', 'mechanism', 'organism', 'speechiness',
                     'tempo', 'time_signature', 'valence', 'acoustic_vector_0',
                     'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
                     'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6',
                     'acoustic_vector_7']
# Track features plus session-interaction columns; create_matrix uses this list
# for the first half of each session. This is a new list, so later `+=`
# extensions do not affect track_num_columns_old.
track_num_columns = track_num_columns_old + ["context_switch", "no_pause_before_play",
                                             "short_pause_before_play", "long_pause_before_play",
                                             'hist_user_behavior_n_seekfwd', 'hist_user_behavior_n_seekback',
                                            'hist_user_behavior_is_shuffle', "premium"]


## Import Track Features

In [None]:
# Load both track-feature shards, indexed by track_id.
track_features_1 = pd.read_csv('/content/drive/My Drive/CS/AI/Data/track_features/tf_000000000000.csv').set_index('track_id')
track_features_2 = pd.read_csv('/content/drive/My Drive/CS/AI/Data/track_features/tf_000000000001.csv').set_index('track_id')
# Stack the shards into one table. DataFrame.append was removed in pandas 2.0;
# pd.concat produces the same row-wise concatenation.
track_features = pd.concat([track_features_1, track_features_2])

In [None]:
# Preview the first rows of the combined track-feature table.
track_features.head()

## Utility Functions

In [None]:
def ave_pre(submission, groundtruth):
    """Return the average precision of one session's binary predictions.

    The prediction/label pairs are walked in order. Each time a prediction
    equals its label, the fraction of matches seen so far (matches divided by
    the 1-based position) is added to a running total. The total is divided
    by ``len(groundtruth)``.

    Raises:
        ValueError: if a prediction is neither 0 nor 1.
    """
    matches = 0.0
    precision_total = 0.0
    for position, (predicted, actual) in enumerate(zip(submission, groundtruth), start=1):
        if predicted != 0 and predicted != 1:
            raise ValueError()
        if predicted == actual:
            matches += 1.0
            precision_total += matches / position
    return precision_total / len(groundtruth)

def evaluate(submission, groundtruth):
    """Calculate metrics for prediction and ground truth lists (source: starter kit).

    Args:
        submission: sequence of per-session prediction sequences (values 0 or 1).
        groundtruth: sequence of per-session label sequences, parallel to
            ``submission``.

    Returns:
        Tuple ``(ap, first_pred_acc)``: the mean of ``ave_pre`` over all
        sessions, and the fraction of sessions whose first prediction equals
        the first label.

    Raises:
        Exception: if a session has a different number of predictions than labels.
        ValueError: if a prediction is neither 0 nor 1.
        ZeroDivisionError: if there are no sessions to evaluate.
    """
    ap_sum = 0.0
    first_pred_acc_sum = 0.0
    counter = 0
    for sub, tru in zip(submission, groundtruth):
        # Both error messages report the same 1-based line number.
        line_number = counter + 1
        if len(sub) != len(tru):
            raise Exception('Line {} should contain {} predictions, but instead contains '
                            '{}'.format(line_number, len(tru), len(sub)))
        try:
            ap_sum += ave_pre(sub, tru)
        except ValueError as e:
            # Chain the original error so the traceback keeps its cause.
            raise ValueError(
                'Invalid prediction in line {}, should be 0 or 1'.format(line_number)
            ) from e
        first_pred_acc_sum += sub[0] == tru[0]
        counter += 1
    ap = ap_sum / counter
    first_pred_acc = first_pred_acc_sum / counter
    return ap, first_pred_acc

def normalize(df, feature_name):
    """Return a copy of ``df`` with the given columns min-max scaled to [0, 1].

    Args:
        df: input DataFrame; it is not modified.
        feature_name: iterable of column names to rescale.

    Returns:
        A new DataFrame in which each listed column is
        ``(value - min) / (max - min)``. A constant column (max == min) maps to
        0.0 instead of dividing by zero and turning into NaN; missing values
        stay missing.
    """
    result = df.copy()
    for name in feature_name:
        max_value = df[name].max()
        min_value = df[name].min()
        value_range = max_value - min_value
        shifted = df[name] - min_value
        if value_range == 0:
            # Every non-missing value equals the minimum, so `shifted` is
            # already all zeros; only make the dtype float like the scaled case.
            result[name] = shifted.astype(float)
        else:
            result[name] = shifted / value_range
    return result

def categorical_to_dummies(df, categorical_cols):
    """One-hot encode the given categorical columns.

    The listed columns are replaced by indicator columns named
    ``<column>_<value>``; all other columns are kept as they are.
    """
    encoded = pd.get_dummies(df[categorical_cols], prefix=categorical_cols)
    remaining = df.drop(columns=categorical_cols)
    return remaining.join(encoded)

def split_sessions(data, perc_in=0.6, random_state=None):
    """Split interactions into train and test sessions.

    Whole sessions are assigned to one side or the other, so no session is
    shared between the two outputs.

    Args:
        data: DataFrame with a 'session_id' column.
        perc_in: fraction of the unique sessions to place in the first output.
        random_state: optional seed for a local random generator. When None
            (the default) the global NumPy random state is used, so existing
            ``np.random.seed`` calls keep working.

    Returns:
        Tuple ``(data_in, data_out)`` of DataFrames indexed by 'session_id'.
        Rows of ``data_in`` follow the sampled session order; rows of
        ``data_out`` follow sorted session-id order.
    """
    sessions = np.asarray(data['session_id'].unique())
    amt_in = int(perc_in * len(sessions))
    if random_state is None:
        sessions_in = np.random.choice(sessions, amt_in, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        sessions_in = rng.choice(sessions, amt_in, replace=False)
    # setdiff1d returns the remaining ids sorted, so the held-out order is
    # reproducible; iterating a Python set of strings is not.
    sessions_out = np.setdiff1d(sessions, sessions_in)
    indexed_data = data.set_index('session_id')
    data_in = indexed_data.loc[sessions_in]
    data_out = indexed_data.loc[sessions_out]
    return data_in, data_out

## Preprocessing

In [None]:
# One-hot encode the categorical log columns (context type and the reasons a track started/ended).
logs = categorical_to_dummies(logs, ['context_type', 'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end'])

In [None]:
# Add every logs column from position 18 onward to the first-half feature list.
# NOTE(review): 18 is a hard-coded offset, presumably meant to select the dummy
# columns created in the previous cell — confirm it matches the actual column
# layout. Re-running this cell appends the same names again.
track_num_columns += list(logs.columns[18:])

# One-hot encode the track 'mode' column.
track_features = categorical_to_dummies(track_features, ['mode'])

# assumes 'mode' takes exactly the values 'major' and 'minor' — TODO confirm
track_num_columns += ['mode_major', 'mode_minor']

In [None]:
# Join track features onto the logs: each log row's 'track_id_clean' is looked up
# in the track_features index; a left join keeps every log row.
data = logs.join(track_features, on='track_id_clean', how='left')
# Number of distinct sessions in the joined data (shown as the cell output).
data['session_id'].nunique()

In [None]:
# Convert booleans to ints: multiplying by 1 maps True/False to 1/0
for flag_column in ('premium', 'hist_user_behavior_is_shuffle', 'skip_1', 'skip_2', 'skip_3'):
    data[flag_column] = data[flag_column] * 1

In [None]:
# Min-max scale the features listed below
feature_name = (
    ['duration', 'release_year', 'us_popularity_estimate', 'flatness', 'loudness', 'tempo']
    + ['acoustic_vector_{}'.format(component) for component in range(8)]
    + ['key']
)

data = normalize(data, feature_name)

In [None]:
# Train/validation split by session: 75% of the sessions go to `train`, the rest to `val`.
# Both frames come back indexed by session_id.
train, val = split_sessions(data, 0.75)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
def create_matrix(data, max_len=10):
    """Build zero-padded per-session arrays for the two-input model.

    Each session is cut at half of its length. Rows with
    ``session_position <= session_length / 2`` form the first half, whose
    inputs are ``track_num_columns`` plus the observed ``skip_2``. The
    remaining rows form the second half, whose inputs are
    ``track_num_columns_old`` and whose ``skip_2`` values are the targets.

    Args:
        data: DataFrame with ``session_position``, ``session_length``,
            ``skip_2`` and the feature columns, with ``session_id`` available
            as a column or index level for grouping.
        max_len: number of rows every session half is padded to.

    Returns:
        Tuple ``(X1, X2, y)`` of arrays whose first axis is the session.
        Sessions follow the group order of the first half. A session missing
        from either half is skipped, so row ``i`` of all three arrays always
        refers to the same session.

    Raises:
        ValueError: if a session half has more than ``max_len`` rows.
    """
    half_length = data.session_length / 2
    first_half = data[data.session_position <= half_length]
    second_half = data[data.session_position > half_length]

    start_sessions = first_half.groupby("session_id")
    end_sessions = second_half.groupby("session_id")

    # Iterating the two groupings independently would misalign X1 against
    # X2/y whenever a session is absent from one half, so use the common ids.
    end_ids = set(end_sessions.groups)
    session_ids = [sid for sid in start_sessions.groups if sid in end_ids]

    start_columns = track_num_columns + ["skip_2"]

    def _pad_rows(array):
        """Zero-pad `array` along its first axis up to `max_len` rows."""
        if len(array) > max_len:
            raise ValueError(
                f"session half has {len(array)} rows, more than max_len={max_len}"
            )
        padding = [(0, max_len - len(array))] + [(0, 0)] * (array.ndim - 1)
        return np.pad(array, padding, 'constant', constant_values=0)

    X1, X2, y = [], [], []
    for session_id in tqdm(session_ids):
        # First half: features plus the observed skip flag.
        start = start_sessions.get_group(session_id)[start_columns]
        # `* 1` turns a boolean flag into 0/1; assign() works on a new frame
        # rather than writing into a slice (avoids SettingWithCopyWarning).
        start = start.assign(skip_2=start["skip_2"] * 1)
        X1.append(_pad_rows(start.to_numpy()))

        # Second half: track features as inputs, skip flag as target.
        end = end_sessions.get_group(session_id)
        X2.append(_pad_rows(end[track_num_columns_old].to_numpy()))
        y.append(_pad_rows(end["skip_2"].to_numpy() * 1))

    return np.array(X1), np.array(X2), np.array(y)

# Training arrays: first-half inputs, second-half inputs, second-half skip targets.
X1,X2,y = create_matrix(train)

# The same arrays for the validation sessions.
X1_v,X2_v,y_v = create_matrix(val)

In [None]:
# """## Loading numpy arrays (from archives)
# This is alternative to loading from Google Drive.
# Can be used to speed up experimentation once archives have been created.
# """

# train_files = [] # Archive files to use for training
# test_files = []  # Archive files to use for testing

# # Load training files
# X1 = None
# X2 = None
# y = None
# for fname in train_files:
#   archive = np.load(fname)
#   if X1 is None:
#     # This is the first file, initialise the arrays
#     X1 = archive['X1']
#     X2 = archive['X2']
#     y = archive['y']
#   else:
#     # Concatenate the arrays
#     X1 = np.concatenate((X1, archive['X1']))
#     X2 = np.concatenate((X2, archive['X2']))
#     y = np.concatenate((y, archive['y']))

# X1_v = None
# X2_v = None
# y_v = None
# for fname in test_files:
#   archive = np.load(fname)
#   if X1_v is None:
#     # This is the first file, initialise the arrays
#     X1_v = archive['X1']
#     X2_v = archive['X2']
#     y_v = archive['y']
#   else:
#     # Concatenate the arrays
#     X1_v = np.concatenate((X1_v, archive['X1']))
#     X2_v = np.concatenate((X2_v, archive['X2']))
#     y_v = np.concatenate((y_v, archive['y']))

## Model

In [None]:
# Network 1 - First half of the session (10 padded steps, one feature vector per step)
input_layer1 = layers.Input(shape=(10, X1.shape[2]))

# Normalisation
norm1 = layers.BatchNormalization()(input_layer1)

# Recurrent layer(s): bidirectional GRU with 25 units per direction.
# return_sequences=False keeps one output vector per session rather than one per step.
# (The variable is named lstm1, but the layer is a GRU.)
lstm1 = layers.Bidirectional(layers.GRU(25, return_sequences=False, input_shape=(10, X1.shape[2])))(norm1)

# Network 2 - Second half of the session (10 padded steps, track features only)
input_layer2 = layers.Input(shape=(10, X2.shape[2]))

# Normalisation
norm2 = layers.BatchNormalization()(input_layer2)

# Recurrent layer(s): same bidirectional GRU configuration as network 1
lstm2 = layers.Bidirectional(layers.GRU(25, return_sequences=False, input_shape=(10, X2.shape[2])))(norm2)

# Concatenation & dense layer: 10 outputs, one per padded position of the second half
# NOTE(review): the targets are 0/1 skip flags, but the output uses relu and the model is
# trained with mean absolute error — confirm this is intended rather than a sigmoid
# output with binary cross-entropy.
concat = layers.Concatenate()([lstm1, lstm2])
dense_last = layers.Dense(10, activation="relu")(concat)

In [None]:
# Compile: two-input model with mean absolute error loss, Adam optimizer, and accuracy as a reported metric
lossf = keras.losses.MeanAbsoluteError()
model = keras.Model(inputs=[input_layer1, input_layer2], outputs=[dense_last])
model.compile(optimizer='adam', loss=lossf, metrics=["acc"])

# Train for 15 epochs with 100 sessions per batch.
# NOTE(review): no validation data is passed to fit(); in the visible cells the
# validation arrays are only used for the final evaluation.
history = model.fit([X1, X2], y, epochs=15, batch_size=100)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Predict on the validation sessions
predictions = model.predict([X1_v, X2_v])

# Threshold each output at 0.5 to get a boolean skip prediction, then score against the labels.
# NOTE(review): create_matrix zero-pads y_v to 10 positions per session, so padded
# positions are scored as if they were real "not skipped" tracks — confirm this is intended.
evaluate([[i >= 0.5 for i in p] for p in predictions], y_v)
