In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score as AUC
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf

In [2]:
def data_split(data, validation_ratio = 0.15, test_ratio = 0.15):
    """
    Split data into train, validation and test sets chronologically, per user.

    Every row is ranked by ``timestamp`` within its ``userid`` group and the
    rank is divided by the number of rows of that user. Rows whose scaled
    rank is at most ``1 - validation_ratio - test_ratio`` go to train, rows
    above ``1 - test_ratio`` go to test, and the rows in between go to
    validation.

    The input frame is NOT modified: the scaled rank is kept in a separate
    Series and the returned frames are new objects.

    Reference: https://stackoverflow.com/questions/42395258/

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``userid`` and ``timestamp``.
    validation_ratio : float
        Fraction of each user's rows assigned to the validation set.
    test_ratio : float
        Fraction of each user's (latest) rows assigned to the test set.

    Returns
    -------
    (train_data, validation_data, test_data) : tuple of pandas.DataFrame
        The three splits with the ``timestamp`` and ``userid`` columns removed.

    Raises
    ------
    ValueError
        If a ratio is negative or the two ratios sum to more than 1.
    """
    if validation_ratio < 0 or test_ratio < 0 or validation_ratio + test_ratio > 1:
        raise ValueError(
            "validation_ratio and test_ratio must be non-negative and sum to at most 1"
        )

    # Compute each cut-off once and reuse it for both neighbouring splits, so
    # a row sitting on a boundary can never land in two splits (or in none)
    # because of floating-point rounding.
    train_cutoff = 1 - validation_ratio - test_ratio
    test_cutoff = 1 - test_ratio

    # Position of every row in its user's timeline, scaled into (0, 1].
    per_user_timestamps = data.groupby('userid')['timestamp']
    scaled_time_rank = per_user_timestamps.rank() / per_user_timestamps.transform('size')

    # Rows with an undefined rank (NaN) compare False everywhere and are
    # therefore left out of all three splits.
    in_train = scaled_time_rank <= train_cutoff
    in_validation = (scaled_time_rank > train_cutoff) & (scaled_time_rank <= test_cutoff)
    in_test = scaled_time_rank > test_cutoff

    # The split keys are not model features; drop() returns a new frame, so
    # neither the caller's data nor a slice of it is modified in place.
    split_keys = ['timestamp', 'userid']
    train_data = data.loc[in_train, :].drop(columns=split_keys)
    validation_data = data.loc[in_validation, :].drop(columns=split_keys)
    test_data = data.loc[in_test, :].drop(columns=split_keys)
    return train_data, validation_data, test_data
    

In [3]:
# Load the engineered feature table (path is relative to the working directory).
pandas_df = pd.read_csv('data_engineered_features500.csv')

# Integer-encode gender: "m" -> 1, "f" -> 2, anything else (including missing) -> 0.
pandas_df["gender_int"] = (
    pandas_df["gender"]
    .map({"m": 1, "f": 2})
    .fillna(0)
    .astype("int64")
)

# Remove the text columns and the raw gender column, then replace every
# remaining missing value with 0.
pandas_df = (
    pandas_df
    .drop(columns=["track-name", "artist-name", "songlength", "gender"])
    .fillna(0)
)

In [4]:
# Preview the first rows of the feature table after gender encoding and NaN filling.
pandas_df.head()

Unnamed: 0,userid,timestamp,weekday,hour,weekend,daytime,user-track-total-count,track-weekday-count,track-daytime-count,last-seen-song,...,user-song-skip-percentage,user-artist-skips,user-artist-skip-percentage,global-song-skips,global-artist-skips,artist_total_count,song_total_count,global-song-skip-percentage,global-artist-skip-percentage,gender_int
0,user_000001,2006-08-13 13:59:20,6,13,1,3,1,1,1,0.0,...,0.0,0,0.0,0,0,1,1,0.0,0.0,1
1,user_000001,2006-08-13 14:03:29,6,14,1,3,1,1,1,0.0,...,0.0,0,0.0,0,0,2,1,0.0,0.0,1
2,user_000001,2006-08-13 14:10:43,6,14,1,3,1,1,1,0.0,...,0.0,0,0.0,0,0,3,1,0.0,0.0,1
3,user_000001,2006-08-13 15:44:17,6,15,1,3,1,1,1,0.0,...,0.0,0,0.0,0,0,1,1,0.0,0.0,1
4,user_000001,2006-08-13 16:46:52,6,16,1,3,1,1,1,0.0,...,1.0,1,0.333333,1,1,3,1,1.0,1.0,1


In [5]:
# Chronological per-user split: the earliest 70% of each user's rows are used
# for training, the next 15% for validation and the latest 15% for testing.
train_data, validation_data, test_data = data_split(
    pandas_df,
    validation_ratio=0.15,
    test_ratio=0.15,
)

In [None]:
# --- Targets ---------------------------------------------------------------
# Pull the binary "skipped" label out of every split as a NumPy array.
train_y = np.array(train_data["skipped"])
validation_y = np.array(validation_data["skipped"])
test_y = np.array(test_data["skipped"])

# --- Features --------------------------------------------------------------
# drop() returns new frames, so the label extraction above and this step do
# not depend on in-place mutation of slices.
train_data = train_data.drop(columns=["skipped"])
validation_data = validation_data.drop(columns=["skipped"])
test_data = test_data.drop(columns=["skipped"])

features_list = list(train_data)

# --- Standardisation -------------------------------------------------------
# The statistics are computed on the TRAINING features only:
#   * no information from the validation/test rows leaks into the scaling;
#   * the statistics carry exactly the feature columns. Subtracting a Series
#     computed on a wider table aligns on column labels and would add an
#     all-NaN column for every label that is not a feature (e.g. "skipped").
feature_mean = train_data[features_list].mean()
# A constant column has std == 0; divide by 1 instead so it becomes all zeros
# rather than NaN/inf.
feature_std = train_data[features_list].std().replace(0, 1)

# Each split is scaled from ITS OWN rows (the validation features were
# previously derived from the training frame by mistake).
train_data = (train_data - feature_mean) / feature_std
validation_data = (validation_data - feature_mean) / feature_std
test_data = (test_data - feature_mean) / feature_std

In [None]:
# Release the full feature table; the following cells only use the three split frames.
del pandas_df

In [None]:
# (rows, columns) of the training matrix; the column count has to match the
# input_dim of the first Dense layer in neural_net_model.
train_data.shape

In [None]:
def neural_net_model(input_dim=14):
    """
    Build and compile a small feed-forward binary classifier.

    Architecture: one hidden Dense layer with 10 ReLU units (Glorot-normal
    initialised weights) followed by a single sigmoid output unit. The model
    is compiled with binary cross-entropy loss, Nesterov-momentum SGD and the
    accuracy metric.

    Parameters
    ----------
    input_dim : int, default 14
        Number of input features, i.e. the number of columns of the matrix
        passed to ``fit``. The default keeps the previously hard-coded value,
        so calling the function without arguments behaves as before.

    Returns
    -------
    A compiled ``tf.keras.models.Sequential`` model.
    """
    # create model
    model = tf.keras.models.Sequential()

    model.add(tf.keras.layers.Dense(10, input_dim=input_dim, activation='relu',
                                    kernel_initializer='glorot_normal'))

    # Optional regularisation, currently disabled:
    # model.add(tf.keras.layers.Dropout(0.1))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # Compile model
    # NOTE(review): `lr` and `decay` are the legacy Keras argument names; newer
    # releases expect `learning_rate` and may reject `decay` - confirm against
    # the installed TensorFlow version before upgrading.
    sgd = tf.keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

In [None]:
# Wrap the Keras model builder in the scikit-learn compatible classifier that
# ships with TensorFlow's Keras, so it exposes fit / predict_proba.
# Training runs for 5 epochs with mini-batches of 128 rows and no progress output.
deep_net = tf.keras.wrappers.scikit_learn.KerasClassifier(
    build_fn=neural_net_model,
    epochs=5,
    batch_size=128,
    verbose=0,
)

In [None]:
# L2-regularised logistic regression (liblinear solver), fully specified so the
# configuration does not depend on library defaults.
linear_model = LR(
    penalty='l2',
    dual=False,
    tol=0.001,
    C=1,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=42,
    solver='liblinear',
    max_iter=50,
    multi_class='ovr',
    verbose=0,
    warm_start=False,
    n_jobs=1,
)

In [None]:
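# Disabled: out-of-fold predictions for the logistic-regression baseline.
# It cannot be re-enabled as written: pandas_df has been deleted by this point
# and `y` is not defined in the visible cells - pass the training features and
# labels instead.
#oof_prediction = cross_val_predict(linear_model, np.array(pandas_df), y, cv=5, method='predict_proba')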

### Train the NN

In [None]:
# Fit the network on the training feature matrix and its skip labels.
deep_net.fit(train_data.values, train_y)

In [None]:
# Column 1 of predict_proba is taken as the score for the positive class.
val_prediction = deep_net.predict_proba(validation_data.values)[:, 1]

In [None]:
# Area under the ROC curve of the validation scores against the true labels.
val_auc = AUC(y_true=validation_y, y_score=val_prediction)
print("Validation AUC Score:", val_auc)