In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

import gc
import datetime
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore')

import tensorflow as tf
from keras.layers import *
from keras.models import Model, Sequential, load_model
from keras.optimizers import *
from keras.callbacks import ModelCheckpoint
from keras.activations import *
from keras.layers.advanced_activations import *
from keras import regularizers

Using TensorFlow backend.


In [2]:
train = pd.read_csv('../input/charbusters/ChartbustersParticipantsData/Data_Train.csv')
test = pd.read_csv('../input/charbusters/ChartbustersParticipantsData/Data_Test.csv')
sample_sub = pd.read_excel('../input/charbusters/ChartbustersParticipantsData/Sample_Submission.xlsx')

In [3]:
ID_COL, TARGET_COL = 'Unique_ID', 'Views'
print(train.shape, test.shape)
target = train[TARGET_COL]
train.drop(TARGET_COL, axis=1, inplace=True)
df = pd.concat([train, test])
print(df.shape)
df = df.drop('Country', axis=1)
features = [c for c in df.columns if c not in [ID_COL, TARGET_COL]]

(78458, 11) (19615, 10)
(98073, 10)


In [4]:
# nums.fill('na')
errors = []
def clean_nums(x):
    if ',' in x:
        x = x.replace(',', '')
    if 'K' in x:
        x = x.replace('K', '')
        x = float(x) * 10**3
    elif 'M' in x:
        x = x.replace('M', '')
        x = float(x) * 10**6
    try: return float(x)
    except:
        errors.append(x)
        return np.nan

In [5]:
num_cols = ['Comments', 'Likes', 'Popularity', 'Followers']
df[num_cols] = df[num_cols].apply(lambda x: x.apply(lambda x: clean_nums(str(x))))
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
cat_cols = ['Name', 'Genre']
df['Song_Name'] = df['Song_Name'].fillna('xxxx')
df['song_length'] = df['Song_Name'].apply(lambda x: len(x))
df = df.drop('Song_Name', axis=1)
df[cat_cols] = df[cat_cols].apply(lambda x: pd.factorize(x, sort=True)[0])

In [6]:
def get_new_columns(name,aggs):
    new_cols_list = []
    for k in aggs.keys():
        for agg in aggs[k]:
            if isinstance(agg, str):
                new_cols_list.append(name + '_' + k + '_' + agg)
            else:
                new_cols_list.append(name + '_' + k + agg.__name__)     
    return new_cols_list

aggs = {}
aggs['Comments'] = ['mean', 'min', 'max']
aggs['Likes'] = ['mean', 'min', 'max']
aggs['Popularity'] = ['mean', 'min', 'max']
aggs['Followers'] = ['mean', 'min', 'max']

new_columns = get_new_columns('name_',aggs)
name_grp = df.groupby('Name').agg(aggs)
name_grp.columns = new_columns
name_grp.reset_index(drop=False,inplace=True)
df = pd.merge(df, name_grp, on='Name', how = 'left')

In [7]:
df['likes_per_follower'] = df['Likes']/(df['Followers'] + 1e-5)
df['likes_per_comment'] = df['Likes']/df['Comments']
df['likes_per_popularity'] = df['Likes']/df['Popularity']
df['comments_per_follower'] = df['Comments']/df['Followers']
df['Popularity_per_follower'] = df['Popularity']/df['Followers']
df['comments_per_popularity'] = df['Comments']/df['Popularity']

In [8]:
df['likes_prod_follower'] = df['Likes']*(df['Followers'] + 1e-5)
df['likes_prod_comment'] = df['Likes']*df['Comments']
df['likes_prod_popularity'] = df['Likes']*df['Popularity']
df['comments_prod_follower'] = df['Comments']*df['Followers']#
df['Popularity_prod_follower'] = df['Popularity']*df['Followers']#.head(2)
df['comments_prod_popularity'] = df['Comments']*df['Popularity']

In [9]:
df['year'] = df['Timestamp'].dt.year
df['day'] = df['Timestamp'].dt.day
df['weekofyear'] = df['Timestamp'].dt.weekofyear
df['month'] = df['Timestamp'].dt.month
df['dayofweek'] = df['Timestamp'].dt.dayofweek
df['weekend'] = (df['Timestamp'].dt.weekday >=5).astype(int)
df['days_since_release'] = (datetime.datetime(2019, 8, 1) - df['Timestamp']).dt.days
df['hour'] = df['Timestamp'].dt.hour
df['minute'] = df['Timestamp'].dt.minute

In [10]:
temp = df.sort_values(by=['Name', 'Timestamp'])
temp['artist_next_album'] = temp.groupby('Name')['days_since_release'].shift(0) - temp.groupby('Name')['days_since_release'].shift(-1)
temp['artist_prev_album'] = temp.groupby('Name')['days_since_release'].shift(0) - temp.groupby('Name')['days_since_release'].shift(1)
temp['artist_next_album'] = temp['artist_next_album'].fillna(0)
temp['artist_prev_album'] = temp['artist_prev_album'].fillna(0)*-1
df = pd.merge(df, temp[[ID_COL, 'artist_next_album', 'artist_prev_album']], on = ID_COL, how = 'left')

In [11]:
temp = df.sort_values(by=['Genre', 'Timestamp'])
temp['genre_next_album'] = temp.groupby('Genre')['days_since_release'].shift(0) - temp.groupby('Genre')['days_since_release'].shift(-1)
temp['genre_prev_album'] = temp.groupby('Genre')['days_since_release'].shift(0) - temp.groupby('Genre')['days_since_release'].shift(1)
temp['genre_next_album'] = temp['genre_next_album'].fillna(0)
temp['genre_prev_album'] = temp['genre_prev_album'].fillna(0)*-1
df = pd.merge(df, temp[[ID_COL, 'genre_next_album', 'genre_prev_album']], on = ID_COL, how = 'left')

In [12]:
temp = df.sort_values(by=['Name', 'Timestamp'])
temp['Popularity_in_next_album'] = temp.groupby('Name')['Popularity'].shift(0) - temp.groupby('Name')['Popularity'].shift(-1)
temp['Popularity_in_prev_album'] = temp.groupby('Name')['Popularity'].shift(0) - temp.groupby('Name')['Popularity'].shift(1)
temp['Popularity_in_next_album'] = temp['Popularity_in_next_album'].fillna(0)
temp['Popularity_in_prev_album'] = temp['Popularity_in_prev_album'].fillna(0)
df = pd.merge(df, temp[[ID_COL, 'Popularity_in_next_album', 'Popularity_in_prev_album']], on = ID_COL, how = 'left')

In [13]:
temp = df.sort_values(by=['Name', 'Timestamp'])
temp[['Prev_Popularity', 'Prev_Likes', 'Prev_Followers']] = temp.groupby('Name')[['Popularity', 'Followers', 'Likes']].shift(-1)
df = pd.merge(df, temp[[ID_COL, 'Prev_Popularity', 'Prev_Likes', 'Prev_Followers']], on = ID_COL, how = 'left')

In [14]:
temp  = df.sort_values(by = ['Name', 'Timestamp']).set_index('Timestamp')
res = temp.groupby('Name').rolling('30D')[['Likes', 'Popularity', 'Followers']].mean().reset_index(drop=True)
res.columns = [c + '_Name_rolling' for c in res.columns]
res[ID_COL] = temp[ID_COL].values
df = pd.merge(df, res, on = ID_COL, how='left')

In [15]:
df['year_month'] = pd.factorize(df['year'].astype('str') + '_' + df['month'].astype('str'))[0]

In [16]:
for c in ['Comments', 'Likes', 'Popularity', 'Followers']:
    df[c + '_yearwise_mean'] = df['year'].map(df[[c, 'year']].groupby('year')[c].mean())
    df[c + '_yearwise_max'] = df['year'].map(df[[c, 'year']].groupby('year')[c].max())

In [17]:
df['seconds_since_release'] = (datetime.datetime(2019, 8, 1) - df['Timestamp']).dt.seconds

In [18]:
cat_cols = ['Name', 'Genre', 'year_month']

In [19]:
freq_cols = [c + '_freq' for c in cat_cols]
df[freq_cols] = df[cat_cols].apply(lambda x: x.map(x.value_counts()))

features = [c for c in df.columns if c not in [ID_COL, TARGET_COL, 'Timestamp']]
print(len(features))

num_cols = [c for c in features if c not in cat_cols]
print(len(num_cols))

df[num_cols].isnull().sum()[df[num_cols].isnull().sum() > 0]

65
62


likes_per_comment          2296
likes_per_popularity       2323
comments_per_popularity    7100
Prev_Popularity            1219
Prev_Likes                 1219
Prev_Followers             1219
dtype: int64

In [20]:
def remove_inf(x):
    x[x == np.inf] = 0
    return x
df[num_cols] = df[num_cols].apply(lambda x: x.fillna(x.mean()))
df[num_cols] = df[num_cols].apply(lambda x: remove_inf(x))

In [21]:
df[num_cols] = df[num_cols].apply(lambda x: x.fillna(x.mean()))
df[num_cols] = df[num_cols].apply(lambda x: remove_inf(x))

df[num_cols].isnull().sum()[df[num_cols].isnull().sum() > 0]

Series([], dtype: int64)

In [22]:
to_do_log_num_cols = [c for c in num_cols if (df[c] < 0).sum() == 0]
log_num_cols = [c +'_log' for c in num_cols if (df[c] < 0).sum() == 0]
print(len(log_num_cols))
df[log_num_cols] = df[to_do_log_num_cols].apply(lambda x: np.log1p(x))
num_cols = list(set(num_cols + log_num_cols))

60


In [23]:
def scale(x):
    return (x - x.mean())/(1e-5 + x.std())
df[num_cols] = df[num_cols].apply(lambda x: scale(x))
df[num_cols].isnull().sum()[df[num_cols].isnull().sum() > 0]

Series([], dtype: int64)

In [24]:
train, test = df.iloc[:train.shape[0]], df.iloc[train.shape[0]:]
test = test.reset_index(drop=True)

cat_cols = ['Name', 'Genre', 'year_month']
num_cols = list(set(num_cols))
print(len(num_cols))

122


In [25]:
embed_cols = cat_cols
non_embed_cols = num_cols
nunique_dict = {}
for c in embed_cols:
    nunique_dict[c] = train[c].nunique()
    print(c + ': %d values' % nunique_dict[c])

Name: 1209 values
Genre: 21 values
year_month: 195 values


In [26]:
features = list(set(embed_cols + non_embed_cols))
len(features), df.shape

(125, (98073, 127))

In [27]:
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

def np_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


class PredictionCallback(tf.keras.callbacks.Callback):
    def __init__(self, metric_info, mode = 'min', lr_decay_rate = 1, max_patience = 2, inp_size=1, file_path = 'best_model.hdf5'):
        self.metric_name = metric_info[0]
        self.metric = metric_info[1]
        self.mode = mode
        self.results = []
        self.max_patience = max_patience
        self.patience = 0
        self.lr_decay_rate = lr_decay_rate
        self.file_path = file_path
        self.inp_size = inp_size
        
        
    def on_epoch_end(self, epoch, logs={}, **kwargs):
        y_pred = self.model.predict(self.validation_data[0: self.inp_size])
        y_true = self.validation_data[self.inp_size]
        
        res = self.metric(y_true, y_pred)
        
        if epoch == 0:
            print('->' * 22 + ' EPOCH : {}  *** {} *** : {}'.format(epoch + 1, self.metric_name, res))
#             print('\nSaving Model\n')
            self.model.save(self.file_path)
        
        else:
            best = min(self.results + [res]) if self.mode == 'min' else max(self.results + [res])
            print('->' * 22 + ' EPOCH : {}  *** {} *** : {} *** BEST *** : {}'.format(epoch + 1, self.metric_name, res, best))
            if (self.mode == 'min' and res >= min(self.results)) or (self.mode == 'max' and res <= max(self.results)):
                self.patience += 1

                if self.patience >= self.max_patience:
                    lr = self.model.optimizer.lr
                    new_lr = K.get_value(lr) * self.lr_decay_rate
                    K.set_value(lr, new_lr)
                    print("\nMetric did not improve for {} iterations. Changing learning rate to: {}\n".format(self.max_patience, new_lr))
                    self.patience = 0    
            else:
#                 print('\nSaving Model\n')
                self.model.save(self.file_path)
            
        self.results.append(res)
        print('')

In [28]:
def build_embedding_network(categorical_vars, max_embed_size=50):
    embeddings = []
    inputs = []

    for categorical_var in categorical_vars:    
        single_input = Input(shape=(1,))
        no_of_unique_cat  = nunique_dict[categorical_var]
        embedding_size = min(np.ceil((no_of_unique_cat)/2), max_embed_size)
        embedding_size = max(10, embedding_size)
        embedding_size = int(embedding_size)
        embedding = Embedding(no_of_unique_cat, embedding_size, input_length=1, trainable=True)(single_input)
        embedding = Reshape(target_shape=(embedding_size,))(embedding)
        inputs.append(single_input)
        embeddings.append(embedding)

    input_numeric = Input(shape=(len(non_embed_cols),))
    inputs.append(input_numeric)
    embedding_numeric = Dense(32)(input_numeric) 
    embeddings.append(input_numeric)
    
    x = Concatenate()(embeddings)
    for i in range(5):
        x = Dense(256*9)(x)
        x =  ReLU()(x)
        x = Dropout(0.27)(x)
    output = Dense(1, activation='linear')(x)
    model = Model(inputs, output)
    opt = Nadam(lr=1e-3)
    model.compile(loss=rmse, optimizer=opt)
    
    return model

In [29]:
fold_split_col = pd.Series(pd.factorize(train['Name'].astype('str') + train['year'].astype('str') + train['Genre'].astype('str'))[0])
max_iter = 7
folds = StratifiedKFold(n_splits=max_iter, random_state=1991)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
file_path = 'best_model.hdf5'


for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, fold_split_col)):
    print("fold n°{}".format(fold_))
    X_trn,  y_trn  = train.loc[trn_idx][features], target[trn_idx]
    X_val, y_val = train.loc[val_idx][features], target[val_idx]
    if fold_ < 2:
        continue
    val_maps = {}
    for c in embed_cols:
        val_maps[c] = pd.Series(np.unique(X_trn[c]))
        val_maps[c] = pd.Series(data = val_maps[c].index, index = val_maps[c].values)
    proc_X_trn = []
    for c in embed_cols:
        proc_X_trn.append(X_trn[c].map(val_maps[c]).values)
    proc_X_trn.append(X_trn[non_embed_cols].values)
    proc_X_val = []
    for c in embed_cols:
        proc_X_val.append(X_val[c].map(val_maps[c]).values)
    proc_X_val.append(X_val[non_embed_cols].values)
    proc_X_test = []
    for c in embed_cols:
        proc_X_test.append(test[c].map(val_maps[c]).values)
    proc_X_test.append(test[non_embed_cols].values)
    NN = build_embedding_network(embed_cols, 10)
    
    pcb = PredictionCallback(('RMSE', np_rmse), inp_size=len(proc_X_val), file_path = file_path, max_patience = 3, lr_decay_rate = 0.7)
    callbacks_list = [pcb]
    
    history = NN.fit(proc_X_trn, y_trn, batch_size=100, epochs=25, validation_data = [proc_X_val, y_val], callbacks=callbacks_list)
    NN = load_model(file_path,  custom_objects={'rmse':rmse})
    
    oof[val_idx] = NN.predict(proc_X_val, 256*64)[:,0]
    print("Fold score: {:<8.5f}".format(np.sqrt(mean_squared_error(y_val, oof[val_idx]))))
    
    current_pred = NN.predict(proc_X_test, 256*64)[:,0]
    print((current_pred < 0).sum())
    current_pred[current_pred < 0] = 0
    
    sub_df = pd.DataFrame()
    sub_df[ID_COL] = test[ID_COL]
    sub_df[TARGET_COL] = current_pred
    sub_df.to_excel('nn_ ' + str(fold_) + '.xlsx', index=False)
    
    break

fold n°0
fold n°1
fold n°2
Train on 66731 samples, validate on 11727 samples
Epoch 1/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 1  *** RMSE *** : 662757.0538417881

Epoch 2/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 2  *** RMSE *** : 589426.3334266266 *** BEST *** : 589426.3334266266

Epoch 3/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 3  *** RMSE *** : 764489.5944089866 *** BEST *** : 589426.3334266266

Epoch 4/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 4  *** RMSE *** : 605868.0387239312 *** BEST *** : 589426.3334266266

Epoch 5/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 5  *** RMSE *** : 553484.6254531835 *** BEST *** : 553484.6254531835

Epoch 6/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 6  *** RMSE *** : 549183.667323038 *** BEST *** : 549183.667323038

Epoch 7/25
->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 7  *** RMSE *** : 616467.7797778473 *** BEST *** : 549183.667323038

Metric did not improv

In [30]:
print((current_pred < 0).sum())
current_pred[current_pred < 0] = 0

0


In [31]:
print("Fold score: {:<8.5f}".format(np.sqrt(mean_squared_error(y_val, oof[val_idx]))))

Fold score: 379582.70857


In [32]:
sub_df.head(10)

Unnamed: 0,Unique_ID,Views
0,562546,180370.1875
1,907584,31119.177734
2,213013,12470.263672
3,340312,29852.271484
4,41854,16728.867188
5,1031846,5861.603027
6,627446,4253.577637
7,1509777,19816.429688
8,1270147,3305.786621
9,1486926,36350.460938
