In [1]:
import collections
import os
import multiprocessing
import sys
from datetime import datetime

import shap
import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn.metrics
import ml_metrics
import tqdm
from gensim.models import Word2Vec

In [2]:
! pwd

/c/homeworks/hse_recsys2020/HomeWorks


In [3]:
! ls ../data

members.csv
sample_submission.csv
song_extra_info.csv
songs.csv
test.csv
train.csv


In [4]:
train_df = pd.read_csv('../data/train.csv', dtype={
  'msno': 'category',
  'song_id': 'category',
  'source_system_tab': 'category',
  'source_screen_name': 'category',
  'source_type': 'category',
  'target': np.float,
})
songs_df = pd.read_csv('../data/songs.csv', dtype={
  'song_id': 'category',
  'song_length': np.int,
  'genre_ids': 'category',
  'artist_name': 'category',
  'composer': 'category',
  'lyricist': 'category',
  'language': 'category',
})
members_df = pd.read_csv('../data/members.csv', dtype={
  'msno': 'category',
  'city': 'category',
  'bd': np.int,
  'gender': 'category',
  'registered_via': 'category',
  'registration_init_time': str,
  'expiration_date': str,
})
song_extra_info_df = pd.read_csv('../data/song_extra_info.csv', dtype={
  'song_id': 'category',
  'name': 'category',
  'isrc': 'category',
})

In [5]:
def split_to_date(df, col, col_year, col_month, col_day):
    df[col_year] = df[col].apply(lambda date: int(date[0:4]))
    df[col_month] = df[col].apply(lambda date: int(date[4:6]))
    df[col_day] = df[col].apply(lambda date: int(date[6:8]))
    df.drop([col], axis=1)

In [6]:
def isrc_to_year(isrc):
    if isrc != "":
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

In [7]:
split_to_date(members_df, 'registration_init_time', 'registration_year', 'registration_month', 'registration_day')
split_to_date(members_df, 'expiration_date', 'expiration_year', 'expiration_month', 'expiration_day')

In [8]:
song_extra_info_df['song_year'] = song_extra_info_df['isrc'].apply(isrc_to_year)
song_extra_info_df.drop(['isrc', 'name'], axis=1, inplace=True)

In [9]:
song_columns_to_use = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
train_df = train_df.merge(songs_df[song_columns_to_use], on='song_id', how='left')

In [10]:
train_df = train_df.merge(members_df, on='msno', how='left')

In [11]:
train_df = train_df.merge(song_extra_info_df, on='song_id', how='left')

In [12]:
train_df.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,artist_name,genre_ids,song_length,language,...,registered_via,registration_init_time,expiration_date,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day,song_year
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1.0,Bastille,359,206471.0,52.0,...,7,20120102,20171005,2012,1,2,2017,10,5,2016.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1.0,Various Artists,1259,284584.0,52.0,...,9,20110525,20170911,2011,5,25,2017,9,11,1999.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1.0,Nas,1259,225396.0,52.0,...,9,20110525,20170911,2011,5,25,2017,9,11,2006.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1.0,Soundway,1019,255512.0,-1.0,...,9,20110525,20170911,2011,5,25,2017,9,11,2010.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1.0,Brett Young,1011,187802.0,52.0,...,7,20120102,20171005,2012,1,2,2017,10,5,2016.0


In [13]:
list(train_df.columns.values)

['msno',
 'song_id',
 'source_system_tab',
 'source_screen_name',
 'source_type',
 'target',
 'artist_name',
 'genre_ids',
 'song_length',
 'language',
 'city',
 'bd',
 'gender',
 'registered_via',
 'registration_init_time',
 'expiration_date',
 'registration_year',
 'registration_month',
 'registration_day',
 'expiration_year',
 'expiration_month',
 'expiration_day',
 'song_year']

In [14]:
len(set(train_df['genre_ids'].values))

573

In [15]:
len(set(train_df['song_id'].values))

359966

In [16]:
import catboost

In [17]:
X_train = train_df.drop(['target'], axis=1)
Y_train = train_df['target']

In [27]:
X_train['language'] = X_train['language'].astype('float').fillna(0).astype('int')
for column in ['source_system_tab', 'source_screen_name', 'source_type', 'artist_name', 'genre_ids', 'language', 'city', 'bd', 'gender', 'registered_via']:
    X_train[column] = X_train[column].fillna(X_train[column].mode(), inplace=True)

In [29]:
X_train.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'artist_name', 'genre_ids', 'song_length', 'language',
       'city', 'bd', 'gender', 'registered_via', 'registration_init_time',
       'expiration_date', 'registration_year', 'registration_month',
       'registration_day', 'expiration_year', 'expiration_month',
       'expiration_day', 'song_year'],
      dtype='object')

In [28]:
print(list(X_train.isnull().values.any(axis=0)))

[False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, True]


In [22]:
X_train['language'].dtype

dtype('int32')

In [23]:
for col in X_train.columns:
    if X_train[col].dtype == object:
        X_train[col] = X_train[col].astype('category')
X_train['language'] = X_train['language'].fillna(0).astype('int')

In [24]:
train_df.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,artist_name,genre_ids,song_length,language,...,registered_via,registration_init_time,expiration_date,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day,song_year
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1.0,Bastille,359,206471.0,52.0,...,7,20120102,20171005,2012,1,2,2017,10,5,2016.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1.0,Various Artists,1259,284584.0,52.0,...,9,20110525,20170911,2011,5,25,2017,9,11,1999.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1.0,Nas,1259,225396.0,52.0,...,9,20110525,20170911,2011,5,25,2017,9,11,2006.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1.0,Soundway,1019,255512.0,-1.0,...,9,20110525,20170911,2011,5,25,2017,9,11,2010.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1.0,Brett Young,1011,187802.0,52.0,...,7,20120102,20171005,2012,1,2,2017,10,5,2016.0


In [25]:
model = catboost.CatBoostClassifier(iterations=100, verbose=False, depth=16)

In [None]:
pool = catboost.Pool(X_train, label=Y_train, cat_features=[0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12])
model.fit(pool)