In [1]:
import gc

import re

import numpy as np
import pandas as pd

## Featurization

In [2]:
# Load the four raw CSV tables, in the same order as before.
train, members, songs, song_extra_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/members.csv',
        './data/songs.csv',
        './data/song_extra_info.csv',
    )
)

In [3]:
# Attach the extra per-song columns; the left join keeps every row of `songs`.
songs = pd.merge(songs, song_extra_info, how='left', on='song_id')

In [4]:
# `song_extra_info` has already been merged into `songs`; drop the name and
# run a garbage-collection pass (the cell displays gc.collect()'s return value).
del song_extra_info
gc.collect()

0

In [5]:
# For getting nth genre/artist/composer/lyricist with possible missing values
def get_nth(list_, n):
    """Return ``list_[n]``, or ``'__MISSING__'`` when index ``n`` is past the end.

    Parameters
    ----------
    list_ : sequence
        Sequence to index into (anything supporting ``len`` and ``[]``).
    n : int
        Zero-based position to fetch.

    Returns
    -------
    object
        The element at position ``n``, or the string ``'__MISSING__'`` if
        ``list_`` has ``n`` or fewer elements.
    """
    return list_[n] if n < len(list_) else '__MISSING__'

In [6]:
def featurize_songs(songs):
    """Turn the raw song table into model features, modifying ``songs`` in place.

    What is done, in order:

    * ``song_id`` is cast to ``category`` and ``song_length`` to ``int32``.
    * ``genre_ids`` is renamed to ``genre_id``.
    * For each of ``genre_id``, ``artist_name``, ``composer`` and ``lyricist``
      the raw value is split on runs of ``/``, ``\\``, ``|`` or ``;`` (plus any
      non-word characters that follow). The 1st, 2nd and 3rd pieces are stored
      as ``1st_<col>``, ``2nd_<col>``, ``3rd_<col>`` (``category``) and the
      number of pieces as ``n_<col>s`` (``uint8``). Missing raw values become
      ``'__MISSING__'`` and are counted as 0 pieces.
    * ``isrc`` is sliced into ``country_code`` ([:2]), ``registrant_code``
      ([2:5]), ``year_of_reference`` ([5:7]) and ``designation_code`` ([7:]),
      each ``category``; a missing ISRC gives ``'__MISSING__'`` in all four.
    * ``language`` is shifted by +1 (NaN becomes 0) and cast to ``uint8``.
    * The raw ``name``, ``isrc`` and the four multi-valued columns are dropped.

    Relies on the notebook-level helper ``get_nth``.

    Parameters
    ----------
    songs : pandas.DataFrame
        Must contain ``song_id``, ``song_length``, ``genre_ids``,
        ``artist_name``, ``composer``, ``lyricist``, ``language``, ``name``
        and ``isrc``.

    Returns
    -------
    None
        The frame passed in is mutated.
    """
    songs['song_id'] = songs['song_id'].astype('category')
    songs['song_length'] = songs['song_length'].astype(np.int32)

    # Raw string: same pattern as before, but without the invalid string
    # escapes (\/ \| \; \W) that trigger SyntaxWarning on recent Pythons.
    split_pattern = re.compile(r'[/\\|;]+\W*')

    songs.rename({'genre_ids': 'genre_id'}, axis=1, inplace=True)
    cols_to_drop = ['name', 'isrc']

    # Get 1st, 2nd, 3rd id/name and count and convert it to "category" dtype
    for col in ('genre_id', 'artist_name', 'composer', 'lyricist'):
        cols_to_drop.append(col)
        songs[col] = songs[col].fillna('__MISSING__')

        # Split each value once and reuse the pieces for all derived columns
        # (previously the same regex split ran four times per value).
        parts = songs[col].apply(split_pattern.split)
        for prefix, position in (('1st_', 0), ('2nd_', 1), ('3rd_', 2)):
            songs[prefix + col] = parts.apply(
                lambda pieces, position=position: get_nth(pieces, position)
            ).astype('category')

        # The placeholder splits into one piece; subtract it so that a
        # missing value is counted as zero entries.
        is_missing = songs[col] == '__MISSING__'
        songs['n_' + col + 's'] = (parts.apply(len) - is_missing).astype(np.uint8)

    # https://en.wikipedia.org/wiki/International_Standard_Recording_Code
    # After astype(str) a missing ISRC is the literal string 'nan'.
    isrc = songs['isrc'].astype(str)
    isrc_fields = (
        ('country_code', slice(0, 2)),
        ('registrant_code', slice(2, 5)),
        ('year_of_reference', slice(5, 7)),
        ('designation_code', slice(7, None)),
    )
    for field_name, field_slice in isrc_fields:
        songs[field_name] = isrc.apply(
            lambda code, field_slice=field_slice: (
                code[field_slice] if code != 'nan' else '__MISSING__'
            )
        ).astype('category')

    # Make labels start from 0 for LightGBM
    songs['language'] = (songs['language'].fillna(-1) + 1).astype(np.uint8)

    songs.drop(cols_to_drop, axis=1, inplace=True)
    

In [7]:
def featurize_members(members):
    """Turn the raw member table into model features, modifying ``members`` in place.

    * ``bd`` is bucketed with the edges 18 / 30 / 50 into ``age_group``
      (``uint8`` codes 0-3, i.e. four groups; values outside a plausible age
      range simply land in the first or last bucket).
    * Missing ``gender`` becomes ``'__MISSING__'``.
    * ``msno``, ``city``, ``gender`` and ``registered_via`` are cast to
      ``category``.
    * ``registration_init_time`` and ``expiration_date`` are reduced to their
      year and stored as ``registration`` / ``expiration`` (``uint16``).
    * ``bd`` and the two raw date columns are dropped.

    Parameters
    ----------
    members : pandas.DataFrame
        Must contain the columns named above; the date columns are expected
        to be numeric in YYYYMMDD form, since the year is taken by floor
        division by 1e4.

    Returns
    -------
    None
        The frame passed in is mutated.
    """
    age_edges = [18, 30, 50]
    members['age_group'] = np.digitize(members['bd'], age_edges).astype(np.uint8)

    # Replace NaNs
    members['gender'] = members['gender'].fillna('__MISSING__')

    # Convert to "category" dtype
    for name in ('msno', 'city', 'gender', 'registered_via'):
        members[name] = members[name].astype('category')

    # Keep year only to prevent overfitting
    date_cols = ['registration_init_time', 'expiration_date']
    for name in date_cols:
        year = members[name] // 1e4
        # New column is named after the part before the first underscore.
        short_name = name.split('_', 1)[0]
        members[short_name] = year.astype(np.uint16)

    members.drop(['bd'] + date_cols, axis=1, inplace=True)

In [8]:
def featurize_train(train):
    """Fill NaNs and downcast the interaction columns of ``train`` in place.

    ``msno``, ``song_id``, ``source_system_tab``, ``source_screen_name`` and
    ``source_type`` get their missing values replaced by ``'__MISSING__'`` and
    are cast to ``category``; ``target`` is cast to ``uint8``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the six columns named above.

    Returns
    -------
    None
        The frame passed in is mutated.
    """
    categorical = (
        'msno',
        'song_id',
        'source_system_tab',
        'source_screen_name',
        'source_type',
    )
    for name in categorical:
        filled = train[name].fillna('__MISSING__')
        train[name] = filled.astype('category')

    train['target'] = train['target'].astype(np.uint8)

In [9]:
# Both helpers return nothing and modify the frame they are given
# (derived columns are added, raw source columns are dropped).
featurize_songs(songs)
featurize_members(members)

In [10]:
# Left-join the song features on 'song_id', then the member features on 'msno',
# so every row of `train` is kept even when no match exists.
train = train.merge(songs, on='song_id', how='left')
train = train.merge(members, on='msno', how='left')

In [11]:
# `songs` and `members` have been merged into `train`; drop the names and
# run a garbage-collection pass (the cell displays gc.collect()'s return value).
del songs, members
gc.collect()

0

In [12]:
# Applied to the merged frame: fills NaNs in the interaction columns,
# casts them to category, and casts `target` to uint8 (mutates `train`).
featurize_train(train)

In [13]:
# Move 'target' to the last column position: take it out, then append it back.
target = train.pop('target')
train['target'] = target

In [28]:
# Discard every row that still has a missing value in any column
# (the original index labels are kept, so the index is no longer contiguous).
train = train.dropna()

In [37]:
# Cast the numeric song columns to compact unsigned integer dtypes.
cols = ['language', 'n_genre_ids', 'n_artist_names', 'n_composers', 'n_lyricists']
train = train.astype({'song_length': np.uint32, **dict.fromkeys(cols, np.uint8)})

In [40]:
# Number of rows that are exact duplicates of an earlier row (all columns equal).
train.duplicated().sum()  # duplicates check

0

In [42]:
# dtypes change after merge, so make sure both join keys are "category" again.
# NOTE(review): featurize_train already casts these columns after the merge,
# so this may be redundant in a fresh top-to-bottom run; confirm.
train = train.astype({'msno': 'category', 'song_id': 'category'})

In [43]:
# Processed dataset: 33 features and 1 target (34 columns in total)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377304 entries, 0 to 7377417
Data columns (total 34 columns):
 #   Column              Dtype   
---  ------              -----   
 0   msno                category
 1   song_id             category
 2   source_system_tab   category
 3   source_screen_name  category
 4   source_type         category
 5   song_length         uint32  
 6   language            uint8   
 7   1st_genre_id        category
 8   2nd_genre_id        category
 9   3rd_genre_id        category
 10  n_genre_ids         uint8   
 11  1st_artist_name     category
 12  2nd_artist_name     category
 13  3rd_artist_name     category
 14  n_artist_names      uint8   
 15  1st_composer        category
 16  2nd_composer        category
 17  3rd_composer        category
 18  n_composers         uint8   
 19  1st_lyricist        category
 20  2nd_lyricist        category
 21  3rd_lyricist        category
 22  n_lyricists         uint8   
 23  country_code        category
 24

In [44]:
# Save featurized dataset as parquet to save dtypes info
# (a CSV round-trip would not keep the category / small-integer dtypes).
train.to_parquet('./data/featurized.parquet')