In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tqdm import tqdm

In [2]:
train_data = pd.read_csv("train.csv")

members_data = pd.read_csv("members.csv")
songs_data = pd.read_csv("songs.csv")

## Данные сбалансированы

In [3]:
train_data['target'].value_counts()

1    3714656
0    3662762
Name: target, dtype: int64

## Делим на train-val-test без перемешивания, т.к. данные отсортированы по времени

In [4]:
test_size = 0.2
val_size = 0.2
train_size = 1 - test_size - val_size

train, val = train_test_split(train_data, train_size=train_size, shuffle=False, random_state=0)
val, test = train_test_split(val, train_size=val_size/(val_size+test_size), shuffle=False, random_state=0)

In [5]:
print(len(train), len(val), len(test))

4426450 1475484 1475484


In [6]:
# Function for encoding features with some dictionary
def encode_with_map(mapping, x):
    if x in mapping:
        return mapping[x]
    return -1

In [7]:
# Mapping msno to idx and vise-versa
idx_to_mnso = {}
msno_to_idx = {}

for idx, msno in enumerate(members_data['msno'].unique()):
    idx_to_mnso[idx] = msno
    msno_to_idx[msno] = idx

In [8]:
members_data['msno'] = members_data['msno'].apply(lambda x: encode_with_map(msno_to_idx, x))
train['msno'] = train['msno'].apply(lambda x: encode_with_map(msno_to_idx, x))
test['msno'] = test['msno'].apply(lambda x: encode_with_map(msno_to_idx, x))
val['msno'] = val['msno'].apply(lambda x: encode_with_map(msno_to_idx, x))

In [9]:
# Mapping song_id to idx and vise-versa
idx_to_song = {}
song_to_idx = {}

for idx, s_id in enumerate(songs_data['song_id'].unique()):
    idx_to_song[idx] = s_id
    song_to_idx[s_id] = idx

In [10]:
# Encoding with mapping
songs_data['song_id'] = songs_data['song_id'].apply(lambda x: encode_with_map(song_to_idx, x))
train['song_id'] = train['song_id'].apply(lambda x: encode_with_map(song_to_idx, x))
test['song_id'] = test['song_id'].apply(lambda x: encode_with_map(song_to_idx, x))
val['song_id'] = val['song_id'].apply(lambda x: encode_with_map(song_to_idx, x))

## Заполняем NaN в данных

In [11]:
source_system_tab_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

train['source_system_tab'] = source_system_tab_imputer.fit_transform(train[['source_system_tab']])
val['source_system_tab'] = source_system_tab_imputer.transform(val[['source_system_tab']])
test['source_system_tab'] = source_system_tab_imputer.transform(test[['source_system_tab']])

In [12]:
source_screen_name_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

train['source_screen_name'] = source_screen_name_imputer.fit_transform(train[['source_screen_name']])
val['source_screen_name'] = source_screen_name_imputer.transform(val[['source_screen_name']])
test['source_screen_name'] = source_screen_name_imputer.transform(test[['source_screen_name']])

In [13]:
source_type_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

train['source_type'] = source_type_imputer.fit_transform(train[['source_type']])
val['source_type'] = source_type_imputer.transform(val[['source_type']])
test['source_type'] = source_type_imputer.transform(test[['source_type']])

In [14]:
# Map NaN in gender to underfiend
members_data['gender'] = members_data['gender'].apply(lambda x: 'undefiend' if x is np.nan else x)

In [15]:
# Подразумеваю, что возраст меньше 10 не пользуется приложением для музыки (больше 100 явно выброс)
# Возраст энкодим так:
#    outlier (<10 or >100) -> 0
#    10-17    -> 1
#    18-29   -> 2
#    30-50   -> 3
#    51+     -> 4

def encode_bd(x):
    if x < 10 or x > 100:
        return 0
    if x >= 10 and x < 18:
        return 1
    if x >= 18 and x < 29:
        return 2
    if x >= 30 and x < 50:
        return 3
    return 4

members_data['bd'] = members_data['bd'].apply(encode_bd)

In [16]:
members_data

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,0,1,0,undefiend,7,20110820,20170920
1,1,1,0,undefiend,7,20150628,20170622
2,2,1,0,undefiend,4,20160411,20170712
3,3,1,0,undefiend,9,20150906,20150907
4,4,1,0,undefiend,4,20170126,20170613
...,...,...,...,...,...,...,...
34398,34398,1,0,undefiend,7,20131111,20170910
34399,34399,4,2,male,3,20141024,20170518
34400,34400,1,0,undefiend,7,20130802,20170908
34401,34401,1,0,undefiend,7,20151020,20170920


In [17]:
train

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,21893,1391177,explore,Explore,online-playlist,1
1,26052,1584026,my library,Local playlist more,local-playlist,1
2,26052,456722,my library,Local playlist more,local-playlist,1
3,26052,1072225,my library,Local playlist more,local-playlist,1
4,21893,518476,explore,Explore,online-playlist,1
...,...,...,...,...,...,...
4426445,6976,28167,discover,Local playlist more,local-library,1
4426446,6976,136481,discover,Discover New,online-playlist,1
4426447,6976,143359,discover,Discover New,online-playlist,1
4426448,21998,564633,discover,Local playlist more,top-hits-for-artist,0


In [18]:
# NaN language map to -1 (can be case with no lyrics also)
songs_data['language'][songs_data['language'].isna()] = -1
songs_data['language'] = songs_data['language'].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_data['language'][songs_data['language'].isna()] = -1


In [19]:
# NaN genre map to -1
songs_data['genre_ids'][songs_data['genre_ids'].isna()] = -1
# NaN string map to empty strings
songs_data['composer'][songs_data['composer'].isna()] = ''
songs_data['lyricist'][songs_data['lyricist'].isna()] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_data['genre_ids'][songs_data['genre_ids'].isna()] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_data['composer'][songs_data['composer'].isna()] = ''
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_data['lyricist'][songs_data['lyricist'].isna()] = ''


In [20]:
songs_data

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,0,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3
1,1,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31
2,2,231781,465,SUPER JUNIOR,,,31
3,3,273554,465,S.H.E,湯小康,徐世珍,3
4,4,140329,726,貴族精選,Traditional,Traditional,52
...,...,...,...,...,...,...,...
2296315,2296315,20192,958,Catherine Collard,Robert Schumann (1810-1856),,-1
2296316,2296316,273391,465,紀文惠 (Justine Chi),,,3
2296317,2296317,445172,1609,Various Artists,,,52
2296318,2296318,172669,465,Peter Paul & Mary,,,52


In [21]:
# Split genre ids if several genres in the songs
genre_ids = []
for x in [str(x).split('|') for x in songs_data['genre_ids'].unique()]:
    genre_ids += x
genre_ids = [int(x) for x in genre_ids]
genre_ids = sorted(list(set(genre_ids)))

## Тут я хотел сначала энкодить через Bert, но данных слишком много и эмбеддинги выходят в 3-4гб на диске

In [22]:
#from transformers import BertTokenizer, BertModel
#import torch

In [23]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
#tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
#model = BertModel.from_pretrained("bert-base-chinese").to(device)

In [25]:
"""
text_corpus = list(songs_data['artist_name'])

embeddings = np.zeros((len(text_corpus), 768), dtype=np.float16)

batch_size = 64
for idx in tqdm(range(0, len(text_corpus), batch_size), total=len(text_corpus)//batch_size):
    end_idx = min(idx + batch_size, len(text_corpus) - 1)
    
    tokens = tokenizer.batch_encode_plus(text_corpus[idx:end_idx], max_length=20, pad_to_max_length=True)
    
    inp = torch.tensor(tokens['input_ids']).to(device)
    token_types = torch.tensor(tokens['token_type_ids']).to(device)
    attn_mask = torch.tensor(tokens['attention_mask']).to(device)
    
    emb = model(inp, token_type_ids=token_types, attention_mask=attn_mask)[0].detach().mean(axis=1).cpu()
    embeddings[idx:end_idx] = emb.numpy().astype(np.float16)
"""

"\ntext_corpus = list(songs_data['artist_name'])\n\nembeddings = np.zeros((len(text_corpus), 768), dtype=np.float16)\n\nbatch_size = 64\nfor idx in tqdm(range(0, len(text_corpus), batch_size), total=len(text_corpus)//batch_size):\n    end_idx = min(idx + batch_size, len(text_corpus) - 1)\n    \n    tokens = tokenizer.batch_encode_plus(text_corpus[idx:end_idx], max_length=20, pad_to_max_length=True)\n    \n    inp = torch.tensor(tokens['input_ids']).to(device)\n    token_types = torch.tensor(tokens['token_type_ids']).to(device)\n    attn_mask = torch.tensor(tokens['attention_mask']).to(device)\n    \n    emb = model(inp, token_type_ids=token_types, attention_mask=attn_mask)[0].detach().mean(axis=1).cpu()\n    embeddings[idx:end_idx] = emb.numpy().astype(np.float16)\n"

## Энкодим

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dinislam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dinislam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
def preprocess_corpus(corpus, col):
    
    stop = set(stopwords.words('english') + list(string.punctuation))
    
    def process_string(s):
        return " ".join([i for i in word_tokenize(s.lower()) if i not in stop])
    
    return corpus[col].apply(process_string)

In [28]:
%timeit preprocess_corpus(songs_data.iloc[0:100000][['artist_name']], 'artist_name')

8.52 s ± 391 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
songs_data['artist_name'] = preprocess_corpus(songs_data[['artist_name']], 'artist_name')

In [30]:
songs_data['composer'] = preprocess_corpus(songs_data[['composer']], 'composer')

In [31]:
songs_data['lyricist'] = preprocess_corpus(songs_data[['lyricist']], 'lyricist')

In [32]:
songs_data

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,0,247640,465,張信哲 jeff chang,董貞,何啟弘,3
1,1,197328,444,blackpink,teddy| future bounce| bekuh boom,teddy,31
2,2,231781,465,super junior,,,31
3,3,273554,465,s.h.e,湯小康,徐世珍,3
4,4,140329,726,貴族精選,traditional,traditional,52
...,...,...,...,...,...,...,...
2296315,2296315,20192,958,catherine collard,robert schumann 1810-1856,,-1
2296316,2296316,273391,465,紀文惠 justine chi,,,3
2296317,2296317,445172,1609,various artists,,,52
2296318,2296318,172669,465,peter paul mary,,,52


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = list(songs_data['artist_name'])
vectorizer = TfidfVectorizer()
artist_name_tfidf = vectorizer.fit_transform(corpus)

In [34]:
corpus = list(songs_data['composer'])
vectorizer = TfidfVectorizer()
composer_tfidf = vectorizer.fit_transform(corpus)

In [35]:
corpus = list(songs_data['lyricist'])
vectorizer = TfidfVectorizer()
lyricist_tfidf = vectorizer.fit_transform(corpus)

In [36]:
import scipy as sp
sp.sparse.save_npz('artist_name_tfidf.npz', artist_name_tfidf)
sp.sparse.save_npz('composer_tfidf.npz', composer_tfidf)
sp.sparse.save_npz('lyricist_tfidf.npz', lyricist_tfidf)

In [37]:
train.to_csv('train_processed.csv')
test.to_csv('test_processed.csv')
val.to_csv('val_processed.csv')
songs_data.to_csv('songs_data_processed.csv')
members_data.to_csv('members_data_processed.csv')