In [87]:
# Jieyu Chen is responsible for this code
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import math
from datetime import datetime
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [88]:
# Load the raw dataset; the first CSV column is used as the row index.
RAW_DATA_PATH = 'all_data_raw.csv'
df = pd.read_csv(RAW_DATA_PATH, index_col=0)

In [89]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,play_count,energy,liveness,tempo,speechiness,acousticness,instrumentalness,time_signature,danceability,key,duration_ms,loudness,valence,mode,artist_name,song_name,artist_followers,artist_popularity,artist_genres,disc_number,track_id,explicit,country,album_release_date,album_total_tracks,album_available_markets,listener_count
0,57733.0,0.833,0.0659,125.07,0.0402,0.0156,0.0,4.0,0.556,5.0,224746,-5.29,0.374,1.0,ONE OK ROCK,Wasted Nights,2381898.0,72.0,"['j-pop', 'j-rock', 'visual kei']",1,51Jyu0jAfvawonJVbkCSuN,False,US,2019-06-14,35,1,8504.0
1,,0.861,0.204,155.844,0.0596,0.00123,0.789,4.0,0.73,5.0,184806,-7.228,0.876,0.0,Holladay,Ghetto (feat. Young Drummer Boy) [Instrumental],1008.0,21.0,['bedroom soul'],1,0JD6RB1CtMPzO5jn0rWdhL,False,US,2016-04-01,5,79,
2,208750.0,0.637,0.102,128.063,0.0318,0.145,0.0,4.0,0.448,11.0,208813,-6.369,0.343,1.0,Martin Garrix,No Sleep (feat. Bonn),11759320.0,85.0,"['big room', 'edm', 'pop', 'progressive house'...",1,1xc4v8WOttFgzZpkaiVCRz,False,NL,2019-05-10,17,1,42819.0
3,36471.0,0.426,0.106,169.986,0.0416,0.919,0.0673,4.0,0.341,0.0,420012,-11.194,0.187,1.0,Miracle Musical,Dream Sweet in Sea Major,24185.0,50.0,[],1,3RznzRnsl8mzP63l4AF2M7,False,TC,2012,11,79,6606.0
4,1076.0,0.41,0.125,184.015,0.0373,0.0688,9.7e-05,4.0,0.641,0.0,171093,-4.851,0.75,1.0,Plastic Plastic,Gardening,18150.0,60.0,['experimental rock'],1,4k9Ls1K7hKfh2yJSeKJIVX,False,JP,2019-03-20,11,1,274.0


# Categorical Data 

In [90]:
# Encode the `explicit` flag as an integer: True -> 1, anything else -> 0
# (values that do not compare equal to True, including missing ones, become 0).
is_explicit = df['explicit'].eq(True)
df['explicit'] = is_explicit * 1

In [91]:
# Remove two tracks from the dataset so they can be used as a later test case.
# They are 手写的从前 by Jay Chou and "All I Want for Christmas Is You".
HELD_OUT_TRACK_IDS = ['7Kmdy3SkmtDWZyAPccrFVd', '5YAuUz0Nagt9QxYheiQ9zk']
df = df[~df['track_id'].isin(HELD_OUT_TRACK_IDS)]

# Missing Values
### 1. Drop the data point if certain features are missing

In [92]:
# Discard rows whose release date is missing or is the placeholder '0000',
# then remove the `artist_genres` column altogether.
df = (
    df.dropna(subset=['album_release_date'])
      .loc[lambda frame: frame['album_release_date'] != '0000']
      .drop(columns=['artist_genres'])
)

### 2. Impute the average value computed from the whole dataset for the variable artist_followers

In [93]:
# Replace missing follower counts with the mean of the observed values.
# NOTE(review): the mean is taken over the whole dataset, i.e. before the
# train/dev/test split below - confirm this is acceptable for evaluation.
mean_followers = df['artist_followers'].mean()
df['artist_followers'] = df['artist_followers'].fillna(mean_followers)

# Meta-Data
### Drop some meta-data such as artists' names, track IDs and names of songs.

In [94]:
# Identifier-like columns that are removed from the feature set.
METADATA_COLUMNS = ['artist_name', 'track_id', 'song_name']
df = df.drop(columns=METADATA_COLUMNS)

# Time Data

### If both the month and the day are missing, set the date to January 1st of that year.
### If only the day is missing, set it to the first day of that month.

In [95]:
def _pad_release_date(raw):
    """Return ``raw`` extended to a full ``YYYY-MM-DD`` style string.

    Parameters
    ----------
    raw : str
        A release date given as year only, year-month, or year-month-day,
        with ``-`` as the separator.

    Returns
    -------
    str
        ``raw + '-01-01'`` when there is no separator (year only),
        ``raw + '-01'`` when there is exactly one separator (year and month),
        otherwise ``raw`` unchanged.
    """
    # Decide by the number of separators rather than by a fixed character
    # position, so short or non-zero-padded values cannot raise IndexError
    # or be padded twice.
    separators = raw.count('-')
    if separators == 0:
        return raw + '-01-01'
    if separators == 1:
        return raw + '-01'
    return raw


df['album_release_date'] = df['album_release_date'].apply(_pad_release_date)

### Convert the data to datetime type.
### Change the release date to an integer that represents the number of days between today and the release date.

In [96]:
# Convert each 'YYYY-MM-DD' string into the number of days elapsed since that
# date, stored in the smallest integer dtype that fits.
# The reference date is captured once so every row is measured against the
# same day, even if the cell happens to run across midnight.
# NOTE: the result depends on the day the notebook is executed.
reference_day = datetime.today().date()
release_days = df['album_release_date'].apply(
    lambda text: datetime.strptime(text, '%Y-%m-%d').date()
)
# Plain integer day counts avoid relying on pandas inferring a timedelta dtype
# for the intermediate column.
age_in_days = release_days.apply(lambda day: (reference_day - day).days)
df['album_release_date'] = pd.to_numeric(age_in_days, downcast='integer')

### Shuffle the data

In [97]:
# Shuffle the rows before the positional train/dev/test split below.
# A fixed seed makes the shuffle - and therefore the split - reproducible
# across kernel restarts.
SHUFFLE_SEED = 42
df = shuffle(df, random_state=SHUFFLE_SEED)

# Get the dataset without the "country" variable. 
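### To build this variant, uncomment the following line and run it, then skip the "with country" cell below (that cell requires the `country` column).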

In [98]:
# df.drop('country', inplace=True, axis=1)

# Get the dataset with the "country" variable

In [99]:
# Keep only rows whose country code is present and purely alphabetic.
# (An alphabetic-only check already excludes everything an alphanumeric check
# would exclude, so a single filter is sufficient.)
df = df.dropna(subset=['country'])
# .copy() so the column assignment below works on an independent frame
# instead of a filtered view (avoids SettingWithCopyWarning).
df = df[df['country'].str.isalpha()].copy()

# Normalise the codes to upper case, e.g. "us" -> "US", "cn" -> "CN".
df['country'] = df['country'].str.upper()

# One-hot encode the categorical country code. The "country_" prefix makes the
# generated columns easy to recognise, e.g. "country_US".
country = pd.get_dummies(df['country'], prefix='country')

# Replace the original column with the indicator columns (appended at the end).
df = pd.concat([df.drop(columns=['country']), country], axis=1)

### Do train, development and test set split in a ratio of 7:2:1.
### Sort each dataset in descending order of the album release date value (days since release), so that the songs in each dataset are ordered from the oldest release to the latest.
### Reset the index in each dataset. drop=True because we do not want to insert the original index into the dataframe columns. This resets the index to the default integer index.

In [100]:
def _sorted_partition(frame):
    """Return ``frame`` sorted by ``album_release_date`` (descending) with a fresh 0..n-1 index.

    A new DataFrame is returned; ``frame`` itself is not modified.
    """
    return (
        frame.sort_values(by=['album_release_date'], ascending=False)
             .reset_index(drop=True)
    )


# Positional 7:2:1 split of the (already shuffled) rows.
n_rows = len(df)
train_end = n_rows * 7 // 10
dev_end = n_rows * 9 // 10

# All columns of `df` are kept, in their current order.
# NOTE(review): if the downstream models expect a specific column subset or
# order, select it here explicitly - no such list is defined in this notebook.
train = _sorted_partition(df.iloc[:train_end])
dev = _sorted_partition(df.iloc[train_end:dev_end])
test = _sorted_partition(df.iloc[dev_end:])

# The three partitions must cover every row exactly once.
assert len(train) + len(dev) + len(test) == n_rows

### Convert the train, development and test sets without countries to .csv files

In [None]:
# Write the "no countries" variant of the three splits.
# Guard: only export when the frames really contain no country information,
# so a Run All on the "with countries" variant cannot produce mislabelled files.
country_like_columns = [
    name for name in train.columns
    if str(name) == 'country' or str(name).startswith('country_')
]
if country_like_columns:
    print('Skipping *_no_countries.csv: the current splits still contain country columns.')
else:
    train.to_csv('../dataset/train_no_countries.csv')
    dev.to_csv('../dataset/dev_no_countries.csv')
    test.to_csv('../dataset/test_no_countries.csv')

### Convert the train, development and test sets with countries to .csv files

In [None]:
# Write the "with countries" variant of the three splits.
# Guard: only export when the one-hot "country_*" columns are present, so the
# files are never written from frames that had the country variable removed.
has_country_columns = any(str(name).startswith('country_') for name in train.columns)
if has_country_columns:
    train.to_csv('../dataset/train_with_countries.csv')
    dev.to_csv('../dataset/dev_with_countries.csv')
    test.to_csv('../dataset/test_with_countries.csv')
else:
    print('Skipping *_with_countries.csv: the current splits contain no one-hot country columns.')