# Generating rating sequences with pandas

In [1]:
import pandas as pd

In [2]:
# Load the full cleaned anime-list export into a single DataFrame.
df = pd.read_csv(filepath_or_buffer='archive/animelists_cleaned.csv')

In [3]:
# Dimensions (rows, columns) of the frame as loaded from the CSV.
df.shape

(31284030, 11)

In [4]:
# Columns needed to build the per-user rating sequences; every other
# column present in the frame is collected into `useless_cols` for removal.
useful_cols = ['username', 'anime_id', 'my_score', 'my_last_updated']
column_list = df.columns
useless_cols = [name for name in column_list if name not in useful_cols]

In [5]:
# Remove, in place, every column that is not listed in `useful_cols`.
df.drop(labels=useless_cols, axis='columns', inplace=True)

In [6]:
# Draw 0.04% of the rows (with replacement, fixed seed) and keep the
# distinct usernames that occur in that draw.
sampled_rows = df.sample(frac=0.0004, replace=True, random_state=0)
username_list = sampled_rows['username'].unique()

In [7]:
# Number of distinct usernames picked up by the row sample.
len(username_list)

11155

In [8]:
# Hold the sampled usernames in a set for the membership filter that follows.
username_set = {name for name in username_list}

In [9]:
# Keep every row belonging to one of the sampled users.
df = df.loc[df['username'].isin(username_set)]

In [10]:
# Dimensions after restricting the frame to the sampled users.
df.shape

(6459844, 4)

In [11]:
# Preview the first five rows of the user-filtered frame.
df.head(5)

Unnamed: 0,username,anime_id,my_score,my_last_updated
1200,MistButterfly,21,0,2018-04-04 20:55:34
1201,MistButterfly,59,6,2013-12-23 22:27:59
1202,MistButterfly,74,7,2013-04-26 13:54:21
1203,MistButterfly,120,8,2013-04-26 13:54:11
1204,MistButterfly,178,5,2015-08-01 12:50:12


In [12]:
# Express each last-update datetime as whole seconds elapsed since 1970-01-01.
df['time_stamp'] = (
    pd.to_datetime(df['my_last_updated'])
    .sub(pd.Timestamp("1970-01-01"))
    .floordiv(pd.Timedelta('1s'))
)

In [13]:
# Preview the frame again to check the newly added time_stamp column.
df.head()

Unnamed: 0,username,anime_id,my_score,my_last_updated,time_stamp
1200,MistButterfly,21,0,2018-04-04 20:55:34,1522875334
1201,MistButterfly,59,6,2013-12-23 22:27:59,1387837679
1202,MistButterfly,74,7,2013-04-26 13:54:21,1366984461
1203,MistButterfly,120,8,2013-04-26 13:54:11,1366984451
1204,MistButterfly,178,5,2015-08-01 12:50:12,1438433412


In [14]:
# The datetime string is no longer needed once time_stamp exists; drop it in place.
df.drop(labels=['my_last_updated'], axis='columns', inplace=True)

In [15]:
# Retain only the rows whose my_score is strictly greater than 5.
df = df.loc[df['my_score'].gt(5)]

In [16]:
# Dimensions after keeping only rows with my_score above 5.
df.shape

(3346868, 4)

In [19]:
# Cast the ids to strings so that they can be concatenated with ' '.join
# when the per-user sequences are built.
df['anime_id'] = df['anime_id'].astype('str')

In [21]:
# Build one space-separated anime_id sequence per user, ordered by time_stamp
# (oldest first).
# A stable sort (mergesort) is requested explicitly: the default quicksort does
# not guarantee the relative order of rows that share the same time_stamp, so
# tied ratings would otherwise be emitted in an unspecified order. With a
# stable sort, ties keep the order they have in `df`.
df_agg = (
    df.sort_values(['time_stamp'], ascending=True, kind='mergesort')
    .groupby(['username'])['anime_id']
    .apply(' '.join)
    .reset_index()
)

In [22]:
# Display the aggregated frame: one row per username with its joined anime_id sequence.
df_agg

Unnamed: 0,username,anime_id
0,--kitade--,50 49 303 880 304 53 102 4744 6987 8577 3549 5...
1,-Alberto-,356 6922 11741 10087 14829 20853 17265 527 528...
2,-Amaya--,31859 31043 17265 23321 30831 19815 11757 2188...
3,-Angel-,129 3193 2508 1281 238 289 846 390 170 127 290...
4,-AsakuraShin-,4186 1575 2904 2167 4181 2026 4192 4975 62 521...
...,...,...
10935,zonnikku,1074 22 552 572 874 986 895 905 906 31368 985 ...
10936,zra319,10495 27775 29785 28825 18619 14131 30544 2327...
10937,zwars,306 303 1292 49 50 880 304 101 713 656 1896 47...
10938,zxnno,2167 4181 11757 1691 11111 8841 10790 79 5682 ...


In [23]:
# Persist the per-user sequences. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first column.
df_agg.to_csv(path_or_buf='archive/animelists_sampled_seq.csv')