# Transform path animation labeling data into model data format

In [1]:
import os
import pickle

import numpy as np  # imported explicitly: np.array is used further down and was only in scope via the wildcard import
import pandas as pd

# Switch from the notebook folder to the project root so that `src` is
# importable and the relative `data/...` paths resolve.
# The guard keeps this cell safe to re-run: an unconditional os.chdir('..')
# moves up one more directory on every execution.
# NOTE(review): assumes the project root is the directory containing `src/` - confirm.
if not os.path.isdir('src'):
    os.chdir('..')

# Wildcard import kept so every name used below stays available
# (presumably the source of encode_classes - verify against the module).
from src.preprocessing.transform_into_model_data_ff import *

### 1. Read data

In [12]:
# Load the pickled animation label table.
# Note: unpickling can execute arbitrary code, so only load trusted project files.
label_pickle_path = 'data/surrogate_model/animation_label.pkl'
with open(label_pickle_path, mode='rb') as label_file:
    animations = pickle.load(label_file)

In [13]:
# Features that are not meaningful for this step; removed from both splits.
# Defined once so the train and test frames cannot drift apart.
_DROPPED_FEATURES = [
    'stroke_width', 'opacity', 'stroke_opacity',
    'stroke_r', 'stroke_g', 'stroke_b',
    'svg_stroke_r', 'diff_stroke_r',
    'svg_stroke_g', 'diff_stroke_g',
    'svg_stroke_b', 'diff_stroke_b',
]


def _load_path_features(csv_path):
    """Read one model_1 split and remove the columns that are not needed.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file content without the "Unnamed: 0" column and without the
        columns listed in ``_DROPPED_FEATURES``.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing from the file.
    """
    features = pd.read_csv(csv_path)
    # "Unnamed: 0" is an unnecessary column: the old index that was copied into the CSV.
    return features.drop(columns=["Unnamed: 0"] + _DROPPED_FEATURES)


X_train = _load_path_features("data/model_1/model_1_train.csv")
X_test = _load_path_features("data/model_1/model_1_test.csv")

X_train.head()

Unnamed: 0,filename,animation_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,logo_0,1,11.268934,10.784905,-1.795118,-1.315936,0.699395,4.745064,2.583316,0.997763,...,-41.6,162.6,-162.6,127.4,-4.4,0.391017,0.458231,0.706957,0.577049,10
1,logo_0,0,-4.928805,1.69389,2.650219,0.41788,2.906058,5.822047,1.973111,-0.043525,...,11.4,162.6,24.4,127.4,-127.4,0.488551,0.651937,0.267174,0.355178,10
2,logo_0,3,8.363034,-0.790123,4.012104,0.711058,-2.457028,3.136327,2.176535,0.519116,...,-41.6,162.6,-162.6,127.4,-4.4,0.391042,0.458231,0.315928,0.577049,10
3,logo_0,2,11.966211,2.788607,3.678947,0.402484,-2.509481,0.135113,1.815825,5.770873,...,-2.6,162.6,-46.6,127.4,-123.4,0.769168,0.318733,0.511441,0.840633,10
4,logo_0,5,8.457268,14.647492,2.389777,-1.438775,-3.22515,0.856066,1.528019,1.883333,...,11.4,162.6,24.4,127.4,-127.4,0.488551,0.651931,0.755725,0.355181,10


In [14]:
# Preview the first rows of the label table as loaded from the pickle.
animations.head()

Unnamed: 0,file,animation_id,order_id,path_prob,begin_value,model_output,animated_animation_ids,animated_order_ids,backend_mapping,logo_id,animation_number,alias,animation_file,time,label
0,logo_316_animation_0,4,1,0.2,1.0,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
1,logo_316_animation_0,24,6,0.6,1.25,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
2,logo_316_animation_0,23,7,0.2,1.5,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
3,logo_316_animation_1,4,1,0.2,1.0,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Very Bad
4,logo_316_animation_1,24,6,0.6,1.25,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Okay


### 2. Prepare animation data for merge

In [15]:
# Build the merge key: keep the part of `file` before "_animation"
# (e.g. "logo_316_animation_0" -> "logo_316").
animations['filename'] = [name.split("_animation")[0] for name in animations['file']]

# Remove the columns that are not used in the following steps.
unused_columns = [
    'file', 'order_id', 'path_prob', 'begin_value',
    'animated_animation_ids', 'animated_order_ids', 'backend_mapping',
    'logo_id', 'animation_number', 'alias', 'animation_file', 'time',
]
animations.drop(columns=unused_columns, inplace=True)

# Replace the rating names by their index, then keep only the rated entries.
mapping = {'Very Bad': 0, 'Bad': 1, 'Okay': 2, 'Good': 3, 'Very Good': 4}
animations.replace({'label': mapping}, inplace=True)
has_rating = animations['label'] != 'no_rating'
animations = animations[has_rating].reset_index(drop=True)

In [16]:
# Preview the label table after dropping columns and mapping the labels to indexes.
animations.head()

Unnamed: 0,animation_id,model_output,label,filename
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316


In [17]:
# Number of entries per label index.
animations['label'].value_counts()

2    1859
3    1472
0    1419
1    1406
4     390
Name: label, dtype: int64

### 3. Merge animation data with path vectors

In [18]:
# Join the path features onto the label rows. With a left join every label row
# is kept; rows without a match in the respective split get NaN features.
merge_keys = ['filename', 'animation_id']
train = pd.merge(animations, X_train, how='left', on=merge_keys)
test = pd.merge(animations, X_test, how='left', on=merge_keys)

In [19]:
# Preview the training frame after merging labels with the path features.
train.head()

Unnamed: 0,animation_id,model_output,label,filename,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,-12.980438,4.639808,-1.026289,1.529357,1.860201,-2.682314,...,-24.541667,24.541667,-24.541667,24.541667,-24.541667,0.054751,0.091875,0.033837,-0.044965,24.0
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,9.401122,1.744369,9.253009,-3.922779,2.027758,5.604469,...,142.458333,24.541667,142.458333,24.541667,142.458333,0.395992,0.395781,0.501509,0.540894,24.0
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,9.147661,1.52477,9.491141,-3.86013,2.07084,5.654076,...,142.458333,24.541667,142.458333,24.541667,142.458333,0.395992,0.395799,0.501509,0.604861,24.0
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,-12.980438,4.639808,-1.026289,1.529357,1.860201,-2.682314,...,-24.541667,24.541667,-24.541667,24.541667,-24.541667,0.054751,0.091875,0.033837,-0.044965,24.0
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,9.401122,1.744369,9.253009,-3.922779,2.027758,5.604469,...,142.458333,24.541667,142.458333,24.541667,142.458333,0.395992,0.395781,0.501509,0.540894,24.0


In [20]:
# Remove every row that contains a missing value, in place. This includes the
# label rows that found no matching path in the left merge.
for merged_frame in (train, test):
    merged_frame.dropna(inplace=True)

In [21]:
# (rows, columns) of the training frame after dropna.
train.shape

(5281, 28)

In [22]:
# (rows, columns) of the test frame after dropna.
test.shape

(1238, 28)

### 4. Transform animation vector into multiple dataframe columns and change column ordering

In [24]:
# Target columns for the 12 entries of the animation vector.
an_vec_columns = ['an_vec_' + str(i) for i in range(12)]
# Columns that are no longer needed once the vector is expanded and the merge is done.
consumed_columns = ['model_output', 'animation_id', 'filename']

for frame in (train, test):
    # Spread the list stored in 'model_output' over one column per entry,
    # aligned on the frame's own index.
    vector_frame = pd.DataFrame(frame['model_output'].tolist(), index=frame.index)
    frame[an_vec_columns] = vector_frame
    frame.drop(columns=consumed_columns, inplace=True)

In [25]:
# Preview the training frame with the animation vector expanded into an_vec_* columns.
train.head()

Unnamed: 0,label,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,an_vec_10,an_vec_11
0,3,-12.980438,4.639808,-1.026289,1.529357,1.860201,-2.682314,3.155013,-0.3676,0.620792,...,0,1,0,0,-1.0,-1.0,-1.0,-1.0,0.844422,0.757954
1,3,9.401122,1.744369,9.253009,-3.922779,2.027758,5.604469,3.467158,-0.309604,-0.142178,...,0,0,0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,3,9.147661,1.52477,9.491141,-3.86013,2.07084,5.654076,3.507531,-0.308374,-0.12133,...,0,0,1,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,0,-12.980438,4.639808,-1.026289,1.529357,1.860201,-2.682314,3.155013,-0.3676,0.620792,...,0,0,0,0,0.134364,0.847434,-1.0,-1.0,-1.0,-1.0
4,2,9.401122,1.744369,9.253009,-3.922779,2.027758,5.604469,3.467158,-0.309604,-0.142178,...,1,0,0,0,-1.0,-1.0,-1.0,0.763775,-1.0,-1.0


In [26]:
# Final column layout: animation vector first, then the embedding, the fill
# colour columns (fill_*, svg_fill_*, diff_fill_*), the remaining path
# features and the label as last column.
animation_vector_cols = [f'an_vec_{i}' for i in range(12)]
embedding_cols = [f'emb_{i}' for i in range(10)]
fill_colour_cols = [
    f'{prefix}_{channel}'
    for prefix in ('fill', 'svg_fill', 'diff_fill')
    for channel in ('r', 'g', 'b')
]
remaining_cols = ['rel_height', 'rel_width', 'rel_x_position', 'rel_y_position', 'nr_paths_svg', 'label']

col_order = animation_vector_cols + embedding_cols + fill_colour_cols + remaining_cols

In [27]:
# Bring both splits into the column layout defined by col_order.
train = train.loc[:, col_order]
test = test.loc[:, col_order]

In [28]:
# Preview the training frame in its new column order.
train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,svg_fill_b,diff_fill_r,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,nr_paths_svg,label
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,24.541667,-24.541667,-24.541667,-24.541667,0.091875,0.054751,0.033837,-0.044965,24.0,3
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,24.541667,142.458333,142.458333,142.458333,0.395781,0.395992,0.501509,0.540894,24.0,3
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,24.541667,142.458333,142.458333,142.458333,0.395799,0.395992,0.501509,0.604861,24.0,3
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,24.541667,-24.541667,-24.541667,-24.541667,0.091875,0.054751,0.033837,-0.044965,24.0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,24.541667,142.458333,142.458333,142.458333,0.395781,0.395992,0.501509,0.540894,24.0,2


### 5. Encode labels into 4 binary rating columns

In [29]:
# Names of the four columns that receive the encoded label.
rating_columns = ['rating_' + str(i) for i in range(4)]

for frame in (train, test):
    # encode_classes is not defined in this notebook; the displayed output
    # suggests a cumulative encoding (e.g. label 3 -> 1, 1, 1, 0) - confirm
    # against its implementation.
    encoded_labels = encode_classes(np.array(frame['label']))
    frame[rating_columns] = pd.DataFrame(encoded_labels, index=frame.index)

    # The raw label is replaced by the rating_* columns; afterwards renumber the rows.
    frame.drop(columns=['label'], inplace=True)
    frame.reset_index(drop=True, inplace=True)

In [30]:
# Preview the training frame with the rating_* columns in place of the label.
train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,-24.541667,0.091875,0.054751,0.033837,-0.044965,24.0,1,1,1,0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,142.458333,0.395781,0.395992,0.501509,0.540894,24.0,1,1,1,0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,142.458333,0.395799,0.395992,0.501509,0.604861,24.0,1,1,1,0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,-24.541667,0.091875,0.054751,0.033837,-0.044965,24.0,0,0,0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,142.458333,0.395781,0.395992,0.501509,0.540894,24.0,1,1,0,0


### 6. Save data

In [31]:
# Make sure the output folder exists: to_csv does not create missing
# directories and would fail on a fresh checkout without it.
os.makedirs('data/fitness_function', exist_ok=True)

# Write both splits as CSV without the index column.
train.to_csv('data/fitness_function/train_ff.csv', index=False)
test.to_csv('data/fitness_function/test_ff.csv', index=False)