# Transform path animation labeling data into model data format

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

# All paths below ('data/...') and the `src` package imports are resolved against
# the working directory, so it has to be the directory that contains `src`.
# Presumably the notebook is started one level below that directory (TODO confirm).
# The guard keeps the cell idempotent: an unconditional chdir would climb one
# directory further on every re-run and break all relative paths.
if not os.path.isdir('src'):
    os.chdir('..')

# `np` is imported explicitly above because later cells call `np.array`; before,
# it was only in scope as a side effect of this star import.
from src.preprocessing.sm_label_transformer import *
from src.features.get_svg_size_pos import get_relative_pos_to_bounding_box_of_animated_paths



### 1. Read data

In [3]:
# Load the pickled animation label data.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('data/surrogate_model/sm_animation_vectors_label.pkl', mode='rb') as pickle_file:
    animations = pickle.load(pickle_file)

In [4]:
# Features that are not meaningful for the model and are dropped from both splits.
NON_MEANINGFUL_FEATURES = [
    'stroke_width', 'opacity', 'stroke_opacity',
    'stroke_r', 'stroke_g', 'stroke_b',
    'svg_stroke_r', 'diff_stroke_r',
    'svg_stroke_g', 'diff_stroke_g',
    'svg_stroke_b', 'diff_stroke_b',
    'href',
]


def _load_path_features(csv_path):
    """Read a path-feature CSV and strip the columns that are not used.

    Removes the "Unnamed: 0" column (the index that was written into the CSV)
    together with NON_MEANINGFUL_FEATURES. Raises KeyError if any of these
    columns is missing, exactly like the separate drops did before.
    """
    features = pd.read_csv(csv_path)
    return features.drop(columns=["Unnamed: 0"] + NON_MEANINGFUL_FEATURES)


# Same cleaning for both splits, defined once so the two cannot drift apart.
X_train = _load_path_features("data/model_1/model_1_train.csv")
X_test = _load_path_features("data/model_1/model_1_test.csv")

X_train.head()

Unnamed: 0,filename,animation_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,logo_0,1,11.267385,10.743526,-1.62129,-1.210992,0.682024,4.925882,2.255312,0.925018,...,-41.6,162.6,-162.6,127.4,-4.4,0.391017,0.458231,0.706957,0.577049,10
1,logo_0,0,-4.996587,1.684802,2.679946,0.422735,2.906136,5.928584,1.541004,-0.106497,...,11.4,162.6,24.4,127.4,-127.4,0.488551,0.651937,0.267174,0.355178,10
2,logo_0,3,8.274405,-0.899303,4.046902,0.713516,-2.461293,3.281322,1.947822,0.472688,...,-41.6,162.6,-162.6,127.4,-4.4,0.391042,0.458231,0.315928,0.577049,10
3,logo_0,2,11.903552,2.669469,3.759462,0.42802,-2.520246,0.311202,1.855791,5.763235,...,-2.6,162.6,-46.6,127.4,-123.4,0.769168,0.318733,0.511441,0.840633,10
4,logo_0,5,8.466207,14.569347,2.594367,-1.336301,-3.24835,0.974781,1.478868,1.843358,...,11.4,162.6,24.4,127.4,-127.4,0.488551,0.651931,0.755725,0.355181,10


In [5]:
# Preview the first rows of the loaded animation label data.
animations.head()

Unnamed: 0,file,animation_id,order_id,path_prob,begin_value,model_output,animated_animation_ids,animated_order_ids,backend_mapping,logo_id,animation_number,alias,animation_file,time,label
0,logo_316_animation_0,4,1,0.2,1.0,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
1,logo_316_animation_0,24,6,0.6,1.25,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
2,logo_316_animation_0,23,7,0.2,1.5,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
3,logo_316_animation_1,4,1,0.2,1.0,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Very Bad
4,logo_316_animation_1,24,6,0.6,1.25,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Okay


### 2. Prepare animation data for merge

In [6]:
# Build the merge key: keep only the part of 'file' before "_animation"
# (e.g. "logo_316_animation_0" -> "logo_316") so it matches the feature tables.
animations['filename'] = [file_name.split("_animation")[0] for file_name in animations['file']]

In [7]:
# Tim's feature: position of each path relative to the animated paths of its logo.
def _rel_pos_to_animated_paths(record):
    """Call the position helper for one row of `animations`.

    Passes the logo's SVG path, the row's animation_id and the ids of all
    animated paths. The result is indexed below as [0] -> x and [1] -> y.
    """
    return get_relative_pos_to_bounding_box_of_animated_paths(
        f"data/initial_svgs/{record['filename']}.svg",
        record["animation_id"],
        record["animated_animation_ids"],
    )


animations["rel_position_to_animations"] = animations.apply(_rel_pos_to_animated_paths, axis=1)
# Split the (x, y) result into two separate feature columns.
animations["rel_x_position_to_animations"] = animations["rel_position_to_animations"].map(lambda pos: pos[0])
animations["rel_y_position_to_animations"] = animations["rel_position_to_animations"].map(lambda pos: pos[1])

data/initial_svgs/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero
data/initial_svgs/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero
data/initial_svgs/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero


In [8]:
# Remove the columns that are neither merge keys, features nor the label.
animations = animations.drop(columns=[
    'file', 'order_id', 'path_prob', 'begin_value',
    'animated_animation_ids', 'animated_order_ids', 'backend_mapping',
    'logo_id', 'animation_number', 'alias', 'animation_file', 'time',
    'rel_position_to_animations',
])

# Translate rating names into label indexes (0 = worst ... 4 = best).
mapping = {'Very Bad': 0, 'Bad': 1, 'Okay': 2, 'Good': 3, 'Very Good': 4}
animations = animations.replace({'label': mapping})

# Discard rows explicitly marked 'no_rating' and renumber the index.
# Rows whose label is missing (NaN) are NOT removed by this comparison.
animations = animations[animations['label'] != 'no_rating'].reset_index(drop=True)

In [9]:
# Preview the animation data after dropping columns and mapping the labels.
animations.head()

Unnamed: 0,animation_id,model_output,label,filename,rel_x_position_to_animations,rel_y_position_to_animations
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,0.039501,0.051404
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.706974
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.778553
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,0.039501,0.051404
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,0.714309,0.706974


In [10]:
# Number of rows per label value (NaN labels are not counted by value_counts).
animations.label.value_counts()

2    1859
3    1472
0    1419
1    1406
4     390
Name: label, dtype: int64

### 3. Merge animation data with path vectors

In [10]:
# Left-join the path features of each split onto every labelled animation row.
# Rows without a match in the respective split keep NaN in the feature columns.
train = pd.merge(animations, X_train, how='left', on=['filename', 'animation_id'])
test = pd.merge(animations, X_test, how='left', on=['filename', 'animation_id'])

In [11]:
# Preview the merged training frame.
train.head()

Unnamed: 0,animation_id,model_output,label,filename,rel_x_position_to_animations,rel_y_position_to_animations,emb_0,emb_1,emb_2,emb_3,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,0.039501,0.051404,-13.020584,4.711698,-0.984884,1.552102,...,-4.541667,64.541667,-4.541667,64.541667,-4.541667,0.054752,0.084239,0.033838,0.04212,24.0
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.706974,9.318763,1.599212,9.305925,-3.920086,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362888,0.501511,0.579289,24.0
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.778553,9.063248,1.377913,9.541101,-3.860107,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362904,0.501511,0.63794,24.0
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,0.039501,0.051404,-13.020584,4.711698,-0.984884,1.552102,...,-4.541667,64.541667,-4.541667,64.541667,-4.541667,0.054752,0.084239,0.033838,0.04212,24.0
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,0.714309,0.706974,9.318763,1.599212,9.305925,-3.920086,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362888,0.501511,0.579289,24.0


In [12]:
# Count missing values per column of the merged training frame, largest first.
train.isnull().sum().sort_values(ascending = False)

label                           16938
nr_paths_svg                     4309
fill_r                           4309
emb_0                            4309
emb_1                            4309
emb_2                            4309
emb_3                            4309
emb_4                            4309
emb_5                            4309
emb_6                            4309
emb_7                            4309
rel_y_position                   4309
emb_9                            4309
emb_8                            4309
fill_g                           4309
fill_b                           4309
svg_fill_r                       4309
diff_fill_r                      4309
svg_fill_g                       4309
diff_fill_g                      4309
svg_fill_b                       4309
diff_fill_b                      4309
rel_width                        4309
rel_height                       4309
rel_x_position                   4309
rel_y_position_to_animations        0
rel_x_positi

In [13]:
# Keep only complete rows: any row with a NaN in any column is removed,
# which covers rows without matching path features and rows without a label.
print(f"Before: Train: {train.shape}. Test: {test.shape}")
train = train.dropna()
test = test.dropna()
print(f"After: Train: {train.shape}. Test: {test.shape}")

Before: Train: (23485, 30). Test: (23481, 30)
After: Train: (5309, 30). Test: (1238, 30)


### 4. Transform animation vector into multiple dataframe columns and change column ordering

In [14]:
# Spread the 'model_output' vector over 12 scalar columns an_vec_0 ... an_vec_11.
# The assignment requires every vector to have exactly 12 entries.
an_vec_cols = [f'an_vec_{i}' for i in range(12)]
train[an_vec_cols] = pd.DataFrame(train['model_output'].tolist(), index=train.index)
test[an_vec_cols] = pd.DataFrame(test['model_output'].tolist(), index=test.index)

# The raw vector and the merge keys are no longer needed.
train = train.drop(columns=['model_output', 'animation_id', 'filename'])
test = test.drop(columns=['model_output', 'animation_id', 'filename'])

In [15]:
# Preview the training frame with the expanded animation vector columns.
train.head()

Unnamed: 0,label,rel_x_position_to_animations,rel_y_position_to_animations,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,an_vec_10,an_vec_11
0,3,0.039501,0.051404,-13.020584,4.711698,-0.984884,1.552102,1.859648,-2.463904,3.327401,...,0,1,0,0,-1.0,-1.0,-1.0,-1.0,0.844422,0.757954
1,3,0.714309,0.706974,9.318763,1.599212,9.305925,-3.920086,2.013265,5.833629,3.072608,...,0,0,0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,3,0.714309,0.778553,9.063248,1.377913,9.541101,-3.860107,2.056594,5.885443,3.10896,...,0,0,1,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,0,0.039501,0.051404,-13.020584,4.711698,-0.984884,1.552102,1.859648,-2.463904,3.327401,...,0,0,0,0,0.134364,0.847434,-1.0,-1.0,-1.0,-1.0
4,2,0.714309,0.706974,9.318763,1.599212,9.305925,-3.920086,2.013265,5.833629,3.072608,...,1,0,0,0,-1.0,-1.0,-1.0,0.763775,-1.0,-1.0


In [16]:
# Final column layout: animation vector, path embedding, fill colour features
# (path, svg, difference; each as r/g/b), size and position features, label last.
col_order = (
    [f'an_vec_{i}' for i in range(12)]
    + [f'emb_{i}' for i in range(10)]
    + [f'{prefix}_{ch}' for prefix in ('fill', 'svg_fill', 'diff_fill') for ch in ('r', 'g', 'b')]
    + [
        'rel_height',
        'rel_width',
        'rel_x_position',
        'rel_y_position',
        'rel_x_position_to_animations',
        'rel_y_position_to_animations',
        'nr_paths_svg',
        'label',
    ]
)

In [17]:
# Select exactly the columns in col_order, in that order, for both splits.
train = train.loc[:, col_order]
test = test.loc[:, col_order]

In [18]:
# Preview the training frame in its final column order.
train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,label
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,3
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,3
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,0.362904,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,3
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,2


### 5. Encode labels into four binary rating columns

In [19]:
# Replace the single 'label' column by four binary columns rating_0 ... rating_3
# produced by encode_classes.
# NOTE(review): judging from the displayed output, rating_i looks like 1 when
# label > i (e.g. label 2 -> 1,1,0,0); confirm against encode_classes.
rating_cols = [f'rating_{i}' for i in range(4)]
train[rating_cols] = pd.DataFrame(encode_classes(np.array(train['label'])), index=train.index)
test[rating_cols] = pd.DataFrame(encode_classes(np.array(test['label'])), index=test.index)

# The original label is now redundant; also renumber rows from 0.
train = train.drop(columns=['label']).reset_index(drop=True)
test = test.drop(columns=['label']).reset_index(drop=True)

In [20]:
# Preview the training frame with the binary rating columns.
train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,1,1,1,0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,1,0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,1,1,1,0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0,0,0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,0,0


### 6. Save data

In [21]:
# Write both splits as CSV without the index column.
output_frames = {
    'data/fitness_function/train_ff_new.csv': train,
    'data/fitness_function/test_ff_new.csv': test,
}
for csv_path, frame in output_frames.items():
    frame.to_csv(csv_path, index=False)