# Transform path animation labelling data into model data format

In [1]:
import pickle
import os
import pandas as pd
# Work from the repository root so that the `src` package is importable and
# the relative `data/...` paths used throughout this notebook resolve.
# Only step up when `src` is not already visible from the working directory,
# so re-running this cell does not keep climbing the directory tree.
if not os.path.isdir('src'):
    os.chdir('..')
from src.preprocessing.sm_label_transformer import *
from src.features.get_svg_size_pos import get_relative_pos_to_bounding_box_of_animated_paths

### 1. Read data

In [2]:
# Load the labelled animation data.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/surrogate_model/animation_label.pkl', mode='rb') as label_file:
    animations = pickle.load(label_file)

In [3]:
# Path-level feature tables for the train and test splits.
X_train = pd.read_csv("data/path_selector/path_selector_train.csv")
X_test = pd.read_csv("data/path_selector/path_selector_test.csv")

# Features that are not meaningful; listed once so both splits stay in sync.
uninformative_features = [
    'stroke_width', 'opacity', 'stroke_opacity',
    'stroke_r', 'stroke_g', 'stroke_b',
    'svg_stroke_r', 'diff_stroke_r',
    'svg_stroke_g', 'diff_stroke_g',
    'svg_stroke_b', 'diff_stroke_b',
]
X_train = X_train.drop(columns=uninformative_features)
X_test = X_test.drop(columns=uninformative_features)

X_train.head()

Unnamed: 0,filename,animation_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,logo_0,1,13.535693,6.971131,-0.009867,-0.915823,-2.984741,5.383109,0.495111,-2.146576,...,-41.6,162.6,-162.6,127.4,-4.4,0.40018,0.472019,0.70009,0.564323,10
1,logo_0,0,-1.000982,4.641413,1.455743,-0.487705,-1.037431,6.984421,-0.485484,-3.933893,...,11.4,162.6,24.4,127.4,-127.4,0.5,0.671553,0.25,0.335776,10
2,logo_0,2,10.590673,0.53337,8.743198,0.241546,-2.132272,0.620374,1.687153,6.592582,...,-2.6,162.6,-46.6,127.4,-123.4,0.787194,0.328324,0.499991,0.835838,10
3,logo_0,3,4.222565,-0.735711,5.308626,-0.09091,-4.875907,2.410124,0.314957,-1.771255,...,-41.6,162.6,-162.6,127.4,-4.4,0.400206,0.472019,0.299897,0.564323,10
4,logo_0,4,1.776277,8.866785,-2.770646,-0.955766,-4.725605,5.412009,0.597616,2.331442,...,-2.6,162.6,-46.6,127.4,-123.4,0.5869,0.315475,0.500106,0.170381,10


In [4]:
# Preview the first rows of the raw animation label data.
animations.head()

Unnamed: 0,file,animation_id,order_id,path_prob,begin_value,model_output,animated_animation_ids,animated_order_ids,backend_mapping,logo_id,animation_number,alias,animation_file,time,label
0,logo_316_animation_0,4,1,0.2,1.0,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
1,logo_316_animation_0,24,6,0.6,1.25,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
2,logo_316_animation_0,23,7,0.2,1.5,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
3,logo_316_animation_1,4,1,0.2,1.0,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Very Bad
4,logo_316_animation_1,24,6,0.6,1.25,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Okay


### 2. Prepare animation data for merge

In [5]:
# get correct filenames to match
def _logo_name(animation_file):
    """Return the part of an animation file name before "_animation"."""
    return animation_file.partition("_animation")[0]


# Derive the logo file name (e.g. "logo_316") used as merge key with the path features.
animations['filename'] = animations['file'].apply(_logo_name)

In [6]:
# Tim's feature: position of a path relative to the animated paths of its logo
# (presumably relative to their joint bounding box, going by the helper's name).
# Note: Creating this feature takes a few minutes
def _rel_pos_to_animated_paths(row):
    """Call the position helper for one row; the result is indexed as (x, y) below."""
    return get_relative_pos_to_bounding_box_of_animated_paths(
        f"data/svgs_preprocessed/{row['filename']}.svg",
        row["animation_id"],
        row["animated_animation_ids"],
    )


animations["rel_position_to_animations"] = animations.apply(_rel_pos_to_animated_paths, axis=1)

# Split the per-row result into separate x and y columns.
rel_positions = animations["rel_position_to_animations"]
animations["rel_x_position_to_animations"] = rel_positions.apply(lambda pos: pos[0])
animations["rel_y_position_to_animations"] = rel_positions.apply(lambda pos: pos[1])

data/svgs_preprocessed/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero
data/svgs_preprocessed/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero
data/svgs_preprocessed/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero


In [7]:
# Remove columns that are not needed for the model data, including the
# intermediate position column that was split into x/y above.
unused_columns = [
    'file', 'order_id', 'path_prob', 'begin_value',
    'animated_animation_ids', 'animated_order_ids', 'backend_mapping',
    'logo_id', 'animation_number', 'alias', 'animation_file',
    'time', 'rel_position_to_animations',
]
animations = animations.drop(columns=unused_columns)

# Translate rating names into integer class indexes and discard rows that
# were explicitly marked as having no rating.
mapping = {'Very Bad': 0, 'Bad': 1, 'Okay': 2, 'Good': 3, 'Very Good': 4}
animations = animations.replace({'label': mapping})
has_rating = animations['label'] != 'no_rating'
animations = animations[has_rating].reset_index(drop=True)

In [8]:
# Preview the reduced animation data with integer labels.
animations.head()

Unnamed: 0,animation_id,model_output,label,filename,rel_x_position_to_animations,rel_y_position_to_animations
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,0.039501,0.051404
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.706974
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.778553
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,0.039501,0.051404
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,0.714309,0.706974


In [9]:
# Class distribution of the integer labels (value_counts ignores missing labels by default).
animations.label.value_counts()

2    4621
0    3040
1    2531
3    2398
4     509
Name: label, dtype: int64

### 3. Merge animation data with path vectors

In [10]:
# Attach the path features to every labelled animation row. A left merge keeps
# all animation rows; rows whose path is not in the respective split end up
# with missing feature values.
merge_keys = ['filename', 'animation_id']
train = animations.merge(right=X_train, on=merge_keys, how='left')
test = animations.merge(right=X_test, on=merge_keys, how='left')

In [11]:
# Preview the merged training data.
train.head()

Unnamed: 0,animation_id,model_output,label,filename,rel_x_position_to_animations,rel_y_position_to_animations,emb_0,emb_1,emb_2,emb_3,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,...,-4.541667,64.541667,-4.541667,64.541667,-4.541667,0.054752,0.084239,0.033838,0.04212,24.0
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362888,0.501511,0.579289,24.0
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.778553,8.7067,3.79783,8.069836,-5.748081,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362904,0.501511,0.63794,24.0
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,...,-4.541667,64.541667,-4.541667,64.541667,-4.541667,0.054752,0.084239,0.033838,0.04212,24.0
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362888,0.501511,0.579289,24.0


In [12]:
# Count missing values per column, largest first, to see how many rows lack a label or path features.
train.isnull().sum().sort_values(ascending = False)

label                           9127
emb_9                           4121
fill_r                          4121
rel_y_position                  4121
rel_x_position                  4121
rel_height                      4121
rel_width                       4121
diff_fill_b                     4121
svg_fill_b                      4121
diff_fill_g                     4121
svg_fill_g                      4121
diff_fill_r                     4121
svg_fill_r                      4121
fill_b                          4121
fill_g                          4121
nr_paths_svg                    4121
emb_8                           4121
emb_7                           4121
emb_6                           4121
emb_5                           4121
emb_4                           4121
emb_3                           4121
emb_2                           4121
emb_1                           4121
emb_0                           4121
model_output                       0
rel_y_position_to_animations       0
r

In [13]:
# Remove every row with at least one missing value (missing label or missing
# path features) and report the shapes before and after.
print(f"Before: Train: {train.shape}. Test: {test.shape}")
for split_frame in (train, test):
    split_frame.dropna(axis=0, how='any', inplace=True)
print(f"After: Train: {train.shape}. Test: {test.shape}")

Before: Train: (22227, 30). Test: (22224, 30)
After: Train: (10631, 30). Test: (2469, 30)


### 4. Transform animation vector into multiple dataframe columns and change column ordering

In [18]:
# Spread the 12 entries of each `model_output` list over separate `an_vec_i`
# columns; the original index is reused so the rows stay aligned.
an_vec_cols = [f'an_vec_{i}' for i in range(12)]
for split_frame in (train, test):
    vector_rows = split_frame['model_output'].tolist()
    split_frame[an_vec_cols] = pd.DataFrame(vector_rows, index=split_frame.index)

In [19]:
# The raw vector and the merge keys are no longer needed. errors='ignore'
# lets the cell be re-run after the columns are already gone.
obsolete_columns = ['model_output', 'animation_id', 'filename']
for split_frame in (train, test):
    split_frame.drop(columns=obsolete_columns, errors='ignore', inplace=True)

In [20]:
# Preview the training data after expanding the animation vector.
train.head()

Unnamed: 0,label,rel_x_position_to_animations,rel_y_position_to_animations,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,an_vec_10,an_vec_11
0,3,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,0.919059,-0.791849,3.824877,...,0,1,0,0,-1.0,-1.0,-1.0,-1.0,0.844422,0.757954
1,3,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,-2.27915,6.263388,1.082239,...,0,0,0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,3,0.714309,0.778553,8.7067,3.79783,8.069836,-5.748081,-1.975505,6.366743,1.201885,...,0,0,1,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,0,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,0.919059,-0.791849,3.824877,...,0,0,0,0,0.134364,0.847434,-1.0,-1.0,-1.0,-1.0
4,2,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,-2.27915,6.263388,1.082239,...,1,0,0,0,-1.0,-1.0,-1.0,0.763775,-1.0,-1.0


In [21]:
# Final column layout: animation vector, path embedding, fill colour features
# (path, SVG-level, difference), size/position features, and the label last.
col_order = (
    [f'an_vec_{idx}' for idx in range(12)]
    + [f'emb_{idx}' for idx in range(10)]
    + [
        f'{prefix}_{channel}'
        for prefix in ('fill', 'svg_fill', 'diff_fill')
        for channel in ('r', 'g', 'b')
    ]
    + [
        'rel_height',
        'rel_width',
        'rel_x_position',
        'rel_y_position',
        'rel_x_position_to_animations',
        'rel_y_position_to_animations',
        'nr_paths_svg',
        'label',
    ]
)

In [22]:
# Apply the target column layout to both splits.
train, test = (split_frame[col_order] for split_frame in (train, test))

In [23]:
# Preview the training data in its final column order.
train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,label
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,3
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,3
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,0.362904,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,3
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,2


### 5. Encode labels into four binary rating columns

In [24]:
# Replace the single integer `label` column by four binary `rating_i` columns
# produced by `encode_classes`. In the displayed output a label of 3 becomes
# 1,1,1,0, a label of 2 becomes 1,1,0,0 and a label of 0 becomes 0,0,0,0.
#
# The labels are passed as a NumPy array via `Series.to_numpy(copy=True)`.
# This avoids referencing `np`, which this notebook never imports itself and
# which is only in scope if the wildcard import at the top happens to expose it.
rating_cols = [f'rating_{i}' for i in range(4)]
train[rating_cols] = pd.DataFrame(encode_classes(train['label'].to_numpy(copy=True)), index=train.index)
test[rating_cols] = pd.DataFrame(encode_classes(test['label'].to_numpy(copy=True)), index=test.index)

# The integer label is fully represented by the rating columns now.
train.drop(['label'], axis=1, inplace=True)
test.drop(['label'], axis=1, inplace=True)

# Rows were removed earlier, so renumber the index from 0 before saving.
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [25]:
# Preview the training data with the encoded rating columns.
train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,1,1,1,0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,1,0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,1,1,1,0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0,0,0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,0,0


### 6. Save data

In [23]:
# Write both splits as CSV without the index column.
for split_frame, csv_path in (
    (train, 'data/surrogate_model/sm_train.csv'),
    (test, 'data/surrogate_model/sm_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)