# Transcoding Time Regression

***dataset:*** https://archive.ics.uci.edu/ml/datasets/Online+Video+Characteristics+and+Transcoding+Time+Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.utils import column_or_1d
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import check_array

import seaborn as sns

from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

In [2]:
youtube_videos = pd.read_csv('youtube_videos.tsv',sep='\t')
transcoding_measurements = pd.read_csv('transcoding_mesurment.tsv',sep='\t')

## Datasets preview

In [3]:
youtube_videos.head()

Unnamed: 0,id,duration,bitrate,bitrate(video),height,width,frame rate,frame rate(est.),codec,category,url
0,uDNj-_5ty48,267,373,274,568,320,29.97,0.0,h264,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
1,uDNj-_5ty48,267,512,396,480,270,29.97,29.97,h264,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
2,uDNj-_5ty48,267,324,263,400,226,29.97,29.97,flv1,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
3,uDNj-_5ty48,267,85,55,176,144,12.0,12.0,mpeg4,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
4,WCgt-AactyY,31,1261,1183,640,480,24.0,0.0,h264,People & Blogs,http://r1---sn-ovgq0oxu-5goe.c.youtube.com/vid...


In [4]:
youtube_videos.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168286 entries, 0 to 168285
Data columns (total 11 columns):
id                  168286 non-null object
duration            168286 non-null int64
bitrate             168286 non-null int64
bitrate(video)      168286 non-null int64
height              168286 non-null int64
width               168286 non-null int64
frame rate          168286 non-null float64
frame rate(est.)    168286 non-null float64
codec               168286 non-null object
category            168286 non-null object
url                 168286 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 14.1+ MB


In [5]:
transcoding_measurements.head()

Unnamed: 0,id,duration,codec,width,height,bitrate,framerate,i,p,b,...,p_size,b_size,size,o_codec,o_bitrate,o_framerate,o_width,o_height,umem,utime
0,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,176,144,22508,0.612
1,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,320,240,25164,0.98
2,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,480,360,29228,1.216
3,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,640,480,34316,1.692
4,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,1280,720,58528,3.456


In [6]:
transcoding_measurements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68784 entries, 0 to 68783
Data columns (total 22 columns):
id             68784 non-null object
duration       68784 non-null float64
codec          68784 non-null object
width          68784 non-null int64
height         68784 non-null int64
bitrate        68784 non-null int64
framerate      68784 non-null float64
i              68784 non-null int64
p              68784 non-null int64
b              68784 non-null int64
frames         68784 non-null int64
i_size         68784 non-null int64
p_size         68784 non-null int64
b_size         68784 non-null int64
size           68784 non-null int64
o_codec        68784 non-null object
o_bitrate      68784 non-null int64
o_framerate    68784 non-null float64
o_width        68784 non-null int64
o_height       68784 non-null int64
umem           68784 non-null int64
utime          68784 non-null float64
dtypes: float64(4), int64(15), object(3)
memory usage: 11.5+ MB


In [7]:
X, y = np.split(transcoding_measurements,[-1],axis=1)
y = column_or_1d(y)

In [8]:
to_drop = ['id','umem','duration','frames','size']


In [9]:
X = X.drop(columns=to_drop)
X

Unnamed: 0,codec,width,height,bitrate,framerate,i,p,b,i_size,p_size,b_size,o_codec,o_bitrate,o_framerate,o_width,o_height
0,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,176,144
1,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,320,240
2,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,480,360
3,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,640,480
4,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,1280,720
5,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,1920,1080
6,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,176,144
7,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,320,240
8,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,480,360
9,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,640,480


In [10]:
X = pd.get_dummies(X,columns=['codec'])
X = pd.get_dummies(X,columns=['o_codec'])


In [11]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
X = pd.DataFrame(x_scaled)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
reg = # Put here your predict algoritm

In [14]:
reg.fit(X_train,y_train)

MLPRegressor(activation='tanh', alpha=1e-08, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [15]:
pred = reg.predict(X_test)


In [16]:
mean_squared_error(y_test,pred)

6.109180566260133

In [17]:
mean_absolute_error(y_test,pred)

1.0073695090368882

In [18]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [19]:
mean_absolute_percentage_error(y_test,pred)

13.214650380880109