# Transcoding Time Regression

***dataset:*** https://archive.ics.uci.edu/ml/datasets/Online+Video+Characteristics+and+Transcoding+Time+Dataset

In [50]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from  sklearn.neighbors  import  KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.utils import check_array
from sklearn.utils import column_or_1d

In [51]:
# Read the two tab-separated input files (paths are relative to the working directory).
# NOTE(review): 'transcoding_mesurment.tsv' is presumably the file name as distributed,
# misspelling included - verify before "correcting" it.
youtube_videos = pd.read_csv('youtube_videos.tsv',sep='\t')
transcoding_measurements = pd.read_csv('transcoding_mesurment.tsv',sep='\t')

## Datasets preview

In [52]:
# Preview the first rows of the video table.
youtube_videos.head()

Unnamed: 0,id,duration,bitrate,bitrate(video),height,width,frame rate,frame rate(est.),codec,category,url
0,uDNj-_5ty48,267,373,274,568,320,29.97,0.0,h264,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
1,uDNj-_5ty48,267,512,396,480,270,29.97,29.97,h264,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
2,uDNj-_5ty48,267,324,263,400,226,29.97,29.97,flv1,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
3,uDNj-_5ty48,267,85,55,176,144,12.0,12.0,mpeg4,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
4,WCgt-AactyY,31,1261,1183,640,480,24.0,0.0,h264,People & Blogs,http://r1---sn-ovgq0oxu-5goe.c.youtube.com/vid...


In [53]:
# Column dtypes and non-null counts of the video table.
youtube_videos.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168286 entries, 0 to 168285
Data columns (total 11 columns):
id                  168286 non-null object
duration            168286 non-null int64
bitrate             168286 non-null int64
bitrate(video)      168286 non-null int64
height              168286 non-null int64
width               168286 non-null int64
frame rate          168286 non-null float64
frame rate(est.)    168286 non-null float64
codec               168286 non-null object
category            168286 non-null object
url                 168286 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 14.1+ MB


In [54]:
# Preview the first rows of the transcoding measurements table.
transcoding_measurements.head()

Unnamed: 0,id,duration,codec,width,height,bitrate,framerate,i,p,b,...,p_size,b_size,size,o_codec,o_bitrate,o_framerate,o_width,o_height,umem,utime
0,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,176,144,22508,0.612
1,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,320,240,25164,0.98
2,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,480,360,29228,1.216
3,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,640,480,34316,1.692
4,04t6-jw9czg,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,...,825054,0,889537,mpeg4,56000,12.0,1280,720,58528,3.456


In [55]:
# Column dtypes and non-null counts of the transcoding measurements table.
transcoding_measurements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68784 entries, 0 to 68783
Data columns (total 22 columns):
id             68784 non-null object
duration       68784 non-null float64
codec          68784 non-null object
width          68784 non-null int64
height         68784 non-null int64
bitrate        68784 non-null int64
framerate      68784 non-null float64
i              68784 non-null int64
p              68784 non-null int64
b              68784 non-null int64
frames         68784 non-null int64
i_size         68784 non-null int64
p_size         68784 non-null int64
b_size         68784 non-null int64
size           68784 non-null int64
o_codec        68784 non-null object
o_bitrate      68784 non-null int64
o_framerate    68784 non-null float64
o_width        68784 non-null int64
o_height       68784 non-null int64
umem           68784 non-null int64
utime          68784 non-null float64
dtypes: float64(4), int64(15), object(3)
memory usage: 11.5+ MB


In [56]:
# Separate the regression target ('utime', the last column) from the candidate features.
# The target is selected by name rather than with np.split(): splitting a DataFrame
# goes through DataFrame.swapaxes (deprecated in pandas 2.1) and would silently
# pick a different target if the column order ever changed.
X = transcoding_measurements.drop(columns='utime')
# column_or_1d flattens the target into a 1-D NumPy array.
y = column_or_1d(transcoding_measurements['utime'])

In [57]:
# Columns removed from the feature matrix in the next cell.
# NOTE(review): presumably 'id' is just an identifier and 'umem' a second measured
# outcome; why 'duration', 'frames' and 'size' are excluded is not stated - confirm.
to_drop = ['id','umem','duration','frames','size']


In [58]:
# Strip the columns listed in `to_drop` from the features, then display the result.
X = X.drop(labels=to_drop, axis='columns')
X

Unnamed: 0,codec,width,height,bitrate,framerate,i,p,b,i_size,p_size,b_size,o_codec,o_bitrate,o_framerate,o_width,o_height
0,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,176,144
1,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,320,240
2,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,480,360
3,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,640,480
4,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,1280,720
5,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,12,1920,1080
6,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,176,144
7,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,320,240
8,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,480,360
9,mpeg4,176,144,54590,12,27,1537,0,64483,825054,0,mpeg4,56000,15,640,480


In [59]:
# One-hot encode the input and output codec columns in a single pass.
# Dummy columns are appended after the remaining columns in the order listed
# (codec_* first, then o_codec_*).
X = pd.get_dummies(X, columns=['codec', 'o_codec'])


In [60]:
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on the full data set, before the train/test
# split performed in the following cell, so test-set minima/maxima leak into
# training. Fitting on the training rows only would require restructuring the
# cells that follow.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
# Keep the column labels and row index so fitted coefficients can still be traced
# back to named features (a bare pd.DataFrame(x_scaled) discards both).
X = pd.DataFrame(x_scaled, columns=X.columns, index=X.index)

In [61]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [62]:
# Summary statistics for the numeric columns of the video table.
youtube_videos.describe()

Unnamed: 0,duration,bitrate,bitrate(video),height,width,frame rate,frame rate(est.)
count,168286.0,168286.0,168286.0,168286.0,168286.0,168286.0,168286.0
mean,271.654184,730.62149,624.363025,561.018706,368.399701,24.564592,19.884441
std,552.881871,919.15473,860.955654,359.071569,201.27418,7.396615,11.43507
min,1.0,0.0,0.0,100.0,88.0,0.0,0.0
25%,55.0,289.0,231.0,320.0,240.0,23.98,12.0
50%,145.0,459.0,349.0,480.0,360.0,29.92,25.0
75%,289.0,826.0,640.0,640.0,480.0,29.97,29.97
max,25845.0,22421.0,22229.0,2592.0,1944.0,59.94,30.02


In [63]:
# Summary statistics for the numeric columns of the transcoding measurements table.
transcoding_measurements.describe()

Unnamed: 0,duration,width,height,bitrate,framerate,i,p,b,frames,i_size,p_size,b_size,size,o_bitrate,o_framerate,o_width,o_height,umem,utime
count,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0,68784.0
mean,286.413921,624.934171,412.572226,693701.5,23.241321,100.868312,6531.69221,9.147854,6641.708377,2838987.0,22180570.0,0.0,25022940.0,1395036.0,21.190862,802.336357,503.825541,228224.7179,9.996355
std,287.25765,463.169069,240.615472,1095628.0,7.224848,84.764791,6075.871744,92.516177,6153.342453,4325137.0,50973060.0,0.0,54144020.0,1749352.0,6.668703,609.959797,315.970438,97430.878373,16.107429
min,31.08,176.0,144.0,8384.0,5.705752,7.0,175.0,0.0,192.0,11648.0,33845.0,0.0,191879.0,56000.0,12.0,176.0,144.0,22508.0,0.184
25%,106.765,320.0,240.0,134334.0,15.0,39.0,2374.0,0.0,2417.0,393395.0,1851539.0,0.0,2258222.0,109000.0,15.0,320.0,240.0,216820.0,2.096
50%,239.14166,480.0,360.0,291150.0,25.02174,80.0,5515.0,0.0,5628.0,945865.0,6166260.0,0.0,7881069.0,539000.0,24.0,480.0,360.0,219480.0,4.408
75%,379.32,640.0,480.0,652967.0,29.0,138.0,9155.0,0.0,9232.0,3392479.0,15155060.0,0.0,19773350.0,3000000.0,25.0,1280.0,720.0,219656.0,10.433
max,25844.086,1920.0,1080.0,7628466.0,48.0,5170.0,304959.0,9407.0,310129.0,90828550.0,768997000.0,0.0,806711100.0,5000000.0,29.97,1920.0,1080.0,711824.0,224.574


In [73]:
# Baseline model: Lasso regression with the estimator's default settings.
# (Lasso is imported in the notebook's import cell together with the other estimators.)
reg = Lasso()

In [74]:
# Fit the model on the training split.
reg.fit(X_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [75]:
# Predict the target for the held-out test rows.
pred = reg.predict(X_test)


In [76]:
# Mean squared error on the test split.
mean_squared_error(y_test,pred)

159.33724869664889

In [77]:
# Mean absolute error on the test split.
mean_absolute_error(y_test,pred)

6.387696469078152

In [78]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / max(|y_true|, eps)) * 100``, where ``eps`` is
        the float64 machine epsilon.

    Notes
    -----
    The denominator is clamped to ``eps`` so that a zero in ``y_true`` no longer
    produces ``nan`` (0/0, e.g. a perfectly predicted zero) or ``inf``. For
    targets whose magnitude is at least ``eps`` the result is the same as the
    unclamped formula.
    """
    # Convert up front so integer or unsigned inputs are subtracted as floats.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    eps = np.finfo(np.float64).eps
    relative_errors = np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps)
    return np.mean(relative_errors) * 100

In [79]:
# Mean absolute percentage error (in percent) on the test split.
mean_absolute_percentage_error(y_test,pred)

128.81963133106154

In [81]:
# Time repeated refits of the model on the training split (100 loops per run).
%timeit -n100 reg.fit(X_train,y_train)

14.3 ms ± 198 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
