In [179]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

In [180]:
data_frame = pd.read_csv('dstest.csv', delimiter=',', header=0)
number_of_samples = data_frame.shape[0] 
number_of_features = data_frame.shape[1]

In [181]:
data_frame.iloc[:, 1:].head()

Unnamed: 0,live,mediaEngine,p2p,cdn,upload,peers_count,peers_match,timestamp,sessionDuration,playbackErrorCount,totalPlaybackErrorCount,content
0,VOD,ME1,0.0,8507376.0,0.0,0.0,1,1562469807407,120000.0,0,0,content-09235
1,VOD,ME1,0.0,5778744.0,0.0,0.0,0,1562469927310,240000.0,0,0,content-09235
2,VOD,ME1,0.0,7692960.0,0.0,0.0,0,1562470047330,360000.0,0,0,content-09235
3,VOD,ME1,0.0,8229888.0,0.0,0.0,0,1562470167720,480000.0,0,0,content-09235
4,VOD,ME1,0.0,10387000.0,0.0,0.0,0,1562470287353,600000.0,0,0,content-09235


# Question 1

The easiest solution that comes to my mind: The Bit Rate as a streaming company's performance metric.
Bit rate - the total number of bits of data, transferred per unit of time (most commonly - per second, which means that the unit of measurement - bit/s or so-called bps):

$$Bit\ Rate = BR = \frac{number\ of\ bits,\ bits}{elapsed\ time,\ seconds} = \frac{b}{t}.$$

So all we have to do is calculate the average bitrate value for the given set of samples for each company. In the current case, each sample was taken after a constant time gap - 2 minutes - 120 seconds. Hence, the formula for our bitrate will look like:

$$BR_i = \frac{1}{N_i \cdot \Delta t} \cdot \sum_{j = 1}^{N_i}{b_{ij}},$$
where:
- $i \in \{1,\ ...,\ M\}$ - number of company;
- $M$ - total number of companies;
- $j \in \{1,\ ...,\ N_i\}$ - number of sample;
- $N_i$ - total number of samples for $i^{th}$ company;
- $b_{ij} =  p2p + cdn + upload$ - number of transferred bits of $j^{th}$ sample of $i^{th}$ company;
- $\Delta t = const = 120\ seconds$ - time interval.

In [182]:
def bitrate(data_frame):
    # choosing the features (so-called performance features), necessary to calculate performance: 
    interested_features = ['company', 'p2p', 'cdn', 'upload']

    # computing the mean value of each performance feature for each company:
    grouped_data_frame = data_frame[interested_features].groupby(['company']).mean()

    # adding indices:
    grouped_data_frame.reset_index(inplace=True)

    delta_t = 120 # our constant time interval

    # adding a new feature as a result of a sum of performance features values,
    # multiplied by 8 (from bytes to bits), divided by time interval
    grouped_data_frame['bitRate'] = grouped_data_frame[interested_features[1:]].sum().values * 8 / delta_t
    
    return grouped_data_frame

In [183]:
grouped_data_frame = bitrate(data_frame)
grouped_data_frame[['company', 'bitRate']].head(2)

Unnamed: 0,company,bitRate
0,Flash Infos,1850974.0
1,Sports News,3765295.0


# Question 2

## 2.1
The value of sessionDuration feature of the first user's payload will be the minimum value among all other samples (will be equal to 2 minutes = 120 secons = 120 * 10^3 milliseconds)

## 2.2

In [184]:
min_duration = data_frame[['sessionDuration']].min()[0]

for i in range(number_of_samples):
    if data_frame.at[i, 'sessionDuration'] == min_duration:
        data_frame.at[i, 'isFirstPayload'] = True
    else:
        data_frame.at[i, 'isFirstPayload'] = False
        
data_frame.iloc[:, 1:].head()

Unnamed: 0,live,mediaEngine,p2p,cdn,upload,peers_count,peers_match,timestamp,sessionDuration,playbackErrorCount,totalPlaybackErrorCount,content,isFirstPayload
0,VOD,ME1,0.0,8507376.0,0.0,0.0,1,1562469807407,120000.0,0,0,content-09235,True
1,VOD,ME1,0.0,5778744.0,0.0,0.0,0,1562469927310,240000.0,0,0,content-09235,False
2,VOD,ME1,0.0,7692960.0,0.0,0.0,0,1562470047330,360000.0,0,0,content-09235,False
3,VOD,ME1,0.0,8229888.0,0.0,0.0,0,1562470167720,480000.0,0,0,content-09235,False
4,VOD,ME1,0.0,10387000.0,0.0,0.0,0,1562470287353,600000.0,0,0,content-09235,False


## 2.3

In [185]:
number_of_viewers = data_frame[data_frame['isFirstPayload']].shape[0]
print("Number of different viewers: %d" %number_of_viewers)
data_frame[data_frame['isFirstPayload']].iloc[:, 1:].head()

Number of different viewers: 20834


Unnamed: 0,live,mediaEngine,p2p,cdn,upload,peers_count,peers_match,timestamp,sessionDuration,playbackErrorCount,totalPlaybackErrorCount,content,isFirstPayload
0,VOD,ME1,0.0,8507376.0,0.0,0.0,1,1562469807407,120000.0,0,0,content-09235,True
16,VOD,ME1,0.0,17608456.0,0.0,0.0,1,1562477387430,120000.0,0,0,content-17013,True
21,VOD,ME1,166409328.0,11303312.0,0.0,19.11,38,1562468705805,120000.0,0,0,content-08963,True
53,VOD,ME1,0.0,14936600.0,0.0,0.0,0,1562469688171,120000.0,0,0,content-14278,True
74,VOD,ME1,0.0,15740112.0,0.0,0.0,0,1562478964956,120000.0,0,0,content-16536,True


# Question 3

## 3.1
- 1st playback (T = 2 minutes after start):

<center>playbackErrorCount = 0; <br> totalPlaybackErrorCount = 0;</center>

- 2nd playback (T = 4 minutes after start):

<center>playbackErrorCount = 1; <br> totalPlaybackErrorCount = 1;</center>

- 3rd playback (T = 6 minutes after start):

<center>playbackErrorCount = 0; <br> totalPlaybackErrorCount = 1;</center>

- 4th playback (T = 8 minutes after start):

<center>playbackErrorCount = 1; <br> totalPlaybackErrorCount = 2;</center>

- 5th playback (T = 10 minutes after start):

<center>playbackErrorCount = 0; <br> totalPlaybackErrorCount = 2.</center>

## 3.2

In [186]:
# result value
number_of_viewers_with_errors = 0 

# first sample index of each distinct user:
viewers_start_indices = data_frame[data_frame['isFirstPayload']].index.values

for i in range(number_of_samples):
    if data_frame.at[i, 'playbackErrorCount'] > 0:
        number_of_viewers_with_errors += 1 # increasing result value when an error is found
        
        # end loop if it was the last user:
        if i > viewers_start_indices[number_of_viewers - 1]:
            break
        
        # defining the value of "jump" - the first index of the next user:
        for k in range(number_of_viewers):
            if k < number_of_viewers - 1 and viewers_start_indices[k] <= i < viewers_start_indices[k + 1]:
                # assigning the next user`s first index - 1, because for loop will increase
                # an index automatically at the beginning of the next iteartion:
                i = viewers_start_indices[k + 1] - 1
                break

print("Number of different viewers with playback errors: %d" %number_of_viewers_with_errors)

Number of different viewers with playback errors: 11


# Question 4

## 4.1

In [187]:
from datetime import datetime, timedelta

for i in range(number_of_samples):
    readable_data = (datetime.utcfromtimestamp(data_frame.loc[i]['timestamp'] / 1000) - timedelta(minutes=1)).strftime('%H:%M:%S')
    data_frame.at[i, 'readableDate'] = readable_data

In [188]:
data_frame.iloc[:, 1:].head()

Unnamed: 0,live,mediaEngine,p2p,cdn,upload,peers_count,peers_match,timestamp,sessionDuration,playbackErrorCount,totalPlaybackErrorCount,content,isFirstPayload,readableDate
0,VOD,ME1,0.0,8507376.0,0.0,0.0,1,1562469807407,120000.0,0,0,content-09235,True,03:22:27
1,VOD,ME1,0.0,5778744.0,0.0,0.0,0,1562469927310,240000.0,0,0,content-09235,False,03:24:27
2,VOD,ME1,0.0,7692960.0,0.0,0.0,0,1562470047330,360000.0,0,0,content-09235,False,03:26:27
3,VOD,ME1,0.0,8229888.0,0.0,0.0,0,1562470167720,480000.0,0,0,content-09235,False,03:28:27
4,VOD,ME1,0.0,10387000.0,0.0,0.0,0,1562470287353,600000.0,0,0,content-09235,False,03:30:27


## 4.2 - 4.4

There is no feature "content"