# Libraries

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gamma, norm, probplot, pearsonr, power_divergence

import warnings

# NOTE(review): this silences every warning category for the rest of the
# notebook, including NumPy RuntimeWarnings (e.g. log10 of zero in the log
# transformation cells) and library deprecation warnings; consider narrowing
# the filter to the specific categories that are expected.
warnings.filterwarnings("ignore")

# Auxiliary functions

In [73]:
def calculate_bin(data_column):
    """Return the number of histogram bins for ``data_column``.

    Uses the Sturges-style rule ``int(1 + 3.3 * log10(n))`` where ``n`` is the
    number of observations. The 3.3 coefficient is kept as-is so existing
    figures keep the same bin counts.

    Parameters
    ----------
    data_column : sized collection
        Any object supporting ``len()`` (e.g. a pandas Series or a list).

    Returns
    -------
    int
        Bin count, always at least 1.
    """
    n_observations = len(data_column)
    # log10(0) is -inf and int(-inf) raises OverflowError, so an empty input
    # falls back to a single bin instead of crashing the plotting cell.
    if n_observations < 1:
        return 1
    return int(1 + 3.3 * np.log10(n_observations))

# Read Data

In [74]:
# Load the Chromecast and Smart TV datasets from the shared data directory.
df_chromecast, df_smart_tv = map(
    pd.read_csv,
    ("../data/dataset_chromecast.csv", "../data/dataset_smart-tv.csv"),
)

# Data Preprocessing

## Log Transformation

### Chromecast

In [75]:
# log10-transform both traffic columns. A value of 0 bytes yields -inf, which
# is replaced by 0 in the resulting column.
df_chromecast["log_bytes_up"] = np.log10(df_chromecast["bytes_up"]).replace(
    -np.inf, 0
)
df_chromecast["log_bytes_down"] = np.log10(df_chromecast["bytes_down"]).replace(
    -np.inf, 0
)

### Smart TV

In [76]:
# log10-transform both traffic columns. A value of 0 bytes yields -inf, which
# is replaced by 0 in the resulting column.
df_smart_tv["log_bytes_down"] = np.log10(df_smart_tv["bytes_down"]).replace(
    -np.inf, 0
)
df_smart_tv["log_bytes_up"] = np.log10(df_smart_tv["bytes_up"]).replace(
    -np.inf, 0
)

## Create Hour Column

### Chromecast

In [77]:
# Hour of day as an integer: second space-separated token of "date_hour",
# up to its first colon (presumably "<date> HH:MM:SS" -- confirm against data).
df_chromecast["hour"] = [
    int(timestamp.split(" ")[1].split(":")[0])
    for timestamp in df_chromecast["date_hour"]
]

### Smart TV

In [78]:
# Hour of day as an integer: second space-separated token of "date_hour",
# up to its first colon (presumably "<date> HH:MM:SS" -- confirm against data).
df_smart_tv["hour"] = [
    int(timestamp.split(" ")[1].split(":")[0])
    for timestamp in df_smart_tv["date_hour"]
]

# Second Section

## Chromecast

### Histogram

In [None]:
# Histogram of the log10 upload volume for Chromecast, saved as a PNG.
fig, ax = plt.subplots()
ax.hist(
    df_chromecast["log_bytes_up"],
    bins=calculate_bin(df_chromecast["log_bytes_up"]),
    color="blue",
    alpha=0.5,
    edgecolor="black",
)
ax.set_xlabel("Bytes Up")
ax.set_ylabel("Frequency")
ax.set_title("Chromecast histogram of Bytes Up")
fig.savefig(
    "../images/chromecast/general/histogram/chromecast_hist_bytes_up.png",
    dpi=fig.dpi,
)

In [None]:
# Histogram of the log10 download volume for Chromecast, saved as a PNG.
fig, ax = plt.subplots()
ax.hist(
    df_chromecast["log_bytes_down"],
    bins=calculate_bin(df_chromecast["log_bytes_down"]),
    color="red",
    alpha=0.5,
    edgecolor="black",
)
ax.set_xlabel("Bytes Down")
ax.set_ylabel("Frequency")
ax.set_title("Chromecast histogram of Bytes Down")
fig.savefig(
    "../images/chromecast/general/histogram/chromecast_hist_bytes_down.png",
    dpi=fig.dpi,
)

### Empirical Distribution Function

In [None]:
# Empirical distribution function (EDF) of the log10 upload volume.
# The k-th smallest of n observations is drawn at height k/n, which is the
# EDF value at that point. The earlier np.linspace(0, 1, n) ordinates were
# (k-1)/(n-1), which put the sample minimum at 0 instead of 1/n.
edf_values = df_chromecast["log_bytes_up"].sort_values()
edf_heights = np.arange(1, len(edf_values) + 1) / len(edf_values)

fig = plt.figure()
plt.plot(
    edf_values,
    edf_heights,
    color="blue",
    marker=".",
)
plt.xlabel("Bytes Up")
plt.ylabel("Fx(x)")
plt.title("Chromecast Empirical Distribution Function of Bytes Up")
fig.savefig(
    "../images/chromecast/general/edf/chromecast_edf_bytes_up.png",
    dpi=fig.dpi,
)

In [None]:
# Empirical distribution function (EDF) of the log10 download volume.
# The k-th smallest of n observations is drawn at height k/n, which is the
# EDF value at that point. The earlier np.linspace(0, 1, n) ordinates were
# (k-1)/(n-1), which put the sample minimum at 0 instead of 1/n.
edf_values = df_chromecast["log_bytes_down"].sort_values()
edf_heights = np.arange(1, len(edf_values) + 1) / len(edf_values)

fig = plt.figure()
plt.plot(
    edf_values,
    edf_heights,
    color="red",
    marker=".",
)
plt.xlabel("Bytes Down")
plt.ylabel("Fx(x)")
plt.title("Chromecast Empirical Distribution Function of Bytes Down")
fig.savefig(
    "../images/chromecast/general/edf/chromecast_edf_bytes_down.png",
    dpi=fig.dpi,
)

### Boxplot

In [None]:
# Side-by-side boxplots of the log10 upload and download volumes (Chromecast).
fig, ax = plt.subplots()
ax.boxplot(
    [df_chromecast["log_bytes_up"], df_chromecast["log_bytes_down"]],
    labels=["Bytes Up", "Bytes Down"],
    patch_artist=True,
    boxprops={"facecolor": "cyan", "color": "black"},
    medianprops={"color": "black"},
    whiskerprops={"color": "black"},
    capprops={"color": "black"},
)
ax.set_ylabel("Bytes")
ax.set_title("Chromecast Boxplot of Bytes Up and Bytes Down")
fig.savefig(
    "../images/chromecast/general/boxplot/chromecast_boxplot_bytes_up_down.png",
    dpi=fig.dpi,
)

### Statistical Analysis

In [79]:
# Mean, variance and standard deviation of both log10 columns, one table
# column per traffic direction, printed as a markdown table.
chromecast_bytes_up_down = pd.DataFrame(
    {
        table_column: [series.mean(), series.var(), series.std()]
        for table_column, series in (
            ("Log Bytes Up", df_chromecast["log_bytes_up"]),
            ("Log Bytes Down", df_chromecast["log_bytes_down"]),
        )
    },
    index=["Mean", "Variance", "Standard Deviation"],
)

print(chromecast_bytes_up_down.to_markdown())

|                    |   Log Bytes Up |   Log Bytes Down |
|:-------------------|---------------:|-----------------:|
| Mean               |       3.34967  |          3.79934 |
| Variance           |       0.4616   |          1.66598 |
| Standard Deviation |       0.679412 |          1.29073 |


## Smart TV

### Histogram

In [None]:
# Histogram of the log10 upload volume for Smart TV, saved as a PNG.
fig, ax = plt.subplots()
ax.hist(
    df_smart_tv["log_bytes_up"],
    bins=calculate_bin(df_smart_tv["log_bytes_up"]),
    color="blue",
    alpha=0.5,
    edgecolor="black",
)
ax.set_xlabel("Bytes Up")
ax.set_ylabel("Frequency")
ax.set_title("Smart TV histogram of Bytes Up")
fig.savefig(
    "../images/smart_tv/general/histogram/smart_tv_hist_bytes_up.png",
    dpi=fig.dpi,
)

In [None]:
# Histogram of the log10 download volume for Smart TV, saved as a PNG.
fig, ax = plt.subplots()
ax.hist(
    df_smart_tv["log_bytes_down"],
    bins=calculate_bin(df_smart_tv["log_bytes_down"]),
    color="red",
    alpha=0.5,
    edgecolor="black",
)
ax.set_xlabel("Bytes Down")
ax.set_ylabel("Frequency")
ax.set_title("Smart TV histogram of Bytes Down")
fig.savefig(
    "../images/smart_tv/general/histogram/smart_tv_hist_bytes_down.png",
    dpi=fig.dpi,
)

### Empirical Distribution Function

In [None]:
# Empirical distribution function (EDF) of the log10 upload volume.
# The k-th smallest of n observations is drawn at height k/n, which is the
# EDF value at that point. The earlier np.linspace(0, 1, n) ordinates were
# (k-1)/(n-1), which put the sample minimum at 0 instead of 1/n.
edf_values = df_smart_tv["log_bytes_up"].sort_values()
edf_heights = np.arange(1, len(edf_values) + 1) / len(edf_values)

fig = plt.figure()
plt.plot(
    edf_values,
    edf_heights,
    color="blue",
    marker=".",
)
plt.xlabel("Bytes Up")
plt.ylabel("Fx(x)")
plt.title("Smart TV Empirical Distribution Function of Bytes Up")
fig.savefig(
    "../images/smart_tv/general/edf/smart_tv_edf_bytes_up.png",
    dpi=fig.dpi,
)

In [None]:
# Empirical distribution function (EDF) of the log10 download volume.
# The k-th smallest of n observations is drawn at height k/n, which is the
# EDF value at that point. The earlier np.linspace(0, 1, n) ordinates were
# (k-1)/(n-1), which put the sample minimum at 0 instead of 1/n.
edf_values = df_smart_tv["log_bytes_down"].sort_values()
edf_heights = np.arange(1, len(edf_values) + 1) / len(edf_values)

fig = plt.figure()
plt.plot(
    edf_values,
    edf_heights,
    color="red",
    marker=".",
)
plt.xlabel("Bytes Down")
plt.ylabel("Fx(x)")
plt.title("Smart TV Empirical Distribution Function of Bytes Down")
fig.savefig(
    "../images/smart_tv/general/edf/smart_tv_edf_bytes_down.png",
    dpi=fig.dpi,
)

### Boxplot

In [None]:
# Side-by-side boxplots of the log10 upload and download volumes (Smart TV).
fig, ax = plt.subplots()
ax.boxplot(
    [df_smart_tv["log_bytes_up"], df_smart_tv["log_bytes_down"]],
    labels=["Bytes Up", "Bytes Down"],
    patch_artist=True,
    boxprops={"facecolor": "cyan", "color": "black"},
    medianprops={"color": "black"},
    whiskerprops={"color": "black"},
    capprops={"color": "black"},
)
ax.set_ylabel("Bytes")
ax.set_title("Smart TV Boxplot of Bytes Up and Bytes Down")
fig.savefig(
    "../images/smart_tv/general/boxplot/smart_tv_boxplot_bytes_up_down.png",
    dpi=fig.dpi,
)

### Statistical Analysis

In [80]:
# Mean, variance and standard deviation of both log10 columns, one table
# column per traffic direction, printed as a markdown table.
smart_tv_bytes_up_down = pd.DataFrame(
    {
        table_column: [series.mean(), series.var(), series.std()]
        for table_column, series in (
            ("Log Bytes Up", df_smart_tv["log_bytes_up"]),
            ("Log Bytes Down", df_smart_tv["log_bytes_down"]),
        )
    },
    index=["Mean", "Variance", "Standard Deviation"],
)

print(smart_tv_bytes_up_down.to_markdown())

|                    |   Log Bytes Up |   Log Bytes Down |
|:-------------------|---------------:|-----------------:|
| Mean               |        2.15659 |          2.35017 |
| Variance           |        4.11308 |          6.72392 |
| Standard Deviation |        2.02807 |          2.59305 |


# Third Section

## Chromecast

### Boxplot

In [None]:
# One boxplot figure per hour of the day (0-23) comparing the log10 upload
# and download volumes of Chromecast, each saved to its own PNG.
for hour in range(0, 24):
    # Filter once per hour instead of once per plotted column.
    hourly_rows = df_chromecast[df_chromecast["hour"] == hour]

    fig = plt.figure()
    plt.boxplot(
        [
            hourly_rows["log_bytes_up"],
            hourly_rows["log_bytes_down"],
        ],
        labels=["Bytes Up", "Bytes Down"],
        patch_artist=True,
        boxprops=dict(facecolor="cyan", color="black"),
        medianprops=dict(color="black"),
        whiskerprops=dict(color="black"),
        capprops=dict(color="black"),
    )
    plt.ylabel("Bytes")
    plt.title("Chromecast Boxplot of Bytes Up and Bytes Down for Hour " + str(hour))
    fig.savefig(
        f"../images/chromecast/hourly/boxplot/chromecast_boxplot_bytes_up_down_hour_{hour}.png",
        dpi=fig.dpi,
    )
    # Render and release each figure inside the loop (as the other plotting
    # loops in this notebook do) so 24 figures are not held open at once.
    plt.show()
    plt.close(fig)

### Statistical Analysis

In [None]:
# Per-hour mean, variance and standard deviation of the log10 upload volume
# (Chromecast), drawn as three lines on one axes.
hourly_log_up = df_chromecast.groupby("hour")["log_bytes_up"]

fig, ax = plt.subplots()
ax.plot(hourly_log_up.mean(), color="red", marker=".", label="Mean")
ax.plot(hourly_log_up.var(), color="green", marker=".", label="Variance")
ax.plot(hourly_log_up.std(), color="blue", marker=".", label="Standard Deviation")
ax.set_xlabel("Hour")
ax.set_ylabel("Bytes")
ax.legend(loc="best")
ax.set_title("Chromecast Bytes Up Mean, Variance and Standard Deviation by Hour")
fig.savefig(
    "../images/chromecast/hourly/statistical_analysis/chromecast_line_bytes_up_mean_var_std_hour.png",
    dpi=fig.dpi,
)

In [None]:
# Per-hour mean, variance and standard deviation of the log10 download volume
# (Chromecast), drawn as three lines on one figure.
fig = plt.figure()
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_down"].mean(),
    color="red",
    marker=".",
    label="Mean",
)
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_down"].var(),
    color="green",
    marker=".",
    label="Variance",
)
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_down"].std(),
    color="blue",
    marker=".",
    label="Standard Deviation",
)
plt.xlabel("Hour")
plt.ylabel("Bytes")
plt.legend(loc="best")
# The title previously read "Bytes Up" (copy-paste from the upload cell)
# although this figure plots log_bytes_down and is saved as *_bytes_down_*.
plt.title("Chromecast Bytes Down Mean, Variance and Standard Deviation by Hour")
fig.savefig(
    "../images/chromecast/hourly/statistical_analysis/chromecast_line_bytes_down_mean_var_std_hour.png",
    dpi=fig.dpi,
)

## Smart TV

### Boxplot

In [None]:
# One boxplot figure per hour of the day (0-23) comparing the log10 upload
# and download volumes of Smart TV, each saved to its own PNG.
for hour in range(0, 24):
    # Filter once per hour instead of once per plotted column.
    hourly_rows = df_smart_tv[df_smart_tv["hour"] == hour]

    fig = plt.figure()
    plt.boxplot(
        [
            hourly_rows["log_bytes_up"],
            hourly_rows["log_bytes_down"],
        ],
        labels=["Bytes Up", "Bytes Down"],
        patch_artist=True,
        boxprops=dict(facecolor="cyan", color="black"),
        medianprops=dict(color="black"),
        whiskerprops=dict(color="black"),
        capprops=dict(color="black"),
    )
    plt.ylabel("Bytes")
    plt.title("Smart TV Boxplot of Bytes Up and Bytes Down for Hour " + str(hour))
    fig.savefig(
        f"../images/smart_tv/hourly/boxplot/smart_tv_boxplot_bytes_up_down_hour_{hour}.png",
        dpi=fig.dpi,
    )
    # Render and release each figure inside the loop (as the other plotting
    # loops in this notebook do) so 24 figures are not held open at once.
    plt.show()
    plt.close(fig)

### Statistical Analysis

In [None]:
# Per-hour mean, variance and standard deviation of the log10 upload volume
# (Smart TV), drawn as three lines on one axes.
hourly_log_up = df_smart_tv.groupby("hour")["log_bytes_up"]

fig, ax = plt.subplots()
ax.plot(hourly_log_up.mean(), color="red", marker=".", label="Mean")
ax.plot(hourly_log_up.var(), color="green", marker=".", label="Variance")
ax.plot(hourly_log_up.std(), color="blue", marker=".", label="Standard Deviation")
ax.set_xlabel("Hour")
ax.set_ylabel("Bytes")
ax.legend(loc="best")
ax.set_title("Smart TV Bytes Up Mean, Variance and Standard Deviation by Hour")
fig.savefig(
    "../images/smart_tv/hourly/statistical_analysis/smart_tv_line_bytes_up_mean_var_std_hour.png",
    dpi=fig.dpi,
)

In [None]:
# Per-hour mean, variance and standard deviation of the log10 download volume
# (Smart TV), drawn as three lines on one axes.
hourly_log_down = df_smart_tv.groupby("hour")["log_bytes_down"]

fig, ax = plt.subplots()
ax.plot(hourly_log_down.mean(), color="red", marker=".", label="Mean")
ax.plot(hourly_log_down.var(), color="green", marker=".", label="Variance")
ax.plot(hourly_log_down.std(), color="blue", marker=".", label="Standard Deviation")
ax.set_xlabel("Hour")
ax.set_ylabel("Bytes")
ax.legend(loc="best")
ax.set_title("Smart TV Bytes Down Mean, Variance and Standard Deviation by Hour")
fig.savefig(
    "../images/smart_tv/hourly/statistical_analysis/smart_tv_line_bytes_down_mean_var_std_hour.png",
    dpi=fig.dpi,
)

# Fourth Section

## Chromecast

### Filter Dataset

In [81]:
# Hours at which the per-hour median / mean of each log10 column peaks.
chromecast_up_by_hour = df_chromecast.groupby("hour")["log_bytes_up"]
chromecast_down_by_hour = df_chromecast.groupby("hour")["log_bytes_down"]

hour_of_max_median_up_chromecast = chromecast_up_by_hour.median().idxmax()
hour_of_max_mean_up_chromecast = chromecast_up_by_hour.mean().idxmax()
hour_of_max_median_down_chromecast = chromecast_down_by_hour.median().idxmax()
hour_of_max_mean_down_chromecast = chromecast_down_by_hour.mean().idxmax()

In [82]:
# Report the four peak hours computed for Chromecast. The triple-quoted
# f-string is printed verbatim, including its indentation and line breaks.
print(
    f"""
        Hour of max median bytes up: {hour_of_max_median_up_chromecast},
        Hour of max median bytes down: {hour_of_max_median_down_chromecast}, 
        Hour of max mean bytes up: {hour_of_max_mean_up_chromecast}, 
        Hour of max mean bytes down: {hour_of_max_mean_down_chromecast}
    """
)


        Hour of max median bytes up: 22,
        Hour of max median bytes down: 23, 
        Hour of max mean bytes up: 22, 
        Hour of max mean bytes down: 23
    


In [83]:
# Rows of the peak hour for each (statistic, direction) pair, restricted to
# the columns used by the later sections.
df_chromecast_max_median_up = df_chromecast.loc[
    df_chromecast["hour"] == hour_of_max_median_up_chromecast,
    ["hour", "bytes_up", "log_bytes_up", "device_id"],
]

df_chromecast_max_mean_up = df_chromecast.loc[
    df_chromecast["hour"] == hour_of_max_mean_up_chromecast,
    ["hour", "bytes_up", "log_bytes_up", "device_id"],
]

df_chromecast_max_median_down = df_chromecast.loc[
    df_chromecast["hour"] == hour_of_max_median_down_chromecast,
    ["hour", "bytes_down", "log_bytes_down", "device_id"],
]

df_chromecast_max_mean_down = df_chromecast.loc[
    df_chromecast["hour"] == hour_of_max_mean_down_chromecast,
    ["hour", "bytes_down", "log_bytes_down", "device_id"],
]

### Histogram

In [None]:
# Histogram of the log10 volume for each peak-hour Chromecast subset; upload
# columns are drawn in blue, download columns in red.
for df, column in (
    (df_chromecast_max_median_up, "log_bytes_up"),
    (df_chromecast_max_median_down, "log_bytes_down"),
    (df_chromecast_max_mean_up, "log_bytes_up"),
    (df_chromecast_max_mean_down, "log_bytes_down"),
):
    fig, ax = plt.subplots()
    ax.hist(
        df[column],
        bins=calculate_bin(df[column]),
        color={"log_bytes_up": "blue"}.get(column, "red"),
        alpha=0.5,
        edgecolor="black",
    )
    ax.set_xlabel("Bytes")
    ax.set_ylabel("Frequency")
    # Every row of a subset shares one hour, so the first row identifies it.
    hour_label = df["hour"].iloc[0]
    ax.set_title(f"Chromecast Histogram of {column} for Hour {hour_label}")
    fig.savefig(
        f"../images/chromecast/fourth_section/histogram/chromecast_histogram_{column}_hour_{hour_label}.png",
        dpi=fig.dpi,
    )

### MLE

#### Gamma

In [84]:
# Fit a gamma distribution (scipy's gamma.fit) to each peak-hour Chromecast
# sample and print the fitted shape / loc / scale as a markdown table.
for name, (df, column) in {
    "chromecast_max_median_up": (df_chromecast_max_median_up, "log_bytes_up"),
    "chromecast_max_median_down": (df_chromecast_max_median_down, "log_bytes_down"),
    "chromecast_max_mean_up": (df_chromecast_max_mean_up, "log_bytes_up"),
    "chromecast_max_mean_down": (df_chromecast_max_mean_down, "log_bytes_down"),
}.items():
    shape, loc, scale = gamma.fit(df[column])
    print(f"Gamma MLE parameters for {name}")
    print(pd.DataFrame([{"shape": shape, "loc": loc, "scale": scale}]).to_markdown())
    print()

Gamma MLE parameters for chromecast_max_median_up
|    |   shape |      loc |     scale |
|---:|--------:|---------:|----------:|
|  0 | 5159.29 | -51.9929 | 0.0107599 |

Gamma MLE parameters for chromecast_max_median_down
|    |   shape |      loc |    scale |
|---:|--------:|---------:|---------:|
|  0 | 27.2648 | -3.65507 | 0.282678 |

Gamma MLE parameters for chromecast_max_mean_up
|    |   shape |      loc |     scale |
|---:|--------:|---------:|----------:|
|  0 | 5159.29 | -51.9929 | 0.0107599 |

Gamma MLE parameters for chromecast_max_mean_down
|    |   shape |      loc |    scale |
|---:|--------:|---------:|---------:|
|  0 | 27.2648 | -3.65507 | 0.282678 |



#### Gaussian

In [85]:
# Print the sample mean and median of each peak-hour Chromecast sample.
for dataframe_name, (df, column) in {
    "df_chromecast_max_median_up": (df_chromecast_max_median_up, "log_bytes_up"),
    "df_chromecast_max_median_down": (df_chromecast_max_median_down, "log_bytes_down"),
    "df_chromecast_max_mean_up": (df_chromecast_max_mean_up, "log_bytes_up"),
    "df_chromecast_max_mean_down": (df_chromecast_max_mean_down, "log_bytes_down"),
}.items():
    sample = df[column]
    print(f"Mean and Median for {dataframe_name}")
    print(
        pd.DataFrame([{"mean": sample.mean(), "median": sample.median()}]).to_markdown()
    )
    print()

Mean and Median for df_chromecast_max_median_up
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.52102 |  3.44365 |

Mean and Median for df_chromecast_max_median_down
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 4.05207 |  4.28564 |

Mean and Median for df_chromecast_max_mean_up
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.52102 |  3.44365 |

Mean and Median for df_chromecast_max_mean_down
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 4.05207 |  4.28564 |



### Histogram with MLE

In [None]:
# Density histogram of each peak-hour Chromecast sample overlaid with the
# fitted gamma and Gaussian probability density functions.
for df, column in (
    (df_chromecast_max_median_up, "log_bytes_up"),
    (df_chromecast_max_median_down, "log_bytes_down"),
    (df_chromecast_max_mean_up, "log_bytes_up"),
    (df_chromecast_max_mean_down, "log_bytes_down"),
):
    sample = df[column]

    fig, ax = plt.subplots()
    ax.hist(
        sample,
        bins=calculate_bin(sample),
        color={"log_bytes_up": "blue"}.get(column, "red"),
        alpha=0.5,
        edgecolor="black",
        label="Histogram",
        density=True,
    )

    # Evaluation grid spanning the observed range, one point per observation.
    grid = np.linspace(sample.min(), sample.max(), len(sample))

    shape, loc, scale = gamma.fit(sample)
    mean, std = norm.fit(sample)

    ax.plot(
        grid, gamma.pdf(grid, shape, loc=loc, scale=scale), label="Gamma", color="green"
    )
    ax.plot(grid, norm.pdf(grid, loc=mean, scale=std), label="Gaussian", color="orange")
    ax.set_xlabel("Bytes")
    ax.set_ylabel("Frequency")

    hour_label = df["hour"].iloc[0]
    ax.set_title(f"Chromecast Histogram of {column} for Hour {hour_label}")
    ax.legend(loc="best")
    fig.savefig(
        f"../images/chromecast/fourth_section/histogram_mle/chromecast_histogram_mle_{column}_hour_{hour_label}.png",
        dpi=fig.dpi,
    )
    print(f"Hour {hour_label} - {column} - mean: {mean}, std: {std}")

### Probability Plot

In [None]:
# Probability plots of each peak-hour Chromecast sample against a gamma
# distribution (left) and a Gaussian distribution (right).
for df, column, name in [
    (df_chromecast_max_median_up, "log_bytes_up", "chromecast_max_median_up"),
    (df_chromecast_max_median_down, "log_bytes_down", "chromecast_max_median_down"),
    (df_chromecast_max_mean_up, "log_bytes_up", "chromecast_max_mean_up"),
    (df_chromecast_max_mean_down, "log_bytes_down", "chromecast_max_mean_down"),
]:
    x = df[column]

    # Fit the gamma parameters for THIS sample. The cell used to read
    # `shape, loc, scale` left over from the last iteration of an earlier
    # cell, so every gamma plot used the same fit and the cell failed on a
    # fresh kernel unless that earlier cell had been run.
    gamma_params = gamma.fit(x)

    fig, axes = plt.subplots(1, 2, figsize=(18, 5))

    fig.suptitle(f'Chromecast {column} Hour {df["hour"].iloc[0]}')

    probplot(
        x,
        dist=gamma,
        sparams=gamma_params,
        plot=axes[0],
    )
    probplot(
        x,
        dist=norm,
        sparams=(x.mean(), x.std()),
        plot=axes[1],
    )

    axes[0].set_title(f"Gamma Distribution - {name}")
    axes[1].set_title(f"Gaussian Distribution - {name}")

    axes[0].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")
    axes[1].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")

    plt.show()

## Smart TV

### Filter Dataset

In [86]:
# Hours at which the per-hour median / mean of each log10 column peaks.
smart_tv_up_by_hour = df_smart_tv.groupby("hour")["log_bytes_up"]
smart_tv_down_by_hour = df_smart_tv.groupby("hour")["log_bytes_down"]

hour_of_max_median_up_smart_tv = smart_tv_up_by_hour.median().idxmax()
hour_of_max_mean_up_smart_tv = smart_tv_up_by_hour.mean().idxmax()
hour_of_max_median_down_smart_tv = smart_tv_down_by_hour.median().idxmax()
hour_of_max_mean_down_smart_tv = smart_tv_down_by_hour.mean().idxmax()

In [87]:
# Report the four peak hours computed for Smart TV. The triple-quoted
# f-string is printed verbatim, including its indentation and line breaks.
print(
    f"""
        Hour of max median bytes up: {hour_of_max_median_up_smart_tv},
        Hour of max median bytes down: {hour_of_max_median_down_smart_tv},
        Hour of max mean bytes up: {hour_of_max_mean_up_smart_tv}, 
        Hour of max mean bytes down: {hour_of_max_mean_down_smart_tv}
    """
)


        Hour of max median bytes up: 20,
        Hour of max median bytes down: 20,
        Hour of max mean bytes up: 20, 
        Hour of max mean bytes down: 20
    


In [88]:
# Rows of the peak hour for each (statistic, direction) pair, restricted to
# the columns used by the later sections.
df_smart_tv_max_median_up = df_smart_tv.loc[
    df_smart_tv["hour"] == hour_of_max_median_up_smart_tv,
    ["device_id", "hour", "bytes_up", "log_bytes_up"],
]

df_smart_tv_max_median_down = df_smart_tv.loc[
    df_smart_tv["hour"] == hour_of_max_median_down_smart_tv,
    ["device_id", "hour", "bytes_down", "log_bytes_down"],
]

df_smart_tv_max_mean_up = df_smart_tv.loc[
    df_smart_tv["hour"] == hour_of_max_mean_up_smart_tv,
    ["device_id", "hour", "bytes_up", "log_bytes_up"],
]

df_smart_tv_max_mean_down = df_smart_tv.loc[
    df_smart_tv["hour"] == hour_of_max_mean_down_smart_tv,
    ["device_id", "hour", "bytes_down", "log_bytes_down"],
]

### Histogram

In [None]:
# Histogram of the log10 volume for each peak-hour Smart TV subset; upload
# columns are drawn in blue, download columns in red.
for df, column in (
    (df_smart_tv_max_median_up, "log_bytes_up"),
    (df_smart_tv_max_median_down, "log_bytes_down"),
    (df_smart_tv_max_mean_up, "log_bytes_up"),
    (df_smart_tv_max_mean_down, "log_bytes_down"),
):
    fig, ax = plt.subplots()
    ax.hist(
        df[column],
        bins=calculate_bin(df[column]),
        color={"log_bytes_up": "blue"}.get(column, "red"),
        alpha=0.5,
        edgecolor="black",
    )
    ax.set_xlabel("Bytes")
    ax.set_ylabel("Frequency")
    # Every row of a subset shares one hour, so the first row identifies it.
    hour_label = df["hour"].iloc[0]
    ax.set_title(f"Smart TV Histogram of {column} for Hour {hour_label}")
    fig.savefig(
        f"../images/smart_tv/fourth_section/histogram/smart_tv_histogram_{column}_hour_{hour_label}.png",
        dpi=fig.dpi,
    )

### MLE

#### Gamma

In [89]:
# Fit a gamma distribution (scipy's gamma.fit) to each peak-hour Smart TV
# sample and print the fitted shape / loc / scale as a markdown table.
for name, (df, column) in {
    "smart_tv_max_median_up": (df_smart_tv_max_median_up, "log_bytes_up"),
    "smart_tv_max_median_down": (df_smart_tv_max_median_down, "log_bytes_down"),
    "smart_tv_max_mean_up": (df_smart_tv_max_mean_up, "log_bytes_up"),
    "smart_tv_max_mean_down": (df_smart_tv_max_mean_down, "log_bytes_down"),
}.items():
    shape, loc, scale = gamma.fit(df[column])
    print(f"Gamma MLE parameters for {name}")
    print(pd.DataFrame([{"shape": shape, "loc": loc, "scale": scale}]).to_markdown())
    print()

Gamma MLE parameters for smart_tv_max_median_up
|    |   shape |      loc |    scale |
|---:|--------:|---------:|---------:|
|  0 | 209.541 | -23.3404 | 0.126262 |

Gamma MLE parameters for smart_tv_max_median_down
|    |   shape |      loc |     scale |
|---:|--------:|---------:|----------:|
|  0 | 893.332 | -71.0756 | 0.0833618 |

Gamma MLE parameters for smart_tv_max_mean_up
|    |   shape |      loc |    scale |
|---:|--------:|---------:|---------:|
|  0 | 209.541 | -23.3404 | 0.126262 |

Gamma MLE parameters for smart_tv_max_mean_down
|    |   shape |      loc |     scale |
|---:|--------:|---------:|----------:|
|  0 | 893.332 | -71.0756 | 0.0833618 |



#### Gaussian

In [90]:
# Print the sample mean and median of each peak-hour Smart TV sample.
for dataframe_name, (df, column) in {
    "df_smart_tv_max_median_up": (df_smart_tv_max_median_up, "log_bytes_up"),
    "df_smart_tv_max_median_down": (df_smart_tv_max_median_down, "log_bytes_down"),
    "df_smart_tv_max_mean_up": (df_smart_tv_max_mean_up, "log_bytes_up"),
    "df_smart_tv_max_mean_down": (df_smart_tv_max_mean_down, "log_bytes_down"),
}.items():
    sample = df[column]
    print(f"Mean and Median for {dataframe_name}")
    print(
        pd.DataFrame([{"mean": sample.mean(), "median": sample.median()}]).to_markdown()
    )
    print()

Mean and Median for df_smart_tv_max_median_up
|    |   mean |   median |
|---:|-------:|---------:|
|  0 | 3.1228 |  3.53039 |

Mean and Median for df_smart_tv_max_median_down
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.39402 |  2.88905 |

Mean and Median for df_smart_tv_max_mean_up
|    |   mean |   median |
|---:|-------:|---------:|
|  0 | 3.1228 |  3.53039 |

Mean and Median for df_smart_tv_max_mean_down
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.39402 |  2.88905 |



### Histogram with MLE

In [None]:
# Density histogram of each peak-hour Smart TV sample overlaid with the
# fitted gamma and Gaussian probability density functions.
for df, column in (
    (df_smart_tv_max_median_up, "log_bytes_up"),
    (df_smart_tv_max_median_down, "log_bytes_down"),
    (df_smart_tv_max_mean_up, "log_bytes_up"),
    (df_smart_tv_max_mean_down, "log_bytes_down"),
):
    sample = df[column]

    fig, ax = plt.subplots()
    ax.hist(
        sample,
        bins=calculate_bin(sample),
        color={"log_bytes_up": "blue"}.get(column, "red"),
        alpha=0.5,
        edgecolor="black",
        label="Histogram",
        density=True,
    )

    # Evaluation grid spanning the observed range, one point per observation.
    grid = np.linspace(sample.min(), sample.max(), len(sample))

    shape, loc, scale = gamma.fit(sample)
    mean, std = norm.fit(sample)

    ax.plot(
        grid, gamma.pdf(grid, shape, loc=loc, scale=scale), label="Gamma", color="green"
    )
    ax.plot(grid, norm.pdf(grid, loc=mean, scale=std), label="Gaussian", color="orange")
    ax.set_xlabel("Bytes")
    ax.set_ylabel("Frequency")

    hour_label = df["hour"].iloc[0]
    ax.set_title(f"Smart TV Histogram of {column} for Hour {hour_label}")
    ax.legend(loc="best")
    fig.savefig(
        f"../images/smart_tv/fourth_section/histogram_mle/smart_tv_histogram_mle_{column}_hour_{hour_label}.png",
        dpi=fig.dpi,
    )
    print(f"Hour {hour_label} - {column} - mean: {mean}, std: {std}")

### Probability Plot

In [None]:
# Probability plots of each peak-hour Smart TV sample against a gamma
# distribution (left) and a Gaussian distribution (right).
for df, column, name in [
    (df_smart_tv_max_median_up, "log_bytes_up", "smart_tv_max_median_up"),
    (df_smart_tv_max_median_down, "log_bytes_down", "smart_tv_max_median_down"),
    (df_smart_tv_max_mean_up, "log_bytes_up", "smart_tv_max_mean_up"),
    (df_smart_tv_max_mean_down, "log_bytes_down", "smart_tv_max_mean_down"),
]:
    x = df[column]

    # Fit the gamma parameters for THIS sample. The cell used to read
    # `shape, loc, scale` left over from the last iteration of an earlier
    # cell, so every gamma plot used the same fit and the cell failed on a
    # fresh kernel unless that earlier cell had been run.
    gamma_params = gamma.fit(x)

    fig, axes = plt.subplots(1, 2, figsize=(18, 5))

    fig.suptitle(f'Smart TV {column} Hour {df["hour"].iloc[0]}')

    probplot(x, dist=gamma, sparams=gamma_params, plot=axes[0])
    probplot(x, dist=norm, sparams=(x.mean(), x.std()), plot=axes[1])

    axes[0].set_title(f"Gamma Distribution - {name}")
    axes[1].set_title(f"Gaussian Distribution - {name}")

    axes[0].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")
    axes[1].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")

    plt.show()

# Fifth Section

## Sampling Correlation Coefficient

In [91]:
# Pearson correlation between the log10 upload sample (df1) and the log10
# download sample (df2) of each device / statistic pair.

# Fixed seed so the down-sampling below, and therefore the printed
# coefficients and p-values, are reproducible across runs.
pearson_sample_seed = 42

for df1, df2, name in [
    (
        df_smart_tv_max_median_up,
        df_smart_tv_max_median_down,
        "smart_tv_max_median",
    ),
    (
        df_smart_tv_max_mean_up,
        df_smart_tv_max_mean_down,
        "smart_tv_max_mean",
    ),
    (
        df_chromecast_max_median_up,
        df_chromecast_max_median_down,
        "chromecast_max_median",
    ),
    (
        df_chromecast_max_mean_up,
        df_chromecast_max_mean_down,
        "chromecast_max_mean",
    ),
]:
    # pearsonr needs two equally long inputs, so only the larger frame is
    # randomly reduced; equal-sized frames keep their row order.
    # NOTE(review): when the sizes differ the two frames hold rows from
    # different hours, so the values are paired arbitrarily and the
    # coefficient should be interpreted with care.
    common_size = min(df1.shape[0], df2.shape[0])
    if df1.shape[0] > common_size:
        df1 = df1.sample(n=common_size, random_state=pearson_sample_seed)
    elif df2.shape[0] > common_size:
        df2 = df2.sample(n=common_size, random_state=pearson_sample_seed)

    pearson_coef, p_value = pearsonr(df1["log_bytes_up"], df2["log_bytes_down"])
    df_coef = pd.DataFrame(
        {"pearson_coef": [pearson_coef], "p_value": [p_value], "name": [name]}
    )
    print(f"Pearson's correlation coefficient for {name}")
    print(df_coef.to_markdown())
    print()

Pearson's correlation coefficient for smart_tv_max_median
|    |   pearson_coef |   p_value | name                |
|---:|---------------:|----------:|:--------------------|
|  0 |       0.915477 |         0 | smart_tv_max_median |

Pearson's correlation coefficient for smart_tv_max_mean
|    |   pearson_coef |   p_value | name              |
|---:|---------------:|----------:|:------------------|
|  0 |       0.915477 |         0 | smart_tv_max_mean |

Pearson's correlation coefficient for chromecast_max_median
|    |   pearson_coef |   p_value | name                  |
|---:|---------------:|----------:|:----------------------|
|  0 |     0.00326364 |  0.381689 | chromecast_max_median |

Pearson's correlation coefficient for chromecast_max_mean
|    |   pearson_coef |   p_value | name                |
|---:|---------------:|----------:|:--------------------|
|  0 |    0.000710825 |  0.848894 | chromecast_max_mean |



## Sample Correlation Coefficient Graph

In [None]:
# Scatter plot of log10 upload (df1) against log10 download (df2) for each
# device / statistic pair, saved as a PNG and shown inline.

# Fixed seed so the down-sampling below produces the same figure on each run.
scatter_sample_seed = 42

for df1, df2, name in [
    (
        df_smart_tv_max_median_up,
        df_smart_tv_max_median_down,
        "smart_tv_max_median_up_smart_tv_max_median_down",
    ),
    (
        df_smart_tv_max_mean_up,
        df_smart_tv_max_mean_down,
        "smart_tv_max_mean_up_smart_tv_max_mean_down",
    ),
    (
        df_chromecast_max_median_up,
        df_chromecast_max_median_down,
        "chromecast_max_median_up_chromecast_max_median_down",
    ),
    (
        df_chromecast_max_mean_up,
        df_chromecast_max_mean_down,
        "chromecast_max_mean_up_chromecast_max_mean_down",
    ),
]:
    # Reduce only the larger frame. Sampling both frames unconditionally
    # shuffled them independently, which broke the row-by-row pairing of
    # equal-sized frames and made the scatter disagree with the Pearson cell,
    # which keeps equal-sized frames untouched.
    min_size = min(len(df1), len(df2))
    if len(df1) > min_size:
        df1 = df1.sample(n=min_size, random_state=scatter_sample_seed)
    if len(df2) > min_size:
        df2 = df2.sample(n=min_size, random_state=scatter_sample_seed)

    fig = plt.figure()
    plt.scatter(df1["log_bytes_up"], df2["log_bytes_down"])
    plt.xlabel("Upload Rate")
    plt.ylabel("Download Rate")
    plt.title(f"Scatter Plot for {name}")
    fig.savefig(
        f"../images/fifth_section/scatter_plot/scatter_plot_{name}.png",
        dpi=fig.dpi,
    )
    plt.show()

# Sixth Section

## G-test

In [92]:
# G-test (log-likelihood ratio) between the Smart TV and Chromecast samples
# of each peak-hour subset. One record is collected per comparison and the
# result table is built once at the end: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic anyway.
g_test_records = []

# Fixed seed so the random sub-selection below is reproducible across runs.
g_test_sample_seed = 42

for df1, df2, name, column in [
    (
        df_smart_tv_max_median_up,
        df_chromecast_max_median_up,
        "smart_tv_max_median_up_chromecast_max_median_up",
        "log_bytes_up",
    ),
    (
        df_smart_tv_max_mean_up,
        df_chromecast_max_mean_up,
        "smart_tv_max_mean_up_chromecast_max_mean_up",
        "log_bytes_up",
    ),
    (
        df_smart_tv_max_median_down,
        df_chromecast_max_median_down,
        "smart_tv_max_median_down_chromecast_max_median_down",
        "log_bytes_down",
    ),
    (
        df_smart_tv_max_mean_down,
        df_chromecast_max_mean_down,
        "smart_tv_max_mean_down_chromecast_max_mean_down",
        "log_bytes_down",
    ),
]:
    # Occurrence count of every distinct value in each sample.
    observed_upload_df1 = df1[column].value_counts()
    observed_upload_df2 = df2[column].value_counts()

    # Keep the same number of (randomly chosen) distinct values on both sides
    # so the two frequency vectors have equal length.
    min_size = min(len(observed_upload_df1), len(observed_upload_df2))
    observed_upload_df1 = observed_upload_df1.sample(
        n=min_size, random_state=g_test_sample_seed
    )
    observed_upload_df2 = observed_upload_df2.sample(
        n=min_size, random_state=g_test_sample_seed
    )

    # Normalize the counts to relative frequencies that sum to 1.
    observed_upload_df1 = observed_upload_df1 / observed_upload_df1.sum()
    observed_upload_df2 = observed_upload_df2 / observed_upload_df2.sum()

    # NOTE(review): the two vectors are compared position by position, but
    # their entries are counts of unrelated values, and the statistic is
    # computed on relative frequencies instead of counts. Consider binning
    # both samples on shared bin edges and testing the raw counts.
    g, p = power_divergence(
        observed_upload_df1, observed_upload_df2, lambda_="log-likelihood"
    )

    g_test_records.append(
        {
            "g_test": g,
            "p_value": p,
            "name": name,
            "column": column,
        }
    )

# Columns: G statistic, p-value, comparison name and the column compared.
df_g_test = pd.DataFrame(
    g_test_records, columns=["g_test", "p_value", "name", "column"]
)

print(df_g_test.to_markdown())

|    |   g_test |   p_value | name                                                | column         |
|---:|---------:|----------:|:----------------------------------------------------|:---------------|
|  0 |  1.86886 |         1 | smart_tv_max_median_up_chromecast_max_median_up     | log_bytes_up   |
|  1 |  1.61768 |         1 | smart_tv_max_mean_up_chromecast_max_mean_up         | log_bytes_up   |
|  2 |  1.80951 |         1 | smart_tv_max_median_down_chromecast_max_median_down | log_bytes_down |
|  3 |  1.97811 |         1 | smart_tv_max_mean_down_chromecast_max_mean_down     | log_bytes_down |
