# Libraries

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gamma, norm, probplot, pearsonr, power_divergence
from typing import List, Tuple

import warnings

warnings.filterwarnings("ignore")

# Auxiliary functions

## calculate_bin(data_column) -> int

In [27]:
def calculate_bin(data_column) -> int:
    return int(1 + 3.3 * np.log10(len(data_column)))

## _generate_bin_interval(data_column) -> List[float]

In [35]:
def _generate_bin_interval(data_column) -> List[float]:
    bin_interval = []
    bin_size = calculate_bin(data_column)
    min_value = data_column.min()
    max_value = data_column.max()
    bin_interval.append(min_value)
    for i in range(1, bin_size):
        bin_interval.append(min_value + (max_value - min_value) / bin_size * i)
    bin_interval.append(max_value)
    return bin_interval

## segment_data_into_bins(first_dataset, second_dataset, column) -> Tuple[pd.DataFrame, pd.DataFrame]

In [66]:
def segment_data_into_bins(first_dataset, second_dataset, column) -> Tuple[pd.DataFrame, pd.DataFrame]:
    bins = _generate_bin_interval(first_dataset[column] if len(first_dataset) < len(second_dataset) else second_dataset[column])

    column_name = column + "_bins"
    bin_legends = [str(bins[i]) + " - " + str(bins[i + 1]) for i in range(len(bins) - 1)]

    first_dataset[column_name] = pd.cut(first_dataset[column], bins, labels=bin_legends, include_lowest=True)
    second_dataset[column_name] = pd.cut(second_dataset[column], bins, labels=bin_legends, include_lowest=True)

    return first_dataset, second_dataset

# Read Data

In [3]:
df_chromecast = pd.read_csv("../data/dataset_chromecast.csv")
df_smart_tv = pd.read_csv("../data/dataset_smart-tv.csv")

# Data Preprocessing

## Log Transformation

### Chromecast

In [4]:
df_chromecast["log_bytes_up"] = np.log10(df_chromecast["bytes_up"] + 1)
df_chromecast["log_bytes_down"] = np.log10(df_chromecast["bytes_down"] + 1)

### Smart TV

In [5]:
df_smart_tv["log_bytes_up"] = np.log10(df_smart_tv["bytes_up"] + 1)
df_smart_tv["log_bytes_down"] = np.log10(df_smart_tv["bytes_down"] + 1)

## Create Hour Column

### Chromecast

In [6]:
df_chromecast["hour"] = df_chromecast["date_hour"].apply(
    lambda x: int(x.split(" ")[1].split(":")[0])
)

### Smart TV

In [7]:
df_smart_tv["hour"] = df_smart_tv["date_hour"].apply(
    lambda x: int(x.split(" ")[1].split(":")[0])
)

# Second Section

## Chromecast

### Histogram

In [None]:
fig = plt.figure()
plt.hist(
    df_chromecast["log_bytes_up"],
    bins=calculate_bin(df_chromecast["log_bytes_up"]),
    color="blue",
    alpha=0.5,
    edgecolor="black",
)
plt.xlabel("Log₁₀(Uploaded Bytes)")
plt.ylabel("Frequency")
plt.title("Chromecast Histogram of Log₁₀(Uploaded Bytes)")
fig.savefig(
    "../images/second_section/chromecast/histogram/chromecast_hist_log_bytes_up.png", 
    dpi=fig.dpi
)

In [None]:
fig = plt.figure()
plt.hist(
    df_chromecast["log_bytes_down"],
    bins=calculate_bin(df_chromecast["log_bytes_down"]),
    color="red",
    alpha=0.5,
    edgecolor="black",
)
plt.xlabel("Log₁₀(Downloaded Bytes)")
plt.ylabel("Frequency")
plt.title("Chromecast Histogram of Log₁₀(Downloaded Bytes)")
fig.savefig(
    "../images/second_section/chromecast/histogram/chromecast_hist_log_bytes_down.png",
    dpi=fig.dpi
)

### Empirical Distribution Function

In [None]:
fig = plt.figure()
plt.plot(
    df_chromecast["log_bytes_up"].sort_values(),
    np.linspace(0, 1, len(df_chromecast["log_bytes_up"])),
    color="blue",
    marker=".",
)
plt.xlabel("Log₁₀(Uploaded Bytes)")
plt.ylabel("Fx(x)")
plt.title("Chromecast Empirical Distribution Function of Log₁₀(Uploaded Bytes)")
fig.savefig(
    "../images/second_section/chromecast/edf/chromecast_edf_log_bytes_up.png", 
    dpi=fig.dpi
)

In [None]:
fig = plt.figure()
plt.plot(
    df_chromecast["log_bytes_down"].sort_values(),
    np.linspace(0, 1, len(df_chromecast["log_bytes_down"])),
    color="red",
    marker=".",
)
plt.xlabel("Log₁₀(Downloaded Bytes)")
plt.ylabel("Fx(x)")
plt.title("Chromecast Empirical Distribution Function of Log₁₀(Downloaded Bytes)")
fig.savefig(
    "../images/second_section/chromecast/edf/chromecast_edf_log_bytes_down.png", 
    dpi=fig.dpi
)

### Boxplot

In [None]:
fig = plt.figure()
plt.boxplot(
    [df_chromecast["log_bytes_up"], df_chromecast["log_bytes_down"]],
    labels=["Log₁₀(Uploaded Bytes)", "Log₁₀(Downloaded Bytes)"],
    patch_artist=True,
    boxprops=dict(facecolor="cyan", color="black"),
    medianprops=dict(color="black"),
    whiskerprops=dict(color="black"),
    capprops=dict(color="black"),
)
plt.ylabel("Log₁₀(Bytes)")
plt.title("Chromecast Boxplots")
fig.savefig(
    "../images/second_section/chromecast/boxplot/chromecast_boxplot_log_bytes_up_down.png",
    dpi=fig.dpi,
)

### Statistical Analysis

In [8]:
chromecast_bytes_up_down = pd.DataFrame(
    {
        "Log₁₀(Uploaded Bytes)": [
            df_chromecast["log_bytes_up"].mean(),
            df_chromecast["log_bytes_up"].var(),
            df_chromecast["log_bytes_up"].std(),
        ],
        "Log₁₀(Downloaded Bytes)": [
            df_chromecast["log_bytes_down"].mean(),
            df_chromecast["log_bytes_down"].var(),
            df_chromecast["log_bytes_down"].std(),
        ],
    },
    index=["Mean", "Variance", "Standard Deviation"],
)
print("Summary Statistics of Log₁₀(Bytes) for Chromecast")
print(chromecast_bytes_up_down.to_markdown())

Summary Statistics of Log₁₀(Bytes) for Chromecast
|                    |   Log₁₀(Uploaded Bytes) |   Log₁₀(Downloaded Bytes) |
|:-------------------|------------------------:|--------------------------:|
| Mean               |                3.3503   |                   3.80005 |
| Variance           |                0.459969 |                   1.6639  |
| Standard Deviation |                0.67821  |                   1.28992 |


## Smart TV

### Histogram

In [None]:
fig = plt.figure()
plt.hist(
    df_smart_tv["log_bytes_up"],
    bins=calculate_bin(df_smart_tv["log_bytes_up"]),
    color="blue",
    alpha=0.5,
    edgecolor="black",
)
plt.xlabel("Log₁₀(Uploaded Bytes)")
plt.ylabel("Frequency")
plt.title("Smart TV Histogram of Log₁₀(Uploaded Bytes)")
fig.savefig(
    "../images/second_section/smart_tv//histogram/smart_tv_hist_log_bytes_up.png", 
    dpi=fig.dpi
)

In [None]:
fig = plt.figure()
plt.hist(
    df_smart_tv["log_bytes_down"],
    bins=calculate_bin(df_smart_tv["log_bytes_down"]),
    color="red",
    alpha=0.5,
    edgecolor="black",
)
plt.xlabel("Log₁₀(Downloaded Bytes)")
plt.ylabel("Frequency")
plt.title("Smart TV histogram of Log₁₀(Downloaded Bytes)")
fig.savefig(
    "../images/second_section/smart_tv/histogram/smart_tv_hist_log_bytes_down.png", 
    dpi=fig.dpi
)

### Empirical Distribution Function

In [None]:
fig = plt.figure()
plt.plot(
    df_smart_tv["log_bytes_up"].sort_values(),
    np.linspace(0, 1, len(df_smart_tv["log_bytes_up"])),
    color="blue",
    marker=".",
)
plt.xlabel("Log₁₀(Uploaded Bytes)")
plt.ylabel("Fx(x)")
plt.title("Smart TV Empirical Distribution Function of Log₁₀(Uploaded Bytes)")
fig.savefig(
    "../images/second_section/smart_tv/edf/smart_tv_edf_log_bytes_up.png", 
    dpi=fig.dpi
)

In [None]:
fig = plt.figure()
plt.plot(
    df_smart_tv["log_bytes_down"].sort_values(),
    np.linspace(0, 1, len(df_smart_tv["log_bytes_down"])),
    color="red",
    marker=".",
)
plt.xlabel("Log₁₀(Downloaded Bytes)")
plt.ylabel("Fx(x)")
plt.title("Smart TV Empirical Distribution Function of Log₁₀(Downloaded Bytes)")
fig.savefig(
    "../images/second_section/smart_tv/edf/smart_tv_edf_bytes_down.png", 
    dpi=fig.dpi
)

### Boxplot

In [None]:
fig = plt.figure()
plt.boxplot(
    [df_smart_tv["log_bytes_up"], df_smart_tv["log_bytes_down"]],
    labels=["Log₁₀(Uploaded Bytes)", "Log₁₀(Downloaded Bytes)"],
    patch_artist=True,
    boxprops=dict(facecolor="cyan", color="black"),
    medianprops=dict(color="black"),
    whiskerprops=dict(color="black"),
    capprops=dict(color="black"),
)
plt.ylabel("Log₁₀(Bytes)")
plt.title("Smart TV Boxplots")
fig.savefig(
    "../images/second_section/smart_tv/boxplot/smart_tv_boxplot_log_bytes_up_down.png", 
    dpi=fig.dpi
)

### Statistical Analysis

In [9]:
smart_tv_bytes_up_down = pd.DataFrame(
    {
        "Log₁₀(Uploaded Bytes)": [
            df_smart_tv["log_bytes_up"].mean(),
            df_smart_tv["log_bytes_up"].var(),
            df_smart_tv["log_bytes_up"].std(),
        ],
        "Log₁₀(Downloaded Bytes)": [
            df_smart_tv["log_bytes_down"].mean(),
            df_smart_tv["log_bytes_down"].var(),
            df_smart_tv["log_bytes_down"].std(),
        ],
    },
    index=["Mean", "Variance", "Standard Deviation"],
)
print("Summary Statistics of Log₁₀(Bytes) for Smart TV")
print(smart_tv_bytes_up_down.to_markdown())

Summary Statistics of Log₁₀(Bytes) for Smart TV
|                    |   Log₁₀(Uploaded Bytes) |   Log₁₀(Downloaded Bytes) |
|:-------------------|------------------------:|--------------------------:|
| Mean               |                 2.15829 |                   2.35168 |
| Variance           |                 4.11014 |                   6.72132 |
| Standard Deviation |                 2.02735 |                   2.59255 |


# Third Section

## Chromecast

### Boxplot

In [None]:
for hour in range(0, 24):
    fig = plt.figure()
    plt.boxplot(
        [
            df_chromecast[df_chromecast["hour"] == hour]["log_bytes_up"],
            df_chromecast[df_chromecast["hour"] == hour]["log_bytes_down"],
        ],
        labels=["Log₁₀(Uploaded Bytes)", "Log₁₀(Downloaded Bytes)"],
        patch_artist=True,
        boxprops=dict(facecolor="cyan", color="black"),
        medianprops=dict(color="black"),
        whiskerprops=dict(color="black"),
        capprops=dict(color="black"),
    )
    plt.ylabel("Log₁₀(Bytes)")
    plt.title(f"Chromecast Boxplots for {hour:02d}:00")
    fig.savefig(
        f"../images/third_section/chromecast/boxplot/chromecast_boxplot_log_bytes_up_down_hour_{hour}.png",
        dpi=fig.dpi,
    )

### Statistical Analysis

In [None]:
fig = plt.figure()
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_up"].mean(),
    color="red",
    marker=".",
    label="Mean",
)
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_up"].var(),
    color="green",
    marker=".",
    label="Variance",
)
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_up"].std(),
    color="blue",
    marker=".",
    label="Standard Deviation",
)
plt.xlabel("Hour")
plt.ylabel("Log₁₀(Bytes)")
plt.legend(loc="best")
plt.title("Chromecast Log₁₀(Uploaded Bytes) Statistics by Hour")
fig.savefig(
    "../images/third_section/chromecast/statistical_analysis/chromecast_line_bytes_up_mean_var_std_hour.png",
    dpi=fig.dpi,
)

In [None]:
fig = plt.figure()
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_down"].mean(),
    color="red",
    marker=".",
    label="Mean",
)
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_down"].var(),
    color="green",
    marker=".",
    label="Variance",
)
plt.plot(
    df_chromecast.groupby("hour")["log_bytes_down"].std(),
    color="blue",
    marker=".",
    label="Standard Deviation",
)
plt.xlabel("Hour")
plt.ylabel("Log₁₀(Bytes)")
plt.legend(loc="best")
plt.title("Chromecast Log₁₀(Downloaded Bytes) Statistics by Hour")
fig.savefig(
    "../images/third_section/chromecast/statistical_analysis/chromecast_line_bytes_down_mean_var_std_hour.png",
    dpi=fig.dpi,
)

## Smart TV

### Boxplot

In [None]:
for hour in range(0, 24):
    fig = plt.figure()
    plt.boxplot(
        [
            df_smart_tv[df_smart_tv["hour"] == hour]["log_bytes_up"],
            df_smart_tv[df_smart_tv["hour"] == hour]["log_bytes_down"],
        ],
        labels=["Log₁₀(Uploaded Bytes)", "Log₁₀(Downloaded Bytes)"],
        patch_artist=True,
        boxprops=dict(facecolor="cyan", color="black"),
        medianprops=dict(color="black"),
        whiskerprops=dict(color="black"),
        capprops=dict(color="black"),
    )
    plt.ylabel("Log₁₀(Bytes)")
    plt.title(f"Smart TV Boxplots for {hour:02d}:00")
    fig.savefig(
        f"../images/third_section/smart_tv/boxplot/smart_tv_boxplot_bytes_up_down_hour_{hour}.png",
        dpi=fig.dpi,
    )

### Statistical Analysis

In [None]:
fig = plt.figure()
plt.plot(
    df_smart_tv.groupby("hour")["log_bytes_up"].mean(),
    color="red",
    marker=".",
    label="Mean",
)
plt.plot(
    df_smart_tv.groupby("hour")["log_bytes_up"].var(),
    color="green",
    marker=".",
    label="Variance",
)
plt.plot(
    df_smart_tv.groupby("hour")["log_bytes_up"].std(),
    color="blue",
    marker=".",
    label="Standard Deviation",
)
plt.xlabel("Hour")
plt.ylabel("Log₁₀(Bytes)")
plt.legend(loc="best")
plt.title("Smart TV Log₁₀(Uploaded Bytes) Statistics by Hour")
fig.savefig(
    "../images/third_section/smart_tv/statistical_analysis/smart_tv_line_bytes_up_mean_var_std_hour.png",
    dpi=fig.dpi,
)

In [None]:
fig = plt.figure()
plt.plot(
    df_smart_tv.groupby("hour")["log_bytes_down"].mean(),
    color="red",
    marker=".",
    label="Mean",
)
plt.plot(
    df_smart_tv.groupby("hour")["log_bytes_down"].var(),
    color="green",
    marker=".",
    label="Variance",
)
plt.plot(
    df_smart_tv.groupby("hour")["log_bytes_down"].std(),
    color="blue",
    marker=".",
    label="Standard Deviation",
)
plt.xlabel("Hour")
plt.ylabel("Log₁₀(Bytes)")
plt.legend(loc="best")
plt.title("Smart TV Log₁₀(Downloaded Bytes) Statistics by Hour")
fig.savefig(
    "../images/third_section/smart_tv/statistical_analysis/smart_tv_line_bytes_down_mean_var_std_hour.png",
    dpi=fig.dpi,
)

# Fourth Section

## Chromecast

### Filter Dataset

In [10]:
hour_of_max_median_up_chromecast = (
    df_chromecast.groupby("hour")["log_bytes_up"].median().idxmax()
)
hour_of_max_mean_up_chromecast = (
    df_chromecast.groupby("hour")["log_bytes_up"].mean().idxmax()
)
hour_of_max_median_down_chromecast = (
    df_chromecast.groupby("hour")["log_bytes_down"].median().idxmax()
)
hour_of_max_mean_down_chromecast = (
    df_chromecast.groupby("hour")["log_bytes_down"].mean().idxmax()
)

In [11]:
df_chromecast_hourly_max = pd.DataFrame(
    {
        "Hour of Max Median": [
            hour_of_max_median_up_chromecast,
            hour_of_max_median_down_chromecast,
        ],
        "Hour of Max Mean": [hour_of_max_mean_up_chromecast, hour_of_max_mean_down_chromecast],
    },
    index=["Log₁₀(Uploaded Bytes)", "Log₁₀(Downloaded Bytes)"],
)
print("Hour of Max Median and Mean for Chromecast")
print(df_chromecast_hourly_max.to_markdown())

Hour of Max Median and Mean for Chromecast
|                         |   Hour of Max Median |   Hour of Max Mean |
|:------------------------|---------------------:|-------------------:|
| Log₁₀(Uploaded Bytes)   |                   22 |                 22 |
| Log₁₀(Downloaded Bytes) |                   23 |                 23 |


In [12]:
df_chromecast_max_median_up = df_chromecast[
    (df_chromecast["hour"] == hour_of_max_median_up_chromecast)
][["hour", "bytes_up", "log_bytes_up", "device_id"]]

df_chromecast_max_mean_up = df_chromecast[
    (df_chromecast["hour"] == hour_of_max_mean_up_chromecast)
][["hour", "bytes_up", "log_bytes_up", "device_id"]]

df_chromecast_max_median_down = df_chromecast[
    (df_chromecast["hour"] == hour_of_max_median_down_chromecast)
][["hour", "bytes_down", "log_bytes_down", "device_id"]]

df_chromecast_max_mean_down = df_chromecast[
    (df_chromecast["hour"] == hour_of_max_mean_down_chromecast)
][["hour", "bytes_down", "log_bytes_down", "device_id"]]

df_chromecast_max_mean_up_same_download_hour = df_chromecast[
    (df_chromecast["hour"] == hour_of_max_mean_down_chromecast)
][["hour", "bytes_up", "log_bytes_up", "device_id"]]

df_chromecast_max_median_up_same_download_hour = df_chromecast[
    (df_chromecast["hour"] == hour_of_max_median_down_chromecast)
][["hour", "bytes_up", "log_bytes_up", "device_id"]]

### Histogram

In [None]:
for df, column, name, figure_name in [
    (df_chromecast_max_median_up, "log_bytes_up", "ChromeCast Max Median Log₁₀(Uploaded Bytes)", "chromecast_max_median_up"),
    (df_chromecast_max_median_down, "log_bytes_down", "ChromeCast Max Median Log₁₀(Downloaded Bytes)", "chromecast_max_median_down"),
    (df_chromecast_max_mean_up, "log_bytes_up", "ChromeCast Max Mean Log₁₀(Uploaded Bytes)", "chromecast_max_mean_up"),
    (df_chromecast_max_mean_down, "log_bytes_down", "ChromeCast Max Mean Log₁₀(Downloaded Bytes)", "chromecast_max_mean_down"),
]:
    chromecast_hist_bin = calculate_bin(df[column])
    fig = plt.figure()
    plt.hist(
        df[column],
        bins=chromecast_hist_bin,
        color=("blue" if column == "log_bytes_up" else "red"),
        alpha=0.5,
        edgecolor="black",
    )
    plt.xlabel("Log₁₀(Uploaded Bytes)" if column == "log_bytes_up" else "Log₁₀(Downloaded Bytes)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram for {name} (Hour {df['hour'].iloc[0]})")
    fig.savefig(
        f"../images/fourth_section/chromecast/histogram/chromecast_histogram_{figure_name}.png",
        dpi=fig.dpi,
    )

### MLE

#### Gamma

In [13]:
for df, column, name in [
    (df_chromecast_max_median_up, "log_bytes_up", "ChromeCast Max Median Log₁₀(Uploaded Bytes)"),
    (df_chromecast_max_median_down, "log_bytes_down", "ChromeCast Max Median Log₁₀(Downloaded Bytes)"),
    (df_chromecast_max_mean_up, "log_bytes_up", "ChromeCast Max Mean Log₁₀(Uploaded Bytes)"),
    (df_chromecast_max_mean_down, "log_bytes_down", "ChromeCast Max Mean Log₁₀(Downloaded Bytes)"),
]:
    shape, loc, scale = gamma.fit(df[column])
    df_gamma = pd.DataFrame(
        {
            "shape": [shape], 
            "loc": [loc], 
            "scale": [scale]
        }
    )
    print(f"Gamma Distribution for {name} (Hour {df['hour'].iloc[0]})")
    print(df_gamma.to_markdown())
    print()

Gamma Distribution for ChromeCast Max Median Log₁₀(Uploaded Bytes) (Hour 22)
|    |   shape |     loc |     scale |
|---:|--------:|--------:|----------:|
|  0 | 3148.88 | -39.809 | 0.0137606 |

Gamma Distribution for ChromeCast Max Median Log₁₀(Downloaded Bytes) (Hour 23)
|    |   shape |      loc |   scale |
|---:|--------:|---------:|--------:|
|  0 | 27.1301 | -3.63137 | 0.28323 |

Gamma Distribution for ChromeCast Max Mean Log₁₀(Uploaded Bytes) (Hour 22)
|    |   shape |     loc |     scale |
|---:|--------:|--------:|----------:|
|  0 | 3148.88 | -39.809 | 0.0137606 |

Gamma Distribution for ChromeCast Max Mean Log₁₀(Downloaded Bytes) (Hour 23)
|    |   shape |      loc |   scale |
|---:|--------:|---------:|--------:|
|  0 | 27.1301 | -3.63137 | 0.28323 |



#### Gaussian

In [14]:
for df, column, name in [
    (df_chromecast_max_median_up, "log_bytes_up", "ChromeCast Max Median Log₁₀(Uploaded Bytes)"),
    (df_chromecast_max_median_down, "log_bytes_down", "ChromeCast Max Median Log₁₀(Downloaded Bytes)"),
    (df_chromecast_max_mean_up, "log_bytes_up", "ChromeCast Max Mean Log₁₀(Uploaded Bytes)"),
    (df_chromecast_max_mean_down, "log_bytes_down", "ChromeCast Max Mean Log₁₀(Downloaded Bytes)"),
]:
    df_mean_median = pd.DataFrame(
        {
            "mean": [df[column].mean()], 
            "median": [df[column].median()]
        }
    )
    print(f"Gaussian Distribution for {name} (Hour {df['hour'].iloc[0]})")
    print(df_mean_median.to_markdown())
    print()

Gaussian Distribution for ChromeCast Max Median Log₁₀(Uploaded Bytes) (Hour 22)
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.52155 |   3.4438 |

Gaussian Distribution for ChromeCast Max Median Log₁₀(Downloaded Bytes) (Hour 23)
|    |   mean |   median |
|---:|-------:|---------:|
|  0 | 4.0527 |  4.28566 |

Gaussian Distribution for ChromeCast Max Mean Log₁₀(Uploaded Bytes) (Hour 22)
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.52155 |   3.4438 |

Gaussian Distribution for ChromeCast Max Mean Log₁₀(Downloaded Bytes) (Hour 23)
|    |   mean |   median |
|---:|-------:|---------:|
|  0 | 4.0527 |  4.28566 |



### Histogram with MLE

In [None]:
for df, column, title, column_name in [
    (df_chromecast_max_median_up, "log_bytes_up", "ChromeCast Max Median Log₁₀(Uploaded Bytes)", "log_bytes_up_max_median"),
    (df_chromecast_max_median_down, "log_bytes_down", "ChromeCast Max Median Log₁₀(Downloaded Bytes)", "log_bytes_down_max_median"),
    (df_chromecast_max_mean_up, "log_bytes_up", "ChromeCast Max Mean Log₁₀(Uploaded Bytes)", "log_bytes_up_max_mean"),
    (df_chromecast_max_mean_down, "log_bytes_down", "ChromeCast Max Mean Log₁₀(Downloaded Bytes)", "log_bytes_down_max_mean"),
]:
    fig = plt.figure()
    chromecast_hist_bin = calculate_bin(df[column])
    plt.hist(
        df[column],
        bins=chromecast_hist_bin,
        color=("blue" if column == "log_bytes_up" else "red"),
        alpha=0.5,
        edgecolor="black",
        label="Histogram",
        density=True,
    )

    x = np.linspace(df[column].min(), df[column].max(), len(df[column]))

    shape, loc, scale = gamma.fit(df[column])
    gamma_distribution = gamma.pdf(x, shape, loc, scale)

    mean, std = norm.fit(df[column])
    gaussian_distribution = norm.pdf(x, mean, std)

    plt.plot(x, gamma_distribution, label="Gamma", color="green")
    plt.plot(x, gaussian_distribution, label="Gaussian", color="orange")
    plt.xlabel("Log₁₀(Uploaded Bytes)" if column == "log_bytes_up" else "Log₁₀(Downloaded Bytes)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram for {title} (Hour {df['hour'].iloc[0]})")
    plt.legend(loc="best")
    fig.savefig(
        f"../images/fourth_section/chromecast/histogram_mle/chromecast_histogram_mle_{column_name}.png",
        dpi=fig.dpi,
    )

### Probability Plot

In [None]:
for df, column, name, figure_name in [
    (df_chromecast_max_median_up, "log_bytes_up", "Chromecast Max Median Log₁₀(Uploaded Bytes)", "chromecast_max_median_up"),
    (df_chromecast_max_median_down, "log_bytes_down", "Chromecast Max Median Log₁₀(Downloaded Bytes)", "chromecast_max_median_down"),
    (df_chromecast_max_mean_up, "log_bytes_up", "Chromecast Max Mean Log₁₀(Uploaded Bytes)", "chromecast_max_mean_up"),
    (df_chromecast_max_mean_down, "log_bytes_down", "Chromecast Max Mean Log₁₀(Downloaded Bytes)", "chromecast_max_mean_down"),
]:

    fig, axes = plt.subplots(1, 2, figsize=(18, 5))

    fig.suptitle(f"Probability Plot for {name} (Hour {df['hour'].iloc[0]})")

    x = df[column]
    probplot(x, dist=gamma, sparams=(shape, loc, scale), plot=axes[0])
    probplot(x, dist=norm, sparams=(df[column].mean(), df[column].std()), plot=axes[1])

    axes[0].set_title(f"Gamma Distribution")
    axes[1].set_title(f"Gaussian Distribution")

    axes[0].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")
    axes[1].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")

    fig.savefig(
        f"../images/fourth_section/chromecast/probability_plot/probability_plot_{figure_name}.png",
        dpi=fig.dpi,
    )
    plt.show()

## Smart TV

### Filter Dataset

In [15]:
hour_of_max_median_up_smart_tv = (
    df_smart_tv.groupby("hour")["log_bytes_up"].median().idxmax()
)
hour_of_max_mean_up_smart_tv = (
    df_smart_tv.groupby("hour")["log_bytes_up"].mean().idxmax()
)
hour_of_max_median_down_smart_tv = (
    df_smart_tv.groupby("hour")["log_bytes_down"].median().idxmax()
)
hour_of_max_mean_down_smart_tv = (
    df_smart_tv.groupby("hour")["log_bytes_down"].mean().idxmax()
)

In [16]:
df_smart_tv_hourly_max = pd.DataFrame(
    {
        "Hour of Max Median": [
            hour_of_max_median_up_smart_tv,
            hour_of_max_median_down_smart_tv,
        ],
        "Hour of Max Mean": [hour_of_max_mean_up_smart_tv, hour_of_max_mean_down_smart_tv],
    },
    index=["Log₁₀(Uploaded Bytes)", "Log₁₀(Downloaded Bytes)"],
)
print("Hour of Max Median and Mean for Smart TV")
print(df_smart_tv_hourly_max.to_markdown())

Hour of Max Median and Mean for Smart TV
|                         |   Hour of Max Median |   Hour of Max Mean |
|:------------------------|---------------------:|-------------------:|
| Log₁₀(Uploaded Bytes)   |                   20 |                 20 |
| Log₁₀(Downloaded Bytes) |                   20 |                 20 |


In [17]:
df_smart_tv_max_median_up = df_smart_tv[
    df_smart_tv["hour"] == hour_of_max_median_up_smart_tv
][["device_id", "hour", "bytes_up", "log_bytes_up"]]

df_smart_tv_max_median_down = df_smart_tv[
    df_smart_tv["hour"] == hour_of_max_median_down_smart_tv
][["device_id", "hour", "bytes_down", "log_bytes_down"]]

df_smart_tv_max_mean_up = df_smart_tv[
    df_smart_tv["hour"] == hour_of_max_mean_up_smart_tv
][["device_id", "hour", "bytes_up", "log_bytes_up"]]

df_smart_tv_max_mean_down = df_smart_tv[
    df_smart_tv["hour"] == hour_of_max_mean_down_smart_tv
][["device_id", "hour", "bytes_down", "log_bytes_down"]]

### Histogram

In [None]:
for df, column, name, figure_name in [
    (df_smart_tv_max_median_up, "log_bytes_up", "Smart TV Max Median Log₁₀(Uploaded Bytes)", "smart_tv_max_median_up"),
    (df_smart_tv_max_median_down, "log_bytes_down", "Smart TV  Max Median Log₁₀(Downloaded Bytes)", "smart_tv_max_median_down"),
    (df_smart_tv_max_mean_up, "log_bytes_up", "Smart TV  Max Mean Log₁₀(Uploaded Bytes)", "smart_tv_max_mean_up"),
    (df_smart_tv_max_mean_down, "log_bytes_down", "Smart TV  Max Mean Log₁₀(Downloaded Bytes)", "smart_tv_max_mean_down"),
]:
    smart_tv_hist_bin = calculate_bin(df[column])
    fig = plt.figure()
    plt.hist(
        df[column],
        bins=smart_tv_hist_bin,
        color=("blue" if column == "log_bytes_up" else "red"),
        alpha=0.5,
        edgecolor="black",
    )
    plt.xlabel("Log₁₀(Uploaded Bytes)" if column == "log_bytes_up" else "Log₁₀(Downloaded Bytes)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram for {name} (Hour {df['hour'].iloc[0]})")
    fig.savefig(
        f"../images/fourth_section/smart_tv/histogram/smart_tv_histogram_{figure_name}.png",
        dpi=fig.dpi,
    )

### MLE

#### Gamma

In [18]:
for df, column, name in [
    (df_smart_tv_max_median_up, "log_bytes_up", "Smart TV Max Median Log₁₀(Uploaded Bytes)"),
    (df_smart_tv_max_median_down, "log_bytes_down", "Smart TV Max Median Log₁₀(Downloaded Bytes)"),
    (df_smart_tv_max_mean_up, "log_bytes_up", "Smart TV Max Mean Log₁₀(Uploaded Bytes)"),
    (df_smart_tv_max_mean_down, "log_bytes_down", "Smart TV Max Mean Log₁₀(Downloaded Bytes)"),
]:
    shape, loc, scale = gamma.fit(df[column])
    df_gamma = pd.DataFrame(
        {
            "shape": [shape], 
            "loc": [loc], 
            "scale": [scale]
        }
    )
    print(f"Gamma Distribution for {name} (Hour {df['hour'].iloc[0]})")
    print(df_gamma.to_markdown())
    print()

Gamma Distribution for Smart TV Max Median Log₁₀(Uploaded Bytes) (Hour 20)
|    |   shape |      loc |    scale |
|---:|--------:|---------:|---------:|
|  0 | 217.147 | -23.8596 | 0.124245 |

Gamma Distribution for Smart TV Max Median Log₁₀(Downloaded Bytes) (Hour 20)
|    |   shape |      loc |     scale |
|---:|--------:|---------:|----------:|
|  0 | 896.547 | -71.0622 | 0.0830499 |

Gamma Distribution for Smart TV Max Mean Log₁₀(Uploaded Bytes) (Hour 20)
|    |   shape |      loc |    scale |
|---:|--------:|---------:|---------:|
|  0 | 217.147 | -23.8596 | 0.124245 |

Gamma Distribution for Smart TV Max Mean Log₁₀(Downloaded Bytes) (Hour 20)
|    |   shape |      loc |     scale |
|---:|--------:|---------:|----------:|
|  0 | 896.547 | -71.0622 | 0.0830499 |



#### Gaussian

In [19]:
for df, column, name in [
    (df_smart_tv_max_median_up, "log_bytes_up", "Smart TV Max Median Log₁₀(Uploaded Bytes)"),
    (df_smart_tv_max_median_down, "log_bytes_down", "Smart TV Max Median Log₁₀(Downloaded Bytes)"),
    (df_smart_tv_max_mean_up, "log_bytes_up", "Smart TV Max Mean Log₁₀(Uploaded Bytes)"),
    (df_smart_tv_max_mean_down, "log_bytes_down", "Smart TV Max Mean Log₁₀(Downloaded Bytes)"),
]:
    df_mean_median = pd.DataFrame(
        {
            "mean": [df[column].mean()], 
            "median": [df[column].median()]
        }
    )
    print(f"Gaussian Distribution for {name} (Hour {df['hour'].iloc[0]})")
    print(df_mean_median.to_markdown())
    print()

Gaussian Distribution for Smart TV Max Median Log₁₀(Uploaded Bytes) (Hour 20)
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.12426 |  3.53052 |

Gaussian Distribution for Smart TV Max Median Log₁₀(Downloaded Bytes) (Hour 20)
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.39609 |  2.88961 |

Gaussian Distribution for Smart TV Max Mean Log₁₀(Uploaded Bytes) (Hour 20)
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.12426 |  3.53052 |

Gaussian Distribution for Smart TV Max Mean Log₁₀(Downloaded Bytes) (Hour 20)
|    |    mean |   median |
|---:|--------:|---------:|
|  0 | 3.39609 |  2.88961 |



### Histogram with MLE

In [None]:
for df, column, title, column_name in [
    (df_smart_tv_max_median_up, "log_bytes_up", "Smart TV Max Median Log₁₀(Uploaded Bytes)", "log_bytes_up_max_median"),
    (df_smart_tv_max_median_down, "log_bytes_down", "Smart TV Max Median Log₁₀(Downloaded Bytes)", "log_bytes_down_max_median"),
    (df_smart_tv_max_mean_up, "log_bytes_up", "Smart TV Max Mean Log₁₀(Uploaded Bytes)", "log_bytes_up_max_mean"),
    (df_smart_tv_max_mean_down, "log_bytes_down", "Smart TV Max Mean Log₁₀(Downloaded Bytes)", "log_bytes_down_max_mean"),
]:
    fig = plt.figure()
    smart_tv_hist_bin = calculate_bin(df[column])
    plt.hist(
        df[column],
        bins=smart_tv_hist_bin,
        color=("blue" if column == "log_bytes_up" else "red"),
        alpha=0.5,
        edgecolor="black",
        label="Histogram",
        density=True,
    )

    x = np.linspace(df[column].min(), df[column].max(), len(df[column]))

    shape, loc, scale = gamma.fit(df[column])
    gamma_distribution = gamma.pdf(x, shape, loc, scale)

    mean, std = norm.fit(df[column])
    gaussian_distribution = norm.pdf(x, mean, std)

    plt.plot(x, gamma_distribution, label="Gamma", color="green")
    plt.plot(x, gaussian_distribution, label="Gaussian", color="orange")
    plt.xlabel("Log₁₀(Uploaded Bytes)" if column == "log_bytes_up" else "Log₁₀(Downloaded Bytes)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram for {title} (Hour {df['hour'].iloc[0]})")
    plt.legend(loc="best")
    fig.savefig(
        f"../images/fourth_section/smart_tv/histogram_mle/smart_tv_histogram_mle_{column_name}.png",
        dpi=fig.dpi,
    )

### Probability Plot

In [None]:
for df, column, name, figure_name in [
    (df_smart_tv_max_median_up, "log_bytes_up", "Smart TV Max Median Log₁₀(Uploaded Bytes)", "smart_tv_max_median_up"),
    (df_smart_tv_max_median_down, "log_bytes_down", "Smart TV Max Median Log₁₀(Downloaded Bytes)", "smart_tv_max_median_down"),
    (df_smart_tv_max_mean_up, "log_bytes_up", "Smart TV Max Mean Log₁₀(Uploaded Bytes)", "smart_tv_max_mean_up"),
    (df_smart_tv_max_mean_down, "log_bytes_down", "Smart TV Max Mean Log₁₀(Downloaded Bytes)", "smart_tv_max_mean_down"),
]:

    fig, axes = plt.subplots(1, 2, figsize=(18, 5))

    fig.suptitle(f"Probability Plot for {name} (Hour {df['hour'].iloc[0]})")

    x = df[column]
    probplot(x, dist=gamma, sparams=(shape, loc, scale), plot=axes[0])
    probplot(x, dist=norm, sparams=(df[column].mean(), df[column].std()), plot=axes[1])

    axes[0].set_title(f"Gamma Distribution")
    axes[1].set_title(f"Gaussian Distribution")

    axes[0].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")
    axes[1].set(xlabel="Theoretical Quantiles", ylabel="Ordered Values")
    fig.savefig(
        f"../images/fourth_section/smart_tv/probability_plot/probability_plot_{figure_name}.png",
        dpi=fig.dpi,
    )
    plt.show()

# Fifth Section

## Sampling Correlation Coefficient

In [80]:
for df1, df2, name in [
    (
        df_smart_tv_max_median_up, 
        df_smart_tv_max_median_down, 
        "Smart TV Max Median Log₁₀(Bytes)"),
    (
        df_smart_tv_max_mean_up, 
        df_smart_tv_max_mean_down, 
        "Smart TV Max Mean Log₁₀(Bytes)"),
    (
        df_chromecast_max_median_up_same_download_hour,
        df_chromecast_max_median_down,
        "Chromecast Max Median Log₁₀(Bytes)"
    ),
    (
        df_chromecast_max_mean_up_same_download_hour, 
        df_chromecast_max_mean_down, 
        "Chromecast Max Median Log₁₀(Bytes)"),
]:
    if df1.shape[0] != df2.shape[0]:
        if df1.shape[0] > df2.shape[0]:
            df1 = df1.sample(n=df2.shape[0])
        else:
            df2 = df2.sample(n=df1.shape[0])

    pearson_coef, p_value = pearsonr(df1["log_bytes_up"], df2["log_bytes_down"])
    df_coef = pd.DataFrame(
        {
            "pearson_coef": [pearson_coef], 
            "p_value": [p_value], "name": [name]
        }
    )
    print(f"Pearson's correlation coefficient for {name}")
    print(df_coef.to_markdown())
    print()

Pearson's correlation coefficient for Smart TV Max Median Log₁₀(Bytes)
|    |   pearson_coef |   p_value | name                             |
|---:|---------------:|----------:|:---------------------------------|
|  0 |       0.915609 |         0 | Smart TV Max Median Log₁₀(Bytes) |

Pearson's correlation coefficient for Smart TV Max Mean Log₁₀(Bytes)
|    |   pearson_coef |   p_value | name                           |
|---:|---------------:|----------:|:-------------------------------|
|  0 |       0.915609 |         0 | Smart TV Max Mean Log₁₀(Bytes) |

Pearson's correlation coefficient for Chromecast Max Median Log₁₀(Bytes)
|    |   pearson_coef |   p_value | name                               |
|---:|---------------:|----------:|:-----------------------------------|
|  0 |       0.792504 |         0 | Chromecast Max Median Log₁₀(Bytes) |

Pearson's correlation coefficient for Chromecast Max Median Log₁₀(Bytes)
|    |   pearson_coef |   p_value | name                               |

## Sample Correlation Coefficient Graph

In [None]:
for df1, df2, title, figure_name in [
    (
        df_smart_tv_max_median_up,
        df_smart_tv_max_median_down,
        "Smart TV Max Median Log₁₀(Uploaded Bytes) and Log₁₀(Downloaded Bytes)",
        "smart_tv_max_median",
    ),
    (
        df_smart_tv_max_mean_up,
        df_smart_tv_max_mean_down,
        "Smart TV Max Mean Log₁₀(Uploaded Bytes) and Log₁₀(Downloaded Bytes)",
        "smart_tv_max_mean",
    ),
    (
        df_chromecast_max_median_up_same_download_hour,
        df_chromecast_max_median_down,
        "Chromecast Max Median Log₁₀(Uploaded Bytes) and Log₁₀(Downloaded Bytes)",
        "chromecast_max_median",
    ),
    (
        df_chromecast_max_mean_up_same_download_hour, 
        df_chromecast_max_mean_down, 
        "Chromecast Max Mean Log₁₀(Uploaded Bytes) and Log₁₀(Downloaded Bytes)",
        "chromecast_max_mean",
    ),
]:
    if df1.shape[0] != df2.shape[0]:
        if df1.shape[0] > df2.shape[0]:
            df1 = df1.sample(n=df2.shape[0])
        else:
            df2 = df2.sample(n=df1.shape[0])

    fig = plt.figure()
    plt.scatter(df1["log_bytes_up"], df2["log_bytes_down"])
    plt.xlabel("Log₁₀(Uploaded Bytes)")
    plt.ylabel("Log₁₀(Downloaded Bytes)")
    plt.title(f"Scatter Plot for {title}")

    folder = (
        f"../images/fifth_section/smart_tv"
        if "smart_tv" in figure_name
        else f"../images/fifth_section/chromecast"
    )

    fig.savefig(
        f"{folder}/scatter_plot/scatter_plot_{figure_name}.png",
        dpi=fig.dpi,
    )

    plt.show()

# Sixth Section

## G-test

In [81]:
df_g_test = pd.DataFrame(columns=["g_test", "p_value", "name", "column"])

for df1, df2, name, column in [
    (
        df_smart_tv_max_median_up,
        df_chromecast_max_median_up,
        "smart_tv_max_median_up_chromecast_max_median_up",
        "log_bytes_up",
    ),
    (
        df_smart_tv_max_mean_up,
        df_chromecast_max_mean_up,
        "smart_tv_max_mean_up_chromecast_max_mean_up",
        "log_bytes_up",
    ),
    (
        df_smart_tv_max_median_down,
        df_chromecast_max_median_down,
        "smart_tv_max_median_down_chromecast_max_median_down",
        "log_bytes_down",
    ),
    (
        df_smart_tv_max_mean_down,
        df_chromecast_max_mean_down,
        "smart_tv_max_mean_down_chromecast_max_mean_down",
        "log_bytes_down",
    ),
]:
    df1, df2 = segment_data_into_bins(df1, df2, column)

    observed_upload_df1 = df1[f"{column}_bins"].value_counts().sort_index()
    observed_upload_df2 = df2[f"{column}_bins"].value_counts().sort_index()

    observed_upload_df1 = observed_upload_df1 / observed_upload_df1.sum()
    observed_upload_df2 = observed_upload_df2 / observed_upload_df2.sum()
    
    g, p = power_divergence(observed_upload_df1, observed_upload_df2, lambda_='log-likelihood')

    df_g_test = df_g_test.append(
        {
            "g_test": g,
            "p_value": p,
            "name": name,
            "column": column,
        },
        ignore_index=True,
    )

print(df_g_test.to_markdown())

|    |   g_test |   p_value | name                                                | column         |
|---:|---------:|----------:|:----------------------------------------------------|:---------------|
|  0 |  1.74065 |  0.999996 | smart_tv_max_median_up_chromecast_max_median_up     | log_bytes_up   |
|  1 |  1.74065 |  0.999996 | smart_tv_max_mean_up_chromecast_max_mean_up         | log_bytes_up   |
|  2 |  2.35922 |  0.999967 | smart_tv_max_median_down_chromecast_max_median_down | log_bytes_down |
|  3 |  2.35922 |  0.999967 | smart_tv_max_mean_down_chromecast_max_mean_down     | log_bytes_down |
