In [None]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy.stats import gamma, chisquare, kstest

In [None]:
!pip install plotly


In [None]:
# Load the capture, keep only rows with a positive TCP payload length and a
# positive time delta, and expose the two columns of interest under the
# names used by the gamma analysis in the following cells.
df = (
    pd.read_csv('/content/drive/MyDrive/mqttdataset_reduced.csv')
    .loc[
        lambda frame: (frame['tcp.len'] > 0) & (frame['tcp.time_delta'] > 0),
        ['tcp.time_delta', 'tcp.len']
    ]
    .rename(columns={
        'tcp.time_delta': 'gamma_time_delta',
        'tcp.len': 'gamma_data_length',
    })
)

df


In [None]:
# Time-delta column to be modelled (df must already hold 'gamma_time_delta').
data = df['gamma_time_delta']

# Trim both tails with fixed cut-offs, inclusive on both ends. The bounds are
# hard-coded rather than computed in this cell -- presumably they are the
# precomputed 1% / 99% quantiles of this column; TODO confirm against the data.
lower_quantile = 18e-6
upper_quantile = 804e-6
within_bounds = data.between(lower_quantile, upper_quantile)
filtered_data = data[within_bounds]

In [None]:
# Dracula-style dark theme shared by the layout below
dracula_theme = {
    'template': 'plotly_dark',
    'plot_bgcolor': '#282a36',       # Dracula background
    'paper_bgcolor': '#282a36',
    'font': {'color': '#f8f8f2'},    # Dracula foreground
    'xaxis': {'gridcolor': '#44475a'},
    'yaxis': {'gridcolor': '#44475a'},
}

# Raw-count histogram of the trimmed time deltas
count_hist = go.Histogram(
    x=filtered_data,
    nbinsx=5000,
    marker_color='#bd93f9',  # Dracula purple
)

fig = go.Figure(data=[count_hist])
fig.update_layout(
    title='Simple Histogram',
    xaxis_title='Value',
    yaxis_title='Count',
    **dracula_theme,
)

fig.show()

In [None]:
# Fit a gamma distribution to the trimmed sample; shape, location and scale
# are all estimated (none is held fixed in the call).
shape, loc, scale = stats.gamma.fit(filtered_data)
print(f"Gamma fit parameters after outlier removal:\nShape (α): {shape}\nLocation (loc): {loc}\nScale (θ): {scale}")

# Evaluate the fitted density on an evenly spaced grid spanning the sample range
grid_start, grid_stop = filtered_data.min(), filtered_data.max()
x = np.linspace(grid_start, grid_stop, 2000)
fitted_gamma = stats.gamma(shape, loc=loc, scale=scale)
pdf = fitted_gamma.pdf(x)



In [None]:
# Density-normalised histogram of the trimmed sample, so it shares a y-scale
# with the fitted PDF drawn on top of it
hist = go.Histogram(
    x=filtered_data, nbinsx=500, histnorm='probability density',
    opacity=0.8, marker_color='skyblue', name='Histogram'
)

# Fitted gamma density as a line over the histogram
pdf_line = go.Scatter(
    x=x, y=pdf, mode='lines',
    line={'color': 'red', 'width': 3},
    name='Gamma PDF fit'
)

# Dracula-style dark layout (no axis ranges are set, so plotly autoscales)
layout = go.Layout(
    title='Gamma Fit for gamma_time_delta (Outliers Removed)',
    template='plotly_dark',
    plot_bgcolor='#282a36',          # Dracula background
    paper_bgcolor='#282a36',
    font={'color': '#f8f8f2'},       # Dracula foreground
    xaxis={'gridcolor': '#44475a'},
    yaxis={'gridcolor': '#44475a'}
)

overlay_traces = [hist, pdf_line]
fig = go.Figure(data=overlay_traces, layout=layout)
fig.show()

In [None]:
# Kolmogorov-Smirnov goodness-of-fit test of the trimmed sample against the
# gamma distribution with the parameters fitted above.
# NOTE(review): the parameters were estimated from this same sample, so the
# reported p-value is likely optimistic -- interpret with care.
fitted_params = (shape, loc, scale)
ks_result = kstest(filtered_data, 'gamma', args=fitted_params)
ks_stat, ks_pvalue = ks_result.statistic, ks_result.pvalue
print("Kolmogorov-Smirnov Test:")
print(f"Statistic: {ks_stat:.4f}, p-value: {ks_pvalue:.4f}")

In [None]:
# Chi-squared goodness-of-fit test of the trimmed sample against the fitted gamma
n_bins = 20          # requested number of equal-frequency bins
n_fitted_params = 3  # shape, loc and scale were all estimated from this sample

# Equal-frequency bin edges taken from the empirical quantiles. Tied
# observations can make neighbouring quantiles coincide, which would create
# zero-width bins with zero expected count (division by zero in the
# statistic), so duplicate edges are collapsed and n_bins is updated to the
# number of bins actually used.
quantiles = np.unique(np.quantile(filtered_data, np.linspace(0, 1, n_bins + 1)))
n_bins = len(quantiles) - 1
observed_freq, _ = np.histogram(filtered_data, bins=quantiles)

# Expected count per bin: fitted-CDF mass between consecutive edges times the
# sample size (vectorised over all edges at once)
expected_freq = np.diff(gamma.cdf(quantiles, shape, loc, scale)) * len(filtered_data)

# The edges only span [min, max] of the sample, so the fitted mass over the
# bins is below 1. Rescale so expected and observed totals agree, which
# chisquare requires.
expected_freq *= observed_freq.sum() / expected_freq.sum()

# ddof removes one degree of freedom per parameter estimated from the data,
# giving n_bins - 1 - n_fitted_params degrees of freedom for the p-value.
chi2_stat, chi2_pvalue = chisquare(
    f_obs=observed_freq, f_exp=expected_freq, ddof=n_fitted_params
)
print(f"\nChi-Squared Test:\nStatistic: {chi2_stat:.4f}, p-value: {chi2_pvalue:.4f}")


In [None]:
# Display the observed per-bin counts that were passed to the chi-squared test
observed_freq