In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_csv('./https_training.csv')
test_df = pd.read_csv('./https_test.csv')

# Define target variables and columns to exclude
exclude_columns = ['_s_bytes_uniq']
targets = ['_s_bytes_all']
drop_columns = ['c_ip', 'time']

# Initialize results storage
results = []

# Preprocess for each regression target
for target_column in targets:
    print(f"\n--- Regression for {target_column} ---\n")

    # Prepare training and test datasets
    X_train = train_df.drop(columns=[target_column] + exclude_columns + drop_columns, errors='ignore')
    y_train = train_df[target_column]
    X_test = test_df.drop(columns=[target_column] + exclude_columns + drop_columns, errors='ignore')
    y_test = test_df[target_column]

    # Handle categorical data and one-hot encode
    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(X_test, drop_first=True)

    # Align training and test features
    X_train, X_test = X_train.align(X_test, join='inner', axis=1)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=20)  # Adjust based on dataset size and explained variance
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    explained_variance = pca.explained_variance_ratio_.sum()
    print(f"Total Explained Variance by PCA: {explained_variance:.2%}")

    # Define models
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
    }

    # Train and evaluate models
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Evaluate performance
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        results.append({
            "Target": target_column,
            "Model": model_name,
            "Train MAE": train_mae,
            "Test MAE": test_mae,
            "Train MSE": train_mse,
            "Test MSE": test_mse,
            "Train R²": train_r2,
            "Test R²": test_r2
        })

# Output results
results_df = pd.DataFrame(results)
print("\n--- Final Regression Results ---")
print(results_df)



--- Regression for _s_bytes_all ---

Total Explained Variance by PCA: 59.08%
Training Linear Regression...
Training Random Forest...
Training Gradient Boosting...

--- Final Regression Results ---
         Target              Model      Train MAE       Test MAE  \
0  _s_bytes_all  Linear Regression  214666.368645  221423.380653   
1  _s_bytes_all      Random Forest   56432.280016  118437.419141   
2  _s_bytes_all  Gradient Boosting  131820.078071  173382.084477   

      Train MSE      Test MSE  Train R²   Test R²  
0  7.432915e+11  2.663221e+12  0.953974  0.969203  
1  1.899029e+11  5.170408e+13  0.988241  0.402098  
2  3.291760e+11  4.715343e+13  0.979617  0.454721  


In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Define parameter grids for hyperparameter tuning
param_grids = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20],
        "min_samples_split": [2, 5, 10]
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7]
    }
}

# Tuning and evaluating models
tuned_results = []

for target_column in targets:
    print(f"\n--- Hyperparameter Tuning for {target_column} ---\n")

    # Prepare training and test datasets
    X_train = train_df.drop(columns=[target_column] + exclude_columns + drop_columns, errors='ignore')
    y_train = train_df[target_column]
    X_test = test_df.drop(columns=[target_column] + exclude_columns + drop_columns, errors='ignore')
    y_test = test_df[target_column]

    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(X_test, drop_first=True)
    X_train, X_test = X_train.align(X_test, join='inner', axis=1)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    pca = PCA(n_components=20, svd_solver='randomized')
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Sample dataset for faster tuning
    X_train_sample = X_train[:10000]
    y_train_sample = y_train[:10000]

    for model_name, param_grid in param_grids.items():
        if param_grid:
            print(f"Tuning {model_name}...")
            model = {
                "Random Forest": RandomForestRegressor(random_state=42),
                "Gradient Boosting": GradientBoostingRegressor(random_state=42)
            }.get(model_name)

            # Perform random search
            random_search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_grid,
                n_iter=20,  # Number of parameter combinations to try
                scoring='neg_mean_absolute_error',
                cv=3,
                verbose=1,
                n_jobs=-1,
                random_state=42
            )
            random_search.fit(X_train_sample, y_train_sample)

            # Evaluate the best model
            best_model = random_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_test_pred = best_model.predict(X_test)

            train_mae = mean_absolute_error(y_train, y_train_pred)
            test_mae = mean_absolute_error(y_test, y_test_pred)
            train_mse = mean_squared_error(y_train, y_train_pred)
            test_mse = mean_squared_error(y_test, y_test_pred)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2 = r2_score(y_test, y_test_pred)

            tuned_results.append({
                "Target": target_column,
                "Model": model_name,
                "Best Params": random_search.best_params_,
                "Train MAE": train_mae,
                "Test MAE": test_mae,
                "Train MSE": train_mse,
                "Test MSE": test_mse,
                "Train R²": train_r2,
                "Test R²": test_r2
            })

# Output tuned results
tuned_results_df = pd.DataFrame(tuned_results)
print("\n--- Tuned Model Results ---")
print(tuned_results_df)



--- Hyperparameter Tuning for _s_bytes_all ---

Tuning Random Forest...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Tuning Gradient Boosting...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

--- Tuned Model Results ---
         Target              Model  \
0  _s_bytes_all      Random Forest   
1  _s_bytes_all  Gradient Boosting   

                                         Best Params      Train MAE  \
0  {'n_estimators': 50, 'min_samples_split': 2, '...  133148.250421   
1  {'n_estimators': 200, 'max_depth': 7, 'learnin...  133165.945705   

        Test MAE     Train MSE      Test MSE  Train R²   Test R²  
0  165689.155137  3.578929e+12  6.203320e+13  0.778387  0.282653  
1  166609.672766  3.875635e+12  6.338125e+13  0.760014  0.267064  


In [4]:
# Load the datasets
train_df_rtt = pd.read_csv('./https_training.csv')  # Changed DataFrame name
test_df_rtt = pd.read_csv('./https_test.csv')  # Changed DataFrame name

# Define target variable and columns to exclude
rtt_target = '_s_rtt_avg'  # Target variable
exclude_rtt_columns = ['_s_rtt_min', '_s_rtt_max', '_s_rtt_stddev']  # Exclude RTT-related columns
drop_rtt_columns = ['c_ip', 'time']  # Drop non-numeric or irrelevant columns

# Remove rows where target is 0
train_df_rtt = train_df_rtt[train_df_rtt[rtt_target] > 0]
test_df_rtt = test_df_rtt[test_df_rtt[rtt_target] > 0]

# Prepare training and test datasets
X_train_rtt = train_df_rtt.drop(columns=[rtt_target] + exclude_rtt_columns + drop_rtt_columns, errors='ignore')
y_train_rtt = train_df_rtt[rtt_target]
X_test_rtt = test_df_rtt.drop(columns=[rtt_target] + exclude_rtt_columns + drop_rtt_columns, errors='ignore')
y_test_rtt = test_df_rtt[rtt_target]

# Handle categorical data and one-hot encode
X_train_rtt = pd.get_dummies(X_train_rtt, drop_first=True)
X_test_rtt = pd.get_dummies(X_test_rtt, drop_first=True)

# Align training and test features
X_train_rtt, X_test_rtt = X_train_rtt.align(X_test_rtt, join='inner', axis=1)

# Scale the features
rtt_scaler = StandardScaler()  # Changed variable name
X_train_rtt = rtt_scaler.fit_transform(X_train_rtt)
X_test_rtt = rtt_scaler.transform(X_test_rtt)

# Apply PCA for dimensionality reduction
rtt_pca = PCA(n_components=15)  # Changed PCA variable name
X_train_rtt = rtt_pca.fit_transform(X_train_rtt)
X_test_rtt = rtt_pca.transform(X_test_rtt)
rtt_explained_variance = rtt_pca.explained_variance_ratio_.sum()
print(f"Total Explained Variance by PCA for _s_rtt_avg: {rtt_explained_variance:.2%}")

# Define models
rtt_models = {  # Changed dictionary name
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
}

# Initialize results storage
rtt_results = []  # Changed results variable name

# Train and evaluate models
for model_name, model in rtt_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_rtt, y_train_rtt)
    y_train_pred_rtt = model.predict(X_train_rtt)
    y_test_pred_rtt = model.predict(X_test_rtt)

    # Evaluate performance
    train_mae_rtt = mean_absolute_error(y_train_rtt, y_train_pred_rtt)
    test_mae_rtt = mean_absolute_error(y_test_rtt, y_test_pred_rtt)
    train_mse_rtt = mean_squared_error(y_train_rtt, y_train_pred_rtt)
    test_mse_rtt = mean_squared_error(y_test_rtt, y_test_pred_rtt)
    train_r2_rtt = r2_score(y_train_rtt, y_train_pred_rtt)
    test_r2_rtt = r2_score(y_test_rtt, y_test_pred_rtt)

    rtt_results.append({
        "Target": rtt_target,
        "Model": model_name,
        "Train MAE": train_mae_rtt,
        "Test MAE": test_mae_rtt,
        "Train MSE": train_mse_rtt,
        "Test MSE": test_mse_rtt,
        "Train R²": train_r2_rtt,
        "Test R²": test_r2_rtt
    })

# Output results
rtt_results_df = pd.DataFrame(rtt_results)  # Changed DataFrame name
print("\n--- Regression Results for _s_rtt_avg ---")
print(rtt_results_df)


Total Explained Variance by PCA for _s_rtt_avg: 53.88%
Training Linear Regression...
Training Random Forest...
Training Gradient Boosting...

--- Regression Results for _s_rtt_avg ---
       Target              Model   Train MAE     Test MAE     Train MSE  \
0  _s_rtt_avg  Linear Regression  839.796127  1085.696927  8.793663e+06   
1  _s_rtt_avg      Random Forest  405.511200   667.611432  2.791106e+06   
2  _s_rtt_avg  Gradient Boosting  618.625463   803.020388  6.493763e+06   

       Test MSE  Train R²   Test R²  
0  9.897405e+06  0.056837  0.017351  
1  6.577647e+06  0.700640  0.346948  
2  8.296926e+06  0.303513  0.176252  


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Define parameter grids for hyperparameter tuning
rtt_param_grids = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20],
        "min_samples_split": [2, 5, 10]
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7]
    }
}

# Tuning and evaluating models for _s_rtt_avg
rtt_tuned_results = []

# Prepare training and test datasets for _s_rtt_avg
rtt_target_column = '_s_rtt_avg'
X_train_rtt = train_df_rtt.drop(columns=[rtt_target_column] + exclude_rtt_columns + drop_rtt_columns, errors='ignore')
y_train_rtt = train_df_rtt[rtt_target_column]
X_test_rtt = test_df_rtt.drop(columns=[rtt_target_column] + exclude_rtt_columns + drop_rtt_columns, errors='ignore')
y_test_rtt = test_df_rtt[rtt_target_column]

X_train_rtt = pd.get_dummies(X_train_rtt, drop_first=True)
X_test_rtt = pd.get_dummies(X_test_rtt, drop_first=True)
X_train_rtt, X_test_rtt = X_train_rtt.align(X_test_rtt, join='inner', axis=1)

rtt_scaler = StandardScaler()
X_train_rtt = rtt_scaler.fit_transform(X_train_rtt)
X_test_rtt = rtt_scaler.transform(X_test_rtt)

rtt_pca = PCA(n_components=15, svd_solver='randomized')
X_train_rtt = rtt_pca.fit_transform(X_train_rtt)
X_test_rtt = rtt_pca.transform(X_test_rtt)

# Sample dataset for faster tuning
X_train_rtt_sample = X_train_rtt[:10000]
y_train_rtt_sample = y_train_rtt[:10000]

for model_name, param_grid in rtt_param_grids.items():
    print(f"Tuning {model_name}...")
    model = {
        "Random Forest": RandomForestRegressor(random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42)
    }.get(model_name)

    # Perform random search
    random_search_rtt = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=20,  # Number of parameter combinations to try
        scoring='neg_mean_absolute_error',
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    random_search_rtt.fit(X_train_rtt_sample, y_train_rtt_sample)

    # Evaluate the best model
    best_model_rtt = random_search_rtt.best_estimator_
    y_train_pred_rtt = best_model_rtt.predict(X_train_rtt)
    y_test_pred_rtt = best_model_rtt.predict(X_test_rtt)

    train_mae_rtt = mean_absolute_error(y_train_rtt, y_train_pred_rtt)
    test_mae_rtt = mean_absolute_error(y_test_rtt, y_test_pred_rtt)
    train_mse_rtt = mean_squared_error(y_train_rtt, y_train_pred_rtt)
    test_mse_rtt = mean_squared_error(y_test_rtt, y_test_pred_rtt)
    train_r2_rtt = r2_score(y_train_rtt, y_train_pred_rtt)
    test_r2_rtt = r2_score(y_test_rtt, y_test_pred_rtt)

    rtt_tuned_results.append({
        "Target": rtt_target_column,
        "Model": model_name,
        "Best Params": random_search_rtt.best_params_,
        "Train MAE": train_mae_rtt,
        "Test MAE": test_mae_rtt,
        "Train MSE": train_mse_rtt,
        "Test MSE": test_mse_rtt,
        "Train R²": train_r2_rtt,
        "Test R²": test_r2_rtt
    })

# Output tuned results
rtt_tuned_results_df = pd.DataFrame(rtt_tuned_results)
print("\n--- Tuned Model Results for _s_rtt_avg ---")
print(rtt_tuned_results_df)


Tuning Random Forest...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Tuning Gradient Boosting...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

--- Tuned Model Results for _s_rtt_avg ---
       Target              Model  \
0  _s_rtt_avg      Random Forest   
1  _s_rtt_avg  Gradient Boosting   

                                         Best Params   Train MAE     Test MAE  \
0  {'n_estimators': 200, 'min_samples_split': 2, ...  760.937978  1014.797668   
1  {'n_estimators': 50, 'max_depth': 7, 'learning...  816.165264  1175.760867   

      Train MSE      Test MSE  Train R²   Test R²  
0  5.789805e+06  8.927203e+06  0.379016  0.113676  
1  7.157942e+06  1.228974e+07  0.232276 -0.220169  
