In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from scipy.stats import norm
from scipy import stats
from sklearn.utils import resample
import warnings
# Load the dataset
file_path = "Train_data.csv"
data = pd.read_csv(file_path)
distributions = [
    'norm', 'uniform', 'expon', 'pareto',  'cauchy', 'triang',
    'weibull_min', 'weibull_max', 'gamma', 'beta', 'chi2', 'fisk', 
    't', 'genextreme', 'gumbel_r', 'gumbel_l', 'laplace', 'rayleigh',
    'invgauss', 'halfnorm', 'exponpow', 'exponnorm', 
    'invweibull', 'nakagami', 'johnsonsu', 'genlogistic', 'dweibull'
]
# Split data into features and target
target_column = 'class'
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns.difference([target_column]).tolist()

# Encode target variable
data[target_column] = data[target_column].apply(lambda x: 1 if x == 'anomaly' else 0)

# Remove columns with zero variance
for column in numerical_columns:
    if data[column].std() == 0:
        numerical_columns.remove(column)

# Step to handle class imbalance
class_counts = data[target_column].value_counts()
class_proportions = data[target_column].value_counts(normalize=True)

print("Class Distribution (Counts):")
print(class_counts)

print("\nClass Distribution (Proportions):")
print(class_proportions)

# Check for imbalance
imbalance_threshold = 0.7  # Threshold to determine if dataset is imbalanced
max_proportion = class_proportions.max()

if max_proportion > imbalance_threshold:
    print("\nClass imbalance detected. Balancing the dataset...")
    majority_class = class_counts.idxmax()
    minority_class = class_counts.idxmin()

    # Separate majority and minority classes
    majority_data = data[data[target_column] == majority_class]
    minority_data = data[data[target_column] == minority_class]

    # Resample the minority class to match the majority class size
    balanced_minority_data = resample(
        minority_data,
        replace=True,  # Sample with replacement
        n_samples=majority_data.shape[0],  # Match the majority class size
        random_state=42
    )

    # Combine majority class with the balanced minority class
    data = pd.concat([majority_data, balanced_minority_data])
    print("\nAfter balancing:")
    print(data[target_column].value_counts())
else:
    print("\nDataset is already balanced. No action needed.")

# Train-test split
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42, stratify=data[target_column])


Class Distribution (Counts):
class
0    13449
1    11743
Name: count, dtype: int64

Class Distribution (Proportions):
class
0    0.53386
1    0.46614
Name: proportion, dtype: float64

Dataset is already balanced. No action needed.


In [2]:
# Function to fit PDFs and return MSE + parameters
def fit_distributions(data_column, distributions):
    results = {}
    x = np.linspace(min(data_column), max(data_column), 100)

    for dist_name in distributions:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)  # Suppress warnings
                dist = getattr(stats, dist_name)
                params = dist.fit(data_column)

                # Validate parameters
                if not all(np.isfinite(params)):
                    raise ValueError(f"Invalid parameters for {dist_name}")

                # Calculate PDF and MSE
                pdf_fitted = dist.pdf(x, *params)
                mse = np.mean((pdf_fitted - np.histogram(data_column, bins=100, density=True)[0]) ** 2)

                # Store MSE and parameters
                results[dist_name] = {'MSE': mse, 'Parameters': params}

        except (ValueError, RuntimeError, OverflowError, TypeError):
            pass  # Skip invalid distributions

    return results

# Summarize results for all conditions
def summarize_results_by_condition(data):
    distributions = [
    'norm', 'uniform', 'expon', 'pareto',  'cauchy', 'triang',
    'weibull_min', 'weibull_max', 'gamma', 'beta', 'chi2', 'fisk', 
    't', 'genextreme', 'gumbel_r', 'gumbel_l', 'laplace', 'rayleigh',
    'invgauss', 'halfnorm', 'exponpow', 'exponnorm', 
    'invweibull', 'nakagami', 'johnsonsu', 'genlogistic', 'dweibull'
]

    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns.difference(['class'])

    # Create dictionaries for each case
    overall_results = {}
    normal_results = {}
    anomaly_results = {}

    for column in numerical_columns:
        col_data = data[column].dropna()
        col_data_normal = data.loc[data['class'] == 'normal', column].dropna()
        col_data_anomaly = data.loc[data['class'] == 'anomaly', column].dropna()

        # No condition: Fit all data
        if not col_data.empty:
            fit_results = fit_distributions(col_data, distributions)
            if fit_results:
                best_fit_name = min(fit_results, key=lambda x: fit_results[x]['MSE'])
                overall_results[column] = {
                    'Best Fit Distribution': best_fit_name,
                    'Parameters': fit_results[best_fit_name]['Parameters']
                }

        # Conditioned on 'normal'
        if not col_data_normal.empty:
            fit_results = fit_distributions(col_data_normal, distributions)
            if fit_results:
                best_fit_name = min(fit_results, key=lambda x: fit_results[x]['MSE'])
                normal_results[column] = {
                    'Best Fit Distribution': best_fit_name,
                    'Parameters': fit_results[best_fit_name]['Parameters']
                }

        # Conditioned on 'anomaly'
        if not col_data_anomaly.empty:
            fit_results = fit_distributions(col_data_anomaly, distributions)
            if fit_results:
                best_fit_name = min(fit_results, key=lambda x: fit_results[x]['MSE'])
                anomaly_results[column] = {
                    'Best Fit Distribution': best_fit_name,
                    'Parameters': fit_results[best_fit_name]['Parameters']
                }

    return overall_results, normal_results, anomaly_results

# Execute and get results
overall_results, normal_results, anomaly_results = summarize_results_by_condition(data)

# Print results
print("\nOverall Results (No Condition):")
for column, details in overall_results.items():
    print(f"{column}: {details}")

print("\nConditioned on 'Normal':")
for column, details in normal_results.items():
    print(f"{column}: {details}")

print("\nConditioned on 'Anomaly':")
for column, details in anomaly_results.items():
    print(f"{column}: {details}")


Overall Results (No Condition):
count: {'Best Fit Distribution': 'cauchy', 'Parameters': (np.float64(6.138683491762478), np.float64(13.649309521830268))}
diff_srv_rate: {'Best Fit Distribution': 'exponnorm', 'Parameters': (np.float64(2610.748598995944), np.float64(-8.283971074110917e-05), np.float64(2.3721966723411414e-05))}
dst_bytes: {'Best Fit Distribution': 'halfnorm', 'Parameters': (np.int64(0), np.float64(88897.56062666258))}
dst_host_count: {'Best Fit Distribution': 'weibull_max', 'Parameters': (np.float64(0.9701176317515333), np.float64(255.00000000000006), np.float64(96.71149741250534))}
dst_host_diff_srv_rate: {'Best Fit Distribution': 'exponnorm', 'Parameters': (np.float64(2926.077808050304), np.float64(-9.678028023168771e-05), np.float64(2.8041023801918117e-05))}
dst_host_rerror_rate: {'Best Fit Distribution': 'exponnorm', 'Parameters': (np.float64(3469.304703144013), np.float64(-0.000123036570600791), np.float64(3.371644043134232e-05))}
dst_host_same_src_port_rate: {'Best

In [3]:
# Add a small value to std to avoid division by zero
def calculate_pdf(column, values, condition):
    mean = column[condition].mean()
    std = column[condition].std() + 1e-5  # Adding a small value to avoid zero std
    return norm.pdf(values, mean, std)

def calculate_pmf(column, values, condition):
    pmf = column[condition].value_counts(normalize=True, dropna=False)
    pmf += 1e-5  # Laplace smoothing
    pmf /= pmf.sum()  # Re-normalize
    return [pmf.get(v, 1e-5) for v in values]

# Compute prior probabilities after balancing
class_weights = train_data[target_column].value_counts(normalize=True)
prior_anomaly = class_weights[1]
prior_no_anomaly = class_weights[0]

def predict(test_data, train_data, priors, numerical_columns, categorical_columns):
    anomaly_probs = []
    no_anomaly_probs = []

    for _, row in test_data.iterrows():
        likelihood_anomaly = 1.0
        likelihood_no_anomaly = 1.0

        for feature in numerical_columns:
            anomaly_pdf = calculate_pdf(train_data[feature], [row[feature]], train_data[target_column] == 1)
            no_anomaly_pdf = calculate_pdf(train_data[feature], [row[feature]], train_data[target_column] == 0)
            likelihood_anomaly *= anomaly_pdf[0]
            likelihood_no_anomaly *= no_anomaly_pdf[0]

        for feature in categorical_columns:
            anomaly_pmf = calculate_pmf(train_data[feature], [row[feature]], train_data[target_column] == 1)
            no_anomaly_pmf = calculate_pmf(train_data[feature], [row[feature]], train_data[target_column] == 0)
            likelihood_anomaly *= anomaly_pmf[0]
            likelihood_no_anomaly *= no_anomaly_pmf[0]

        anomaly_prob = likelihood_anomaly * priors['anomaly']
        no_anomaly_prob = likelihood_no_anomaly * priors['no_anomaly']

        anomaly_probs.append(anomaly_prob)
        no_anomaly_probs.append(no_anomaly_prob)

    return np.array(anomaly_probs), np.array(no_anomaly_probs)

priors = {'anomaly': prior_anomaly, 'no_anomaly': prior_no_anomaly}
anomaly_probs, no_anomaly_probs = predict(test_data, train_data, priors, numerical_columns, categorical_columns)

predictions = (anomaly_probs > no_anomaly_probs).astype(int)
accuracy = accuracy_score(test_data[target_column], predictions)
precision = precision_score(test_data[target_column], predictions)
recall = recall_score(test_data[target_column], predictions)

print("\nModel Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")



Model Performance:
Accuracy: 0.94
Precision: 0.91
Recall: 0.96


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Identifying categorical features that need encoding, excluding the target 'class'
categorical_features_to_encode = data.select_dtypes(include=['object']).columns.drop('class')

# One-hot encoding these features
encoder = OneHotEncoder(sparse_output=False)  
encoded_features = encoder.fit_transform(data[categorical_features_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Merging the encoded features back with the dataset
data_prepared = data.drop(columns=categorical_features_to_encode)
data_prepared = pd.concat([data_prepared, encoded_df], axis=1)

# Splitting the dataset into training and testing sets
X = data_prepared.drop('class', axis=1)
y = data_prepared['class'].apply(lambda x: 1 if x == 'normal' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training Gaussian Naive Bayes
gaussian_nb = GaussianNB()
gaussian_nb.fit(X_train, y_train)
y_pred_gaussian = gaussian_nb.predict(X_test)

# Training Multinomial Naive Bayes
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train, y_train)
y_pred_multinomial = multinomial_nb.predict(X_test)

# Training Bernoulli Naive Bayes
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train, y_train)
y_pred_bernoulli = bernoulli_nb.predict(X_test)

# Collecting performance metrics for each model
gaussian_acc = accuracy_score(y_test, y_pred_gaussian)
multinomial_acc = accuracy_score(y_test, y_pred_multinomial)
bernoulli_acc = accuracy_score(y_test, y_pred_bernoulli)

gaussian_prec = precision_score(y_test, y_pred_gaussian)
multinomial_prec = precision_score(y_test, y_pred_multinomial)
bernoulli_prec = precision_score(y_test, y_pred_bernoulli)

gaussian_recall = recall_score(y_test, y_pred_gaussian)
multinomial_recall = recall_score(y_test, y_pred_multinomial)
bernoulli_recall = recall_score(y_test, y_pred_bernoulli)

# Printing the results
print("Gaussian Naive Bayes:")
print("Accuracy:", gaussian_acc)
print("Precision:", gaussian_prec)
print("Recall:", gaussian_recall)

print("\nMultinomial Naive Bayes:")
print("Accuracy:", multinomial_acc)
print("Precision:", multinomial_prec)
print("Recall:", multinomial_recall)

print("\nBernoulli Naive Bayes:")
print("Accuracy:", bernoulli_acc)
print("Precision:", bernoulli_prec)
print("Recall:", bernoulli_recall)


KeyError: "['class'] not found in axis"