# 📌 1. Loading the Dataset

In [None]:
from os.path import join as path_join
import kagglehub
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.core.property.primitive import String
from holoviews.ipython import display
from matplotlib import pyplot as plt
from scipy import stats
from scipy.stats import zscore, kstest, pearsonr, spearmanr
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error,accuracy_score
from sklearn.preprocessing import OneHotEncoder
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) # suppress mathematically undefined errors


In [None]:
# Fetch the CVE dataset from Kaggle through kagglehub; `data_root` is the
# value that is later joined with 'cve.csv' when the table is read.
cve_dataset_handle = "andrewkronser/cve-common-vulnerabilities-and-exposures"
data_root = kagglehub.dataset_download(cve_dataset_handle)


## 💡 **Interpretation**:

- **mod_date: The date the entry was last modified.**
- **pub_date: The date the entry was published.**
- **cvss: Common Vulnerability Scoring System (CVSS) score, which measures the severity of a vulnerability.**
- **cwe_code: Common Weakness Enumeration (CWE) code identifying the type of weakness.**
- **cwe_name: The name associated with the CWE code.**
- **summary: A text summary of the vulnerability.**
- **access_authentication: Indicates whether authentication is required.**
- **access_complexity: How difficult the attack is to execute.**
- **access_vector: How the attack is performed (e.g., via network or locally).**

In [None]:
# Load the CVE table: first CSV row is the header, first CSV column is the index
csv_path = path_join(data_root, 'cve.csv')
df = pd.read_csv(csv_path, header=0, index_col=0)

# Parse both date columns into pandas datetimes
for date_column in ("mod_date", "pub_date"):
    df[date_column] = pd.to_datetime(df[date_column])

# Null count per column, reported before any summary statistics
null_counts = df.isnull().sum()
print("\nMissing Values:")
print(null_counts)

# Column names, non-null counts and dtypes
print("\nDataset Info:")
df.info()


# Integer codes for the ordered categorical levels (listed in ascending order)
non_par_com = {"NONE": 0, "PARTIAL": 1, "COMPLETE": 2}
low_med_hih = {"LOW": 0, "MEDIUM": 1, "HIGH": 2}
non_sin_mul = {"NONE": 0, "SINGLE": 1, "MULTIPLE": 2}
loc_adj_net = {"LOCAL": 0, "ADJACENT_NETWORK": 1, "NETWORK": 2}

# Which code table applies to which column
ordinal_remapping = {
    "access_authentication": non_sin_mul,
    "access_complexity": low_med_hih,
    "access_vector": loc_adj_net,
    "impact_availability": non_par_com,
    "impact_confidentiality": non_par_com,
    "impact_integrity": non_par_com,
}

# Replace each known level by its code; any value missing from the table
# (including nulls) is passed through unchanged.
for ordinal_column, level_codes in ordinal_remapping.items():
    df[ordinal_column] = df[ordinal_column].apply(
        lambda raw, codes=level_codes: codes.get(raw, raw)
    )


In [None]:
# Descriptive statistics as produced by DataFrame.describe() with default arguments
numeric_summary = df.describe()
print("\nSummary Statistics:")
print(numeric_summary)

In [None]:
# List every column label currently present in the DataFrame
column_names = df.columns.tolist()
print(column_names)

# 🧼 2. Handling Missing Data

In [None]:
# Number of null entries in every column
missing_counts = df.isnull().sum()
print("Missing Data Count:\n", missing_counts)

# Keep only the rows without any null; `df` itself is left unchanged
cleaned_df = df.dropna(axis=0, how='any')
print("\nCleaned DataFrame Head:")
cleaned_preview = cleaned_df.head()
print(cleaned_preview)


In [None]:
# Pull the free-text 'summary' column out of the DataFrame as a NumPy array
summary_column = df['summary']
summary_array = summary_column.to_numpy()

# Peek at the first ten summaries
first_summaries = summary_array[:10]
print(first_summaries)

#  🧪 3. Time-Series Data

In [None]:
# Make sure 'pub_date' holds datetimes, then promote it to the DataFrame index
# (after this cell 'pub_date' is no longer a regular column)
df['pub_date'] = pd.to_datetime(df['pub_date'])
df = df.set_index('pub_date')

In [None]:
# Number of records per publication year, drawn as a line on a log-scaled y axis
df['pub_year'] = df.index.year
records_per_year = df.groupby("pub_year").size()
year_x_count = records_per_year.reset_index(name="count")
fig = px.line(
    year_x_count,
    x="pub_year",
    y="count",
    log_y=True,
    title="CVE Code Count Over Time (Log Scale)",
)
fig.show()

In [None]:
# Split the datetime index into calendar components
pub_index = df.index
df['year'] = pub_index.year
df['month'] = pub_index.month
df['day'] = pub_index.day
date_parts_preview = df[['year', 'month', 'day']].head(10)
print(date_parts_preview)

# Records per (year, cwe_name) pair
cwe_counts = df.groupby(['year', 'cwe_name']).size().reset_index(name='count')

# Restrict to the latest year that appears in the counts
most_recent_year = cwe_counts['year'].max()
is_recent = cwe_counts['year'] == most_recent_year
cwe_counts_recent = cwe_counts[is_recent]

# The five CWE names with the highest count in that latest year
top_5_rows = cwe_counts_recent.nlargest(5, 'count')
top_5_cwe_names = top_5_rows['cwe_name'].tolist()

# All records (any year) that belong to one of those five names
in_top_5 = df['cwe_name'].isin(top_5_cwe_names)
df_top5 = df[in_top_5]

# Yearly counts for each of the five names
top5_counts_over_time = df_top5.groupby(['year', 'cwe_name']).size().reset_index(name='count')


# One line per CWE name, year on the x axis
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=top5_counts_over_time,
    x='year',
    y='count',
    hue='cwe_name',
    marker='o',
)

plt.title("Top 5 CWE Names Over Time (Counts)")
plt.xlabel("Year")
plt.ylabel("Number of Occurrences")
plt.xticks(rotation=45, ha='right')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Elapsed time of every record relative to the earliest index value
print("Calculating Time Differences from the Start Date:")
earliest_pub_date = df.index.min()
df['Time_from_start'] = df.index - earliest_pub_date
time_from_start_preview = df[['Time_from_start']].head()
print(time_from_start_preview)

In [None]:
target_cwe_code = 79
# Single source for every plot label below, so changing target_cwe_code
# cannot leave a stale "CWE-79" caption on the figure.
cwe_label = f"CWE-{target_cwe_code}"

# Yearly number of records that carry the target CWE code
time_series = df[df['cwe_code'] == target_cwe_code].groupby('year')['cwe_code'].count()

# Turn the integer year index into timestamps (format='%Y'), which the
# time-series functions below work with
time_series.index = pd.to_datetime(time_series.index, format='%Y')

# Trend component: ordinary least squares on the integer representation of the dates
X = time_series.index.values.astype(np.int64).reshape(-1, 1)
y = time_series.values  # yearly counts are the regression target
trend_model = LinearRegression()
trend_model.fit(X, y)
trend_line = trend_model.predict(X)

# Forecast with Holt-Winters exponential smoothing.
# NOTE(review): seasonal_periods=2 assumes a two-year cycle in yearly data;
# if no such cycle is expected, drop the seasonal arguments (seasonal=None).
model = ExponentialSmoothing(time_series, trend='add', seasonal='add', seasonal_periods=2)
model_fit = model.fit()
predict_steps = 5  # number of years to forecast
predict = model_fit.forecast(steps=predict_steps)

# Dates for the forecast: yearly steps starting at the last observed date;
# the first generated date is the last observation itself, so it is skipped
forecast_dates = pd.date_range(start=time_series.index[-1], periods=predict_steps + 1, freq='YS-JAN')[1:]

# Observed counts, fitted linear trend and forecast on one figure
plt.figure(figsize=(10, 6))
plt.plot(time_series.index, time_series.values, label='Actual')
plt.plot(time_series.index, trend_line, label='Trend (Linear Regression)', linestyle='--')
plt.plot(forecast_dates, predict, label='Prediction (Exponential Smoothing)', linestyle='--', marker='o')
plt.xlabel("Year")
plt.ylabel(f"Count of {cwe_label}")
plt.title(f"{cwe_label} Trend and Prediction")
plt.legend()
plt.xticks(rotation=45)  # rotated tick labels are easier to read
plt.tight_layout()  # keep labels from overlapping
plt.show()

# 📐 4. Probability Distribution & Descriptive Stats

**The McCumber Cube is a model framework created by John McCumber in 1991 to help organizations establish and evaluate information security initiatives by considering all of the related factors that impact them.**

**This security model has three dimensions:**

**The foundational principles for protecting information systems.**

**1. Availability, 2. Integrity, 3. Confidentiality ✅**

**The protection of information in each of its possible states.**

**The security measures used to protect data.**

## 4.1 Minimum and Maximum Values

In [None]:
# Smallest and largest value of every numeric column
column_minima = df.min(numeric_only=True)
column_maxima = df.max(numeric_only=True)

print("Minimum values in each column:")
print(column_minima)

print("\nMaximum values in each column:")
print(column_maxima)


## 4.2 Median and Mode

In [None]:
cwe_codes = df["cwe_code"]

# Median of the cwe_code column
median_cwe_code = cwe_codes.median()
print(f"Median cwe_code: {median_cwe_code}")

# Mode of the cwe_code column; the first entry of the returned modes is
# reported together with how often it occurs
mode_cwe_code = cwe_codes.mode()
most_common_code = mode_cwe_code[0]
most_common_count = cwe_codes.value_counts()[most_common_code]
print(f"Most Common cwe_code: {most_common_code}, Count: {most_common_count}")


In [None]:
# Pie Chart: top codes by highest cvss score among the most recently modified records

# Number of slices; the selection and the title both derive from this value
# so they cannot drift apart (the selection previously took 10 rows under a
# "Top 3" title).
top_n = 3

# Records whose mod_date equals the latest modification timestamp in the data
df_latest = df[df['mod_date'] == df['mod_date'].max()]

# The top_n of those records with the highest cvss score
top3_codes = df_latest.nlargest(top_n, 'cvss')

plt.figure(figsize=(8, 8))  # Set figure size

# Create the pie chart
plt.pie(
    top3_codes['cvss'],  # Slice sizes are the cvss scores
    labels=top3_codes['cwe_code'],  # Each slice is labelled with its cwe_code
    autopct='%1.1f%%',  # Display percentage on slices
    startangle=140,  # Rotate start angle for better layout
    shadow=True  # Add shadow for visual effect
)

# Add a title
plt.title(f'Top {top_n} Codes by Highest CVSS Score')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

# CWE-732: Incorrect Permission Assignment for Critical Resource
# Cross-Site Request Forgery (CSRF) - (352)
# Weaknesses in the 2024 CWE Top 25 Most Dangerous Software Weaknesses

## 4.3 Quantiles and Interquartile Range (IQR)

In [None]:
# First quartile, third quartile and interquartile range of cwe_code
cwe_code_values = df["cwe_code"]
q1_cwe_code = cwe_code_values.quantile(0.25)
q3_cwe_code = cwe_code_values.quantile(0.75)
iqr_cwe_code = q3_cwe_code - q1_cwe_code

print(f"Q1 (25th percentile of cwe_code): {q1_cwe_code:.2f}")
print(f"Q3 (75th percentile of cwe_code): {q3_cwe_code:.2f}")
print(f"Interquartile Range (IQR) of cwe_code: {iqr_cwe_code:.2f}")

# Boxplot of the same column to show the quartiles graphically
plt.figure(figsize=(8, 5))
sns.boxplot(x=cwe_code_values)
plt.title("Boxplot of cwe_code Distribution")
plt.show()

# Weaknesses in the 2024 CWE Top 25 Most Dangerous Software Weaknesses
# Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting') - (79)

# 📊 5. Visualising Relationships

## 📊 5.1 Distribution Shapes

In [None]:
def _keyword_count_summary(frame, keyword):
    """Summarise how often each 'cwe_name' containing *keyword* occurs.

    The match is case-insensitive, rows with a null 'cwe_name' never match,
    and *keyword* is interpreted by ``Series.str.contains`` with its default
    pattern handling. The value counts and their mean / sample variance /
    sample standard deviation are printed.

    Returns:
        (value_counts, counts_df, mean, variance, std) where counts_df has
        the columns 'cwe_name' and 'count'.
    """
    matches = frame['cwe_name'].str.contains(keyword, case=False, na=False)
    value_counts = frame.loc[matches, 'cwe_name'].value_counts()
    print(f"\nValue counts of 'cwe_name' entries containing '{keyword}':\n{value_counts}")

    # One row per matching cwe_name with its number of occurrences
    counts_df = pd.DataFrame({'cwe_name': value_counts.index, 'count': value_counts.values})

    # Statistics of the 'count' column
    mean_val = counts_df['count'].mean()
    var_val = counts_df['count'].var()     # sample variance
    std_val = counts_df['count'].std()     # sample standard deviation

    print(f"Mean (Expected Value) for {keyword}: {mean_val:.4f}")
    print(f"Variance for {keyword}: {var_val:.4f}")
    print(f"Standard Deviation for {keyword}: {std_val:.4f}")

    return value_counts, counts_df, mean_val, var_val, std_val


# cwe_name entries that contain 'Improper'
(improper_value_counts, improper_counts_df,
 improper_mean_val, improper_var_val, improper_std_val) = _keyword_count_summary(df, 'Improper')

# cwe_name entries that contain 'Exposure'
(Exposure_value_counts, Exposure_counts_df,
 Exposure_mean_val, Exposure_var_val, Exposure_std_val) = _keyword_count_summary(df, 'Exposure')

# Means and standard deviations side by side.
# Named `summary_stats` (not `stats`) so the `scipy.stats` module imported at
# the top of the notebook is not shadowed.
summary_stats = pd.DataFrame({
    'Mean_Exposure': [Exposure_mean_val],
    'Mean_Improper': [improper_mean_val],
    'Std_Exposure': [Exposure_std_val],
    'Std_Improper': [improper_std_val]
})

# Overlaid histograms of the per-name counts
plt.hist(Exposure_counts_df['count'], bins=30, alpha=0.5, label='Exposure')
plt.hist(improper_counts_df['count'], bins=30, alpha=0.5, label='Improper')
plt.title("Histogram of 'Exposure' and 'Improper' Value Counts")
plt.xlabel("Counts")
plt.ylabel("Frequency")
plt.legend()
plt.show()

# Create a DataFrame for the means
means = pd.DataFrame({'Exposure': [Exposure_mean_val], 'Improper': [improper_mean_val]})

# Create a DataFrame for the standard deviations
stds = pd.DataFrame({'Exposure': [Exposure_std_val], 'Improper': [improper_std_val]})

# Bar chart of the means with the standard deviations as error bars
means.plot.bar(yerr=stds, rot=0, capsize=4)  # capsize adds caps to error bars
plt.title("Mean with Standard Deviation Error Bars")
plt.xlabel('Exposure & Improper')
plt.ylabel("Value")
plt.tight_layout()
plt.show()



## 📊 5.2 Poisson distribution

In [None]:
# Use the mean cvss score as the Poisson rate parameter
lambda_value = df["cvss"].mean()

# Draw a random Poisson sample with that rate
sample_size = 1000
poisson_data = np.random.poisson(lam=lambda_value, size=sample_size)

# Histogram of the simulated sample with a KDE overlay
poisson_axes = sns.histplot(poisson_data, kde=True, bins=30, color="blue")
poisson_axes.set_title("Poisson Distribution of cvss")
poisson_axes.set_xlabel("cvss")
poisson_axes.set_ylabel("Frequency")
plt.show()



## 📦 5.3 Detect, Report, and Visualize Outliers Using Z-Score

In [None]:
# Detect, Report, and Visualize Outliers Using Z-Score

def visualize_outliers(df, threshold=3):
    """Report and plot z-score outliers for the numeric columns of *df*.

    Prints, per numeric column, how many values have an absolute z-score
    above *threshold*, displays the transposed ``describe()`` table and draws
    one boxplot per column. Nothing is returned.

    Args:
        df: DataFrame to inspect; only columns selected by
            ``select_dtypes(include=['number'])`` are used.
        threshold: absolute z-score above which a value counts as an outlier.
    """
    df_numeric = df.select_dtypes(include=['number'])

    # Without numeric columns describe() raises and the (1, 0) subplot layout
    # is invalid, so report that and stop instead of crashing.
    if df_numeric.shape[1] == 0:
        print("No numeric columns found; skipping outlier analysis.")
        return

    # Column-wise z-scores; nulls are ignored when computing mean and std
    z_scores = df_numeric.apply(zscore, nan_policy='omit')

    # Count how many values are considered outliers
    outlier_counts = (z_scores.abs() > threshold).sum()
    print("Number of outliers detected per column:\n", outlier_counts)

    # Summary statistics
    print("\n--- Summary Statistics ---")
    display(df_numeric.describe().T)

    # One boxplot per numeric column, all in a single row
    print("\n Boxplots to Inspect Outliers:")
    df_numeric.plot(kind='box', subplots=True, layout=(1, len(df_numeric.columns)), figsize=(16, 4), patch_artist=True)
    plt.tight_layout()
    plt.show()

# Run the outlier report on the full DataFrame with the default cut-off of 3
visualize_outliers(df, threshold=3)


##  5.4 Analysing Correlation Between Variables

###  5.4.1 Covariance Analysis

In [None]:
# Covariance Analysis

# 30-row rolling mean of cvss to smooth out short-term fluctuations
df['rolling_mean'] = df['cvss'].rolling(window=30).mean()

# Drop only the rows where one of the two analysed columns is null (the
# leading rows of the rolling window). Restricting dropna to these columns
# keeps rows whose only nulls are in unrelated columns, which an unrestricted
# dropna() would discard and thereby distort the covariance.
df_clean = df.dropna(subset=['cvss', 'rolling_mean'])

# Covariance matrix between the original values and their rolling mean
cov_matrix = df_clean[['cvss', 'rolling_mean']].cov()

# Display the covariance matrix
print("\nCovariance Matrix:")
print(cov_matrix)

# Visualize the covariance matrix using a heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(cov_matrix, annot=True, cmap="coolwarm", center=0)

# Add plot title
plt.title("Covariance Matrix Heatmap")

# Show the plot
plt.show()


###  5.4.2 Pearson: linear correlation & Spearman: rank-based correlation

In [None]:
# Correlation between cvss and its rolling mean, computed on the observations.
# (Calling .corr() on the 2x2 covariance matrix would correlate the matrix's
# two columns with each other, which is always exactly +1 or -1 and says
# nothing about the data.)
correlation_input = df_clean[['cvss', 'rolling_mean']]

pearson_corr = correlation_input.corr(method='pearson')    # Pearson: linear correlation
spearman_corr = correlation_input.corr(method='spearman')  # Spearman: rank-based correlation (monotonic relationships)
kendall_corr = correlation_input.corr(method='kendall')    # Kendall: rank correlation (more robust with small samples or ties)

# Display the correlation matrices
print("\nPearson Correlation:")
print(pearson_corr)

print("\nSpearman Correlation:")
print(spearman_corr)

print("\nKendall Correlation:")
print(kendall_corr)

plt.figure(figsize=(6, 4))
sns.heatmap(pearson_corr, annot=True, cmap="coolwarm", center=0)

# Add title
plt.title("Pearson Correlation Heatmap")

# Show plot
plt.show()


###  5.4.3 Linear regression model

In [None]:
# Choose a specific cwe_code to predict
target_cwe_code = 79  # Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting')

# Binary target: 1 if the record has the target cwe_code, 0 otherwise
df['target'] = (df['cwe_code'] == target_cwe_code).astype(int)

# Work on a copy without rows that lack a year (copy avoids SettingWithCopyWarning)
df_clean = df.dropna(subset=['year']).copy()

# Parse the integer year as a calendar year. Without an explicit format,
# pd.to_datetime reads integers as nanoseconds since the epoch, which would
# collapse every year to within a few microseconds of 1970-01-01.
df_clean['year'] = pd.to_datetime(df_clean['year'].astype(int).astype(str), format='%Y')

# Numeric feature: seconds elapsed since the earliest year in the data
df_clean['year_numeric'] = (df_clean['year'] - df_clean['year'].min()).dt.total_seconds()

# Define features and target
X = df_clean[['year_numeric']]
y = df_clean['target']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict values for the test set
y_pred = model.predict(X_test)

# Evaluate model performance
print("\nModel Evaluation:")
# The constant term (β₀)
print("Intercept:", model.intercept_)
# The slope (β₁) for year_numeric
print("Coefficient:", model.coef_[0])
# Proportion of variance explained
print("R-squared:", r2_score(y_test, y_pred))
# Average squared error
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

plt.figure(figsize=(8, 5))  # Set the figure size

# Actual target (0/1) against the model output, with the y=x reference line
plt.scatter(y_test, y_pred, label='Actual vs. Predicted', alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', label='Ideal Prediction (y=x)')

# Add labels and title
plt.xlabel("Actual target (Is CWE 79?)")
plt.ylabel("Predicted target probability")
plt.title("Linear Regression: Actual vs. Predicted Target")

# Add legend and show the plot
plt.legend()
plt.grid(True)  # Add grid for better readability
plt.show()

In [None]:
# Recommendations:
# 
# Prioritize Remediation: Focus on addressing the highest-risk categories, especially injection flaws and broken access control.
# Secure Coding Practices: Implement secure coding practices to prevent these vulnerabilities from being introduced in the first place.
# Regular Security Testing: Conduct regular security testing (e.g., penetration testing, static analysis) to identify and address vulnerabilities early.
# Security Training: Provide security training to developers to raise awareness and improve secure coding skills.

###  5.4.4 Logistic regression model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc  # classification metrics

# Choose a specific cwe_code to predict
target_cwe_code = 79  # Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting')

# Binary target (the outcome to predict): 1 if the record has the target cwe_code, 0 otherwise
df['target'] = (df['cwe_code'] == target_cwe_code).astype(int)

# Build the modelling frame in this cell so it does not depend on a
# `df_clean` left behind by another cell (which may be missing, or may hold a
# stale 'target' / wrongly parsed 'year').
df_clean = df.dropna(subset=['year']).copy()

# Parse the integer year as a calendar year; without an explicit format
# pd.to_datetime would read the integers as nanoseconds since the epoch.
df_clean['year'] = pd.to_datetime(df_clean['year'].astype(int).astype(str), format='%Y')

# Numeric feature: seconds elapsed since the earliest year in the data
df_clean['year_numeric'] = (df_clean['year'] - df_clean['year'].min()).dt.total_seconds()


# Define features and target
X = df_clean[['year_numeric']]
y = df_clean['target']

# Split into training and testing sets (80% train, 20% test), keeping the
# class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print(f"Proportion of target=1 in training set: {y_train.mean():.4f}")
print(f"Proportion of target=1 in testing set: {y_test.mean():.4f}")


# Initialize and fit a Logistic Regression model
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train, y_train)

print("\n--- Model Training Complete ---")
# Print model coefficients and intercept for interpretation
print(f"Model coefficient (for year_numeric): {model.coef_[0][0]:.4f}")
print(f"Model intercept: {model.intercept_[0]:.4f}")


# Predict class labels for the test set
y_pred_class = model.predict(X_test)

# Predict probabilities of the positive class (CWE 79) for the test set
y_pred_proba = model.predict_proba(X_test)[:, 1]


# Evaluate model performance using classification metrics
print("\n--- Model Evaluation (Logistic Regression) ---")

# Accuracy: Overall proportion of correct predictions
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Accuracy: {accuracy:.4f}")

# Confusion Matrix: Breakdown of correct and incorrect predictions per class
conf_matrix = confusion_matrix(y_test, y_pred_class)
print("\nConfusion Matrix:")
print(conf_matrix)

# Classification Report: Provides Precision, Recall, F1-Score for each class
class_report = classification_report(y_test, y_pred_class, zero_division=0)
print("\nClassification Report:")
print(class_report)

plt.figure(figsize=(10, 6))

# Actual labels, jittered slightly on the y axis so overlapping points stay visible
plt.scatter(X_test['year_numeric'], y_test + np.random.uniform(-0.05, 0.05, size=y_test.shape),
            color='grey', label='Actual (0=No, 1=Yes)', alpha=0.6)

# Sort the test data by year_numeric to plot a smooth probability curve
X_test_sorted = X_test.sort_values(by='year_numeric')
y_pred_proba_sorted = model.predict_proba(X_test_sorted)[:, 1]

# Plot the predicted probability curve from the Logistic Regression model
plt.plot(X_test_sorted['year_numeric'], y_pred_proba_sorted, color='red', label='Predicted Probability (Logistic Regression)')

# Add labels and title
plt.xlabel("Year (Numeric - Seconds since earliest year)")
plt.ylabel("Predicted Probability of being CWE 79")
plt.title("Logistic Regression: Predicted Probability of CWE 79 over Time")

# Add legend and show the plot
plt.legend()
plt.grid(True)
plt.show()


# ROC curve and area under it, from the predicted positive-class probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line representing random chance
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()