In [41]:
%pip install pandas numpy plotly kagglehub

Note: you may need to restart the kernel to use updated packages.


## 📌 1. Loading the Dataset  

In [42]:
import pandas as pd
import numpy as np
import plotly.express as px
import kagglehub
from os.path import join as path_join

In [43]:
data_root = kagglehub.dataset_download("andrewkronser/cve-common-vulnerabilities-and-exposures")



>## 💡 **Interpretation**: 
-   **mod_date: The date the entry was last modified.**
-  **pub_date: The date the entry was published.**
-  **cvss: Common Vulnerability Scoring System (CVSS) score, a measure of the severity of a vulnerability.**
-  **cwe_code: Common Weakness Enumeration (CWE) code, identifying the type of weakness.**
-  **cwe_name: The name associated with the CWE code.**
-  **summary: A text summary of the vulnerability.**
-  **access_authentication.**
-  **access_complexity: how difficult it is to execute.**
-  **access_vector: how the attack is performed, aka via network or locally.**

In [44]:
df = pd.read_csv(path_join(data_root, 'cve.csv'), header=0, index_col=0)
df.mod_date = pd.to_datetime(df.mod_date)
df.pub_date = pd.to_datetime(df.pub_date)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89660 entries, CVE-2019-16548 to CVE-2007-3004
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   mod_date                89660 non-null  datetime64[ns]
 1   pub_date                89660 non-null  datetime64[ns]
 2   cvss                    89660 non-null  float64       
 3   cwe_code                89660 non-null  int64         
 4   cwe_name                89660 non-null  object        
 5   summary                 89660 non-null  object        
 6   access_authentication   88776 non-null  object        
 7   access_complexity       88776 non-null  object        
 8   access_vector           88776 non-null  object        
 9   impact_availability     88776 non-null  object        
 10  impact_confidentiality  88776 non-null  object        
 11  impact_integrity        88776 non-null  object        
dtypes: datetime64[ns](2), float64(

## 🧼 2. Handling Missing Data

In [45]:
missing_counts = df.isnull().sum()
print("Missing Data Count:\n", missing_counts)

Missing Data Count:
 mod_date                    0
pub_date                    0
cvss                        0
cwe_code                    0
cwe_name                    0
summary                     0
access_authentication     884
access_complexity         884
access_vector             884
impact_availability       884
impact_confidentiality    884
impact_integrity          884
dtype: int64


## **3. Probability Distribution & Descriptive Stats**

### 🎯 Expected Value (Mean)

In [46]:
from scipy import stats

# Convert the 'access_complexity' column to a list
access_complexity_list = df['access_complexity'].tolist() 

# Unique access_complexity
unique_access_complexity = list(set(access_complexity_list))
print("\nUnique access_complexity:", unique_access_complexity)

# Unique impact_availability
access_impact_availability = df['impact_availability'].tolist() 
unique_impact_availability = list(set(access_impact_availability))
print("\nUnique impact_availability:", unique_impact_availability)

# Unique impact_confidentiality
access_impact_confidentiality = df['impact_confidentiality'].tolist() 
unique_impact_confidentiality = list(set(access_impact_confidentiality))
print("\nUnique impact_confidentiality:", unique_impact_confidentiality)

# Unique impact_integrity
access_impact_integrity  = df['impact_integrity'].tolist() 
unique_impact_integrity  = list(set(access_impact_integrity ))
print("\nUnique impact_integrity:", unique_impact_integrity )

# Min and Max of DataFrame columns
print("Minimum values in each column:")
print(df.min(numeric_only=True))

print("\nMaximum values in each column:")
print(df.max(numeric_only=True))

# Geometric Mean
# Calculate the geometric mean of cvss
geometric_mean_cvss = stats.gmean(df["cvss"].dropna())
print(f"\nGeometric Mean of cvss: {geometric_mean_cvss:.2f}")

# Calculate the geometric mean of cwe_code
geometric_mean_cwe_code = stats.gmean(df["cwe_code"].dropna())
print(f"\nGeometric Mean of cwe_code: {geometric_mean_cwe_code:.2f}")

from scipy import stats

# Min and Max of DataFrame columns
print("Minimum values in each column:")
print(df.min(numeric_only=True))

print("\nMaximum values in each column:")
print(df.max(numeric_only=True))



# Geometric Mean
# Calculate the geometric mean of cvss
geometric_mean_cvss = stats.gmean(df["cvss"].dropna())
print(f"\nGeometric Mean of cvss: {geometric_mean_cvss:.2f}")





Unique access_complexity: [nan, 'HIGH', 'LOW', 'MEDIUM']

Unique impact_availability: ['PARTIAL', nan, 'NONE', 'COMPLETE']

Unique impact_confidentiality: ['PARTIAL', nan, 'COMPLETE', 'NONE']

Unique impact_integrity: ['PARTIAL', nan, 'NONE', 'COMPLETE']
Minimum values in each column:
cvss        0.0
cwe_code    1.0
dtype: float64

Maximum values in each column:
cvss          10.0
cwe_code    1188.0
dtype: float64

Geometric Mean of cvss: 0.00

Geometric Mean of cwe_code: 134.06
Minimum values in each column:
cvss        0.0
cwe_code    1.0
dtype: float64

Maximum values in each column:
cvss          10.0
cwe_code    1188.0
dtype: float64

Geometric Mean of cvss: 0.00


### 📈 Variance and Standard Deviation





### 📊 Distribution Shapes

## 📊 Visualizing Relationships
