# In [5]:
import pandas as pd
import numpy as np

# Read the vulnerability records from the CSV file into a DataFrame.
df = pd.read_csv('vulnerabilities.csv')

# Headline figures: the row count plus the number of distinct CVE and CWE
# values (nunique() leaves missing values out by default).
cwe_series = df['VULNERABILITY_CWE']
total_vulnerabilities = len(df)
unique_cves = df['VULNERABILITY_CVE'].nunique()
unique_cwes = cwe_series.nunique()

# value_counts() tallies each CWE label; idxmax() picks the label with the
# highest tally.
most_common_cwe = cwe_series.value_counts().idxmax()

# Smallest and largest year, rendered without decimals as "<min>-<max>".
year_series = df['VULNERABILITY_YEAR']
year_min, year_max = year_series.min(), year_series.max()
year_range = f"{year_min:.0f}-{year_max:.0f}"

# Mean and median of the files-changed and functions-changed columns.
files_changed = df['NUM_FILES_CHANGED']
functions_changed = df['NUM_FUNCTIONS_CHANGED']
avg_files_changed, median_files_changed = (
    files_changed.mean(),
    files_changed.median(),
)
avg_functions_changed, median_functions_changed = (
    functions_changed.mean(),
    functions_changed.median(),
)

# Mean and median of the lines-added and lines-deleted columns.
lines_added = df['NUM_LINES_ADDED']
lines_deleted = df['NUM_LINES_DELETED']
avg_lines_added, median_lines_added = (
    lines_added.mean(),
    lines_added.median(),
)
avg_lines_deleted, median_lines_deleted = (
    lines_deleted.mean(),
    lines_deleted.median(),
)

# Derive a line-count column for the vulnerable and the patched code text.
# A block's line count is taken as (number of '\n' characters) + 1, so text
# that ends with a newline is counted with one extra line.
line_count_columns = (
    ('VULNERABLE_CODE_BLOCK', 'VULNERABLE_CODE_LINES'),
    ('PATCHED_CODE_BLOCK', 'PATCHED_CODE_LINES'),
)
for text_column, count_column in line_count_columns:
    df[count_column] = df[text_column].str.count('\n') + 1

# Average line count of each kind of code block.
avg_vulnerable_lines = df['VULNERABLE_CODE_LINES'].mean()
avg_patched_lines = df['PATCHED_CODE_LINES'].mean()

# Pair every metric label with its value in one ordered mapping so a label
# can never drift out of step with its value. Means are rounded to two
# decimals; counts, medians and text values are shown as computed.
summary_values = {
    'Total Vulnerabilities': total_vulnerabilities,
    'Unique CVEs': unique_cves,
    'Unique CWEs': unique_cwes,
    'Most Common CWE': most_common_cwe,
    'Year Range': year_range,
    'Avg. Files Changed': round(avg_files_changed, 2),
    'Median Files Changed': median_files_changed,
    'Avg. Functions Changed': round(avg_functions_changed, 2),
    'Median Functions Changed': median_functions_changed,
    'Avg. Lines Added': round(avg_lines_added, 2),
    'Median Lines Added': median_lines_added,
    'Avg. Lines Deleted': round(avg_lines_deleted, 2),
    'Median Lines Deleted': median_lines_deleted,
    'Avg. Lines in Vulnerable Code': round(avg_vulnerable_lines, 2),
    'Avg. Lines in Patched Code': round(avg_patched_lines, 2),
}

# Two-column table: one row per metric, in the order listed above.
summary = pd.DataFrame({
    'Metric': list(summary_values.keys()),
    'Value': list(summary_values.values()),
})

# Render the summary table in the notebook.
display(summary)

# Additional analysis of CWE distribution: one row per CWE label with its
# occurrence count and its share of all rows in the dataset.
cwe_counts = df['VULNERABILITY_CWE'].value_counts()

# Percentages are relative to the total row count (total_vulnerabilities).
# np.round is used rather than the builtin round(): the builtin dispatches to
# __round__, which plain NumPy arrays are not guaranteed to implement, so
# round(ndarray, 2) can raise TypeError.
cwe_percentages = np.round(cwe_counts.values / total_vulnerabilities * 100, 2)

display(pd.DataFrame({
    'CWE': cwe_counts.index,
    'Count': cwe_counts.values,
    'Percentage': cwe_percentages,
}))

# Histograms of the four change-size columns, laid out on a 2x2 grid.
import matplotlib.pyplot as plt

# (column, panel title, x-axis label) for each panel, in subplot order.
change_panels = [
    ('NUM_FILES_CHANGED', 'Distribution of Files Changed', 'Number of Files'),
    ('NUM_FUNCTIONS_CHANGED', 'Distribution of Functions Changed', 'Number of Functions'),
    ('NUM_LINES_ADDED', 'Distribution of Lines Added', 'Number of Lines'),
    ('NUM_LINES_DELETED', 'Distribution of Lines Deleted', 'Number of Lines'),
]

plt.figure(figsize=(12, 8))

for panel_number, (column, panel_title, x_label) in enumerate(change_panels, start=1):
    # Select the panel first so the histogram is drawn into it.
    plt.subplot(2, 2, panel_number)
    df[column].hist(bins=15)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Analyze yearly trends: bar chart of the number of records per year.
# Rows without a year are filtered out before the integer cast, because
# astype(int) raises when the column contains NaN. When no year is missing
# the result is the same as grouping the full DataFrame.
has_year = df['VULNERABILITY_YEAR'].notna()
dated_rows = df[has_year]

# count() tallies the non-null 'id' values within each year.
yearly_counts = dated_rows.groupby(
    dated_rows['VULNERABILITY_YEAR'].astype(int)
)['id'].count()

plt.figure(figsize=(12, 6))
yearly_counts.plot(kind='bar')
plt.title('Vulnerabilities by Year')
plt.xlabel('Year')
plt.ylabel('Number of Vulnerabilities')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Side-by-side histograms of the per-block line counts (vulnerable vs. patched).
size_panels = [
    ('VULNERABLE_CODE_LINES', 'Size of Vulnerable Code Blocks'),
    ('PATCHED_CODE_LINES', 'Size of Patched Code Blocks'),
]

plt.figure(figsize=(10, 6))

for panel_number, (column, panel_title) in enumerate(size_panels, start=1):
    # One row, two columns; select the panel before drawing into it.
    plt.subplot(1, 2, panel_number)
    df[column].hist(bins=15)
    plt.title(panel_title)
    plt.xlabel('Number of Lines')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()



