In [None]:
import csv
import os
import re

import pandas as pd
from ydata_profiling import ProfileReport

In [2]:
def analyze_cve_data(filename):
    """
    Analyze a CVE data file and generate a profile report.

    The file is read as tab-separated text, its shape and first three rows
    are printed, and an HTML profile report is written to
    ``./reports/<input-stem>_profile_report.html``. The ``./reports``
    directory is created if it does not exist yet.

    Args:
        filename (str): Path to the CVE data file (tab-delimited).

    Returns:
        tuple: ``(df, report_path)`` where ``df`` is the loaded
        ``pandas.DataFrame`` and ``report_path`` is the path of the
        generated HTML report. ``(None, None)`` is returned when the input
        file does not exist or when loading/profiling fails; the error is
        printed rather than raised.
    """
    print(f"\n===== Analyzing file: {filename} =====")

    # Verify file exists
    if not os.path.exists(filename):
        print(f"Error: File '{filename}' not found")
        return None, None

    try:
        # Read the CSV file with tab delimiter
        df = pd.read_csv(filename, sep='\t')

        # Display basic information about the dataset
        print("Dataset shape:", df.shape)
        print("\nFirst 3 rows of the dataset:")
        print(df.head(3))

        # Generate output filename based on input filename
        source_name = os.path.basename(filename)
        output_filename = "./reports/" + os.path.splitext(source_name)[0] + "_profile_report.html"

        # Make sure the target directory exists before writing the report;
        # without this a fresh checkout with no ./reports folder fails at
        # write time and the error is swallowed by the handler below.
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)

        # Create a profile report with custom title
        print("\nGenerating profile report...")
        profile = ProfileReport(
            df,
            title=f"CVE-CWE Mapping Analysis Report: {source_name}",
            explorative=True,  # Enable detailed mode for deeper analysis
            minimal=False,     # Don't use minimal report
            sensitive=False    # Don't consider data as sensitive/confidential
        )

        # Save the report to HTML
        profile.to_file(output_filename)

        return df, output_filename

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # continue with the next file instead of aborting the notebook.
        print(f"Error processing file: {e}")
        return None, None

In [3]:
# Input TSV files to profile (year tags carried over from the original notes)
file1, file2 = (
    "data_in/cti-rcm.tsv",       # 2024
    "data_in/cti-rcm-2021.tsv",  # 2021
)

# Run the analysis on file1 first, then file2; each call yields the loaded
# dataframe together with the path of its generated report
df1, report1 = analyze_cve_data(file1)
df2, report2 = analyze_cve_data(file2)


===== Analyzing file: data_in/cti-rcm.tsv =====
Dataset shape: (1000, 4)

First 3 rows of the dataset:
                                               URL  \
0  https://nvd.nist.gov/vuln/detail/CVE-2024-23848   
1  https://nvd.nist.gov/vuln/detail/CVE-2023-38738   
2  https://nvd.nist.gov/vuln/detail/CVE-2024-22137   

                                         Description  \
0  In the Linux kernel through 6.7.1, there is a ...   
1  IBM OpenPages with Watson 8.3 and 9.0 could pr...   
2  Improper Neutralization of Input During Web Pa...   

                                              Prompt       GT  
0  Analyze the following CVE description and map ...  CWE-416  
1  Analyze the following CVE description and map ...  CWE-257  
2  Analyze the following CVE description and map ...   CWE-79  

Generating profile report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


===== Analyzing file: data_in/cti-rcm-2021.tsv =====
Dataset shape: (1000, 4)

First 3 rows of the dataset:
                                               URL  \
0  https://nvd.nist.gov/vuln/detail/CVE-2021-36335   
1  https://nvd.nist.gov/vuln/detail/CVE-2021-33726   
2  https://nvd.nist.gov/vuln/detail/CVE-2021-38681   

                                         Description  \
0  Dell EMC CloudLink 7.1 and all prior versions ...   
1  A vulnerability has been identified in SINEC N...   
2  A reflected cross-site scripting (XSS) vulnera...   

                                              Prompt      GT  
0  Analyze the following CVE description and map ...  CWE-20  
1  Analyze the following CVE description and map ...  CWE-22  
2  Analyze the following CVE description and map ...  CWE-79  

Generating profile report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]