# Data ingestion from the GitHub repository

In [None]:
import os
import zipfile
import pandas as pd

# Clone the repository
!git clone https://github.com/justakazh/CVE_Database.git

# Change directory
os.chdir('CVE_Database-main')

# List files
files = os.listdir()
print("Files in repository:", files)

# Extract zip files
if not os.path.exists('extracted_files'):
    os.mkdir('extracted_files')

for file in files:
    if file.endswith('.zip'):
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall('extracted_files')

# List extracted files
extracted_files = os.listdir('extracted_files')
print("Extracted files:", extracted_files)



# Data Extraction

In [None]:
import os
import json
import csv
import time

# Destination CSV for the extracted rows (relative to the current working directory)
output_csv_path = "extracted_data_cve_key_data.csv"

# Top-level directory that is walked recursively for CVE JSON files
# NOTE(review): machine-specific absolute Windows path; adjust per environment
root_folder_path = r"E:\NLP\CVE_Database-main\CVE_Database-main"

# Function to extract key CVE data from a JSON file
def extract_cve_data(json_file_path):
    """Read one CVE JSON file and return its key fields as a tuple.

    The file is expected to hold a JSON object with a top-level ``cve`` object.
    Scalar fields that are absent are reported as the string ``'N/A'``.

    Args:
        json_file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        A 10-tuple ``(cve_id, source_identifier, published_date,
        last_modified_date, vuln_status, description, cvss_score,
        weaknesses_str, configurations_str, references_str)``, or ``None``
        when the file cannot be read, is not valid JSON, or its top-level
        value is not a JSON object. Multi-valued fields are joined with
        ``"; "``.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading {json_file_path}: {e}")
        return None

    if not isinstance(data, dict):
        # A list/scalar at the top level has no 'cve' member to read from.
        print(f"Error reading {json_file_path}: top-level JSON value is not an object")
        return None

    # A missing, null or non-object 'cve' is treated as empty so every field
    # falls back to 'N/A' instead of raising.
    cve = data.get('cve')
    if not isinstance(cve, dict):
        cve = {}

    cve_id = cve.get('id', 'N/A')
    source_identifier = cve.get('sourceIdentifier', 'N/A')
    published_date = cve.get('published', 'N/A')
    last_modified_date = cve.get('lastModified', 'N/A')
    vuln_status = cve.get('vulnStatus', 'N/A')

    # First English description, if any
    description = next(
        (desc.get('value', 'N/A')
         for desc in cve.get('descriptions') or []
         if desc.get('lang') == 'en'),
        'N/A',
    )

    # CVSS base score: V2 is preferred (as before); newer metric versions are
    # used only when no V2 entry exists, so records that carry only v3.x/v4.0
    # metrics no longer come out as 'N/A'.
    cvss_score = 'N/A'
    metrics = cve.get('metrics') or {}
    for metric_key in ('cvssMetricV2', 'cvssMetricV31', 'cvssMetricV30', 'cvssMetricV40'):
        entries = metrics.get(metric_key) or []
        if entries:
            cvss_score = (entries[0].get('cvssData') or {}).get('baseScore', 'N/A')
            break

    # Weaknesses (CWE): first description value of each weakness entry.
    # An empty or missing description list yields 'N/A' rather than IndexError.
    weaknesses = []
    for weak in cve.get('weaknesses') or []:
        weak_descriptions = weak.get('description') or [{}]
        weaknesses.append(weak_descriptions[0].get('value', 'N/A'))
    weaknesses_str = "; ".join(weaknesses) if weaknesses else 'N/A'

    # Configuration: criteria of every cpeMatch entry flagged as vulnerable
    configurations = []
    configurations_data = cve.get('configurations', [])
    if isinstance(configurations_data, list):
        for config in configurations_data:
            for node in config.get('nodes', []):
                for cpe_match in node.get('cpeMatch', []):
                    if cpe_match.get('vulnerable', False):
                        configurations.append(cpe_match.get('criteria', 'N/A'))
    configurations_str = "; ".join(configurations) if configurations else 'N/A'

    # References: entries without a URL are skipped so the join never sees None
    references = [ref['url'] for ref in cve.get('references') or [] if ref.get('url')]
    references_str = "; ".join(references) if references else 'N/A'

    return (cve_id, source_identifier, published_date, last_modified_date, vuln_status, description,
            cvss_score, weaknesses_str, configurations_str, references_str)

# Walk a directory tree and dump every CVE JSON file it contains into one CSV
def process_folder_to_csv(root_folder_path, output_csv_path):
    """Write one CSV row per successfully parsed ``.json`` file under a folder.

    Args:
        root_folder_path: Directory that is walked recursively with ``os.walk``.
        output_csv_path: CSV file to create (overwritten if it already exists).

    The header row is always written. Files for which ``extract_cve_data``
    returns a falsy value are reported on stdout and skipped.
    """
    header = ['CVE ID', 'Source Identifier', 'Published Date', 'Last Modified Date', 'Vulnerability Status',
              'Description', 'CVSS Score', 'Weaknesses', 'Configuration', 'References']

    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(header)

        for current_dir, _subdirs, names in os.walk(root_folder_path):
            print(f"Processing folder: {current_dir}")

            for name in names:
                # Anything that is not a JSON file is ignored
                if not name.endswith('.json'):
                    continue

                json_file_path = os.path.join(current_dir, name)
                print(f"Processing file: {json_file_path}")  # Debugging - check file path

                row = extract_cve_data(json_file_path)
                if not row:
                    print(f"Failed to extract data from file: {json_file_path}")
                    continue

                writer.writerow(row)

            # Short pause after each directory to throttle the run
            time.sleep(0.1)  # Adjust the sleep time as per system capacity

# Run the extraction over the configured folder, then report where the CSV went
process_folder_to_csv(
    root_folder_path=root_folder_path,
    output_csv_path=output_csv_path,
)

print(f"Data extraction completed. Output saved to {output_csv_path}")
