# Assignment 6.2: Preparing Data for Final Team Project 
## Group 2:
- ***Dipraj Bista***
- ***Landon Padgett***
- ***Ghassan Seba***

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Establishing Text Cleaning and Tokenization Functions for SEC Data
*This code defines functions to clean, tokenize, and lemmatize SEC filing text, removing boilerplate and irrelevant content. These functions form a preprocessing pipeline to prepare text for analysis.*

In [2]:
# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the English stopwords from NLTK
stopwords_list = stopwords.words('english')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the stopwords, including custom stopwords specific to the domain
custom_stopwords = set(stopwords.words('english')).union({
    'herein', 'thereof', 'aforementioned', 'form', '8-k', 'nvidia', 'corporation', 'securities', 'exchange',
    'commission', 'share', 'meeting', 'vote', 'registrant', 'item', 'section', 'pursuant', 'act', 'inc', 'llc'
})

def clean_text(text):
    """Performs enhanced text cleaning to remove legal boilerplate and irrelevant characters."""
    # Remove common boilerplate phrases and legal text patterns
    text = re.sub(r'UNITED STATES SECURITIES AND EXCHANGE COMMISSION', '', text, flags=re.IGNORECASE)
    text = re.sub(r'FORM 8-K', '', text, flags=re.IGNORECASE)
    text = re.sub(r'CURRENT REPORT PURSUANT TO SECTION [0-9]+ OF THE SECURITIES EXCHANGE ACT OF 1934', '', text, flags=re.IGNORECASE)

    # Add spaces between numbers and words (ensures proper spacing in cases like 'for398number')
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)
    text = re.sub(r'(?<=\d)(?=[a-zA-Z])', ' ', text)
    
    # Normalize dates and numbers
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', ' [DATE] ', text)
    text = re.sub(r'\b\d+\b', '', text)
    
    # Remove unwanted characters and special symbols
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\\x\w{2}', '', text)
    text = re.sub(r'[☐]', '', text)
    
    # Remove punctuation, while keeping periods for potential sentence segmentation
    text = re.sub(r'[^\w\s\.\-]', '', text)
    
    # Remove single characters followed by a period (e.g., 'a.', 'm.', 'c.')
    text = re.sub(r'\b[a-zA-Z]\.\b', '', text)
    
    return text

def tokenize(text):
    """Tokenizes, lemmatizes, and removes stopwords, including filtering out single characters."""
    tokens = re.split(r'\s+', text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in custom_stopwords and len(token) > 1]
    return tokens

def prepare(text, pipeline):
    """Prepares text by applying a sequence of cleaning and transformation steps."""
    for transform in pipeline:
        text = transform(text)
    return text

# Define the text preprocessing pipeline
pipeline = [clean_text, tokenize]

# Function to preprocess the 'Disclosure Text' column in the existing DataFrame
def preprocess_disclosure_text(df):
    """Applies text preprocessing to the 'Disclosure Text' column of the existing DataFrame."""
    df['Processed Disclosure Text'] = df['Disclosure Text'].apply(
        lambda x: ' '.join(prepare(x, pipeline)) if isinstance(x, str) else ''
    )
    return df


## Functions for Extracting and Generating URLs for SEC Form 8-K Filings
*This code defines functions to extract data from HTML tables, parse SEC filing content, and generate URLs for Form 8-K filings. It processes HTML elements to extract disclosure items, converts them into structured text, and constructs URLs for accessing specific filings using the SEC's index file.*

In [3]:
# Extract data from HTML table and format it as a string
def extract_table_data(table):
    rows = table.find_all('tr')
    table_text = [' | '.join(cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])) for row in rows]
    return '\n'.join(table_text)

# Extract disclosure items and tables from HTML content
def extract_nvda_disclosures_xml(soup):
    items_content = {}
    current_item = None
    div_elements = soup.find_all(['div', 'table'])

    for element in div_elements:
        if element.name == 'div':
            text = element.get_text(strip=True)
            if "Item" in text:
                current_item = text
                items_content[current_item] = []
            elif current_item:
                items_content[current_item].append(text)
        elif element.name == 'table':  # Process table data
            table_data = extract_table_data(element)
            if current_item:
                items_content[current_item].append(table_data)

    for item, content_list in items_content.items():
        items_content[item] = ' '.join(content_list)

    return items_content

# Generate Form 8-K filing URLs using the SEC's index.json file
def generate_form_8k_urls(cik_number, accession_numbers):
    filing_urls = []
    base_url = f'https://www.sec.gov/Archives/edgar/data/{int(cik_number)}/'
    headers = {"User-Agent": "MyDataRetriever/1.0 (Contact: your_email@example.com)"}

    for accession_number in accession_numbers:
        formatted_accession = accession_number.replace('-', '')
        index_url = f"{base_url}{formatted_accession}/index.json"
        response = requests.get(index_url, headers=headers)
        time.sleep(1)  # Rate limiting

        if response.status_code == 200:
            try:
                data = response.json()
                for file_info in data.get('directory', {}).get('item', []):
                    if file_info['name'].lower().endswith('.htm') and '8-k' in file_info['name'].lower():
                        filing_urls.append(f"{base_url}{formatted_accession}/{file_info['name']}")
            except ValueError:
                print(f"Error parsing JSON at {index_url}")
        else:
            print(f"Failed to retrieve index for {accession_number}, status code {response.status_code}")

    return filing_urls

## Retrieving and Formatting Company Data from the SEC Edgar Database

*This code retrieves company ticker data from the SEC Edgar database, processes it into a pandas DataFrame, and formats CIK numbers with leading zeros for URL compatibility. The resulting DataFrame is then displayed.*

In [4]:
# Define Base URL
base_url = r"https://www.sec.gov/Archives/edgar/data"

# Set up the headers with User-Agent
headers = {"User-Agent": "MyDataRetriever/1.0 (Contact: gseba@sandiego.edu)"}

# Get all companies data
tickers_json = requests.get("https://www.sec.gov/files/company_tickers.json", headers=headers)

In [5]:
# Format response as a dictionary and get first key/value
frist_entry = tickers_json.json()['0']

# Get CIK Numbers 
raw_cik = tickers_json.json()['0']['cik_str']

# Convert to a data frame
cik_df = pd.DataFrame.from_dict(tickers_json.json(), orient='index')

# Add leading zeros to CIK numbers for URL formatting
cik_df['cik_str'] = cik_df['cik_str'].astype(str).str.zfill(10)

# Display Data Frame
cik_df

Unnamed: 0,cik_str,ticker,title
0,0000320193,AAPL,Apple Inc.
1,0000789019,MSFT,MICROSOFT CORP
2,0001045810,NVDA,NVIDIA CORP
3,0001652044,GOOGL,Alphabet Inc.
4,0001018724,AMZN,AMAZON COM INC
...,...,...,...
10162,0001849820,KITTW,"Nauticus Robotics, Inc."
10163,0001276187,ET-PI,Energy Transfer LP
10164,0001571283,REXR-PC,"Rexford Industrial Realty, Inc."
10165,0001855756,LILMW,Lilium N.V.


## Automating SEC Filing Data Retrieval and URL Generation
*This code retrieves and processes company filing metadata from the SEC Edgar database using a CIK number to generate URLs for specific filings. It organizes the data into a DataFrame, identifies filing types, and filters for '8-K' reports, displaying accession numbers to ensure accurate file access via URL links.*

In [6]:
# Find CIK number using a company's ticker symbol
def get_cik_number_by_ticker(df, ticker_symbol):
    """
    Parameters:
    df (pd.DataFrame): DataFrame with 'ticker' and 'cik_str' columns.
    ticker_symbol (str): Company's ticker symbol.

    Returns:
    str: CIK number if the ticker symbol is found, or a message if not found.
    """
    result = df.loc[df['ticker'] == ticker_symbol, 'cik_str']  # Filter by ticker symbol
    
    if not result.empty:
        cik_number = result.iloc[0]  # Extract first matching CIK number
        return cik_number
    else:
        return f"Ticker symbol '{ticker_symbol}' not found in the dataset."

# Search for CIK number using ticker
ticker_to_search = 'NVDA'
cik_number = get_cik_number_by_ticker(cik_df, ticker_to_search)

# Retrieve company filing metadata using CIK number
def get_filing_metadata(cik_number):
    """
    Parameters:
    cik_number (str): Company's CIK number.

    Returns:
    dict or None: Parsed JSON data if successful, otherwise None.
    """
    url = f'https://data.sec.gov/submissions/CIK{cik_number}.json'
    response = requests.get(url, headers=headers)

    # Check response status before parsing JSON
    if response.status_code == 200:
        try:
            data = response.json()
            print("Successfully retrieved JSON data.")
            return data
        except ValueError as e:
            print("Error parsing JSON:", e)
            print("Response Content (Preview):", response.text[:500])
            return None
    else:
        print(f"Failed to retrieve data. Status Code: {response.status_code}")
        print("Response Content (Preview):", response.text[:500])
        return None

# Retrieve filing metadata
filing_metadata = get_filing_metadata(cik_number)

# Process filing metadata if retrieved successfully
if filing_metadata:
    print(filing_metadata.keys())  # Display top-level keys in JSON
    filings = filing_metadata.get('filings', {}).get('recent', None)

    if filings:
        # Create DataFrame from recent filings
        forms_df = pd.DataFrame.from_dict(filings)
        print(forms_df.columns)  # Display DataFrame columns
    else:
        print("No recent filings found in JSON data.")


Successfully retrieved JSON data.
dict_keys(['cik', 'entityType', 'sic', 'sicDescription', 'ownerOrg', 'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists', 'name', 'tickers', 'exchanges', 'ein', 'description', 'website', 'investorWebsite', 'category', 'fiscalYearEnd', 'stateOfIncorporation', 'stateOfIncorporationDescription', 'addresses', 'phone', 'flags', 'formerNames', 'filings'])
Index(['accessionNumber', 'filingDate', 'reportDate', 'acceptanceDateTime',
       'act', 'form', 'fileNumber', 'filmNumber', 'items', 'core_type', 'size',
       'isXBRL', 'isInlineXBRL', 'primaryDocument', 'primaryDocDescription'],
      dtype='object')


In [7]:
# Explore resulting data frame
forms_df

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,core_type,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001045810-24-000296,2024-10-07,2024-10-03,2024-10-07T17:27:24.000Z,,4,,,,4,6816,0,0,xslF345X05/wk-form4_1728336437.xml,FORM 4
1,0001045810-24-000294,2024-10-01,2024-09-27,2024-10-01T16:19:18.000Z,,4,,,,4,6815,0,0,xslF345X05/wk-form4_1727813951.xml,FORM 4
2,0001045810-24-000292,2024-09-26,2024-09-24,2024-09-26T17:22:24.000Z,,4,,,,4,6818,0,0,xslF345X05/wk-form4_1727385738.xml,FORM 4
3,0001961863-24-000324,2024-09-24,,2024-09-24T17:58:48.000Z,33,144,000-23985,241320989,,144,247027,0,0,xsl144X01/primary_doc.xml,
4,0001045810-24-000290,2024-09-24,2024-09-20,2024-09-24T16:12:39.000Z,,4,,,,4,10129,0,0,xslF345X05/wk-form4_1727208751.xml,FORM 4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0001045810-16-000296,2016-08-17,2016-08-15,2016-08-17T16:05:26.000Z,,4,,,,4,5087,0,0,xslF345X03/wf-form4_147146431382883.xml,FORM 4
997,0001045810-16-000294,2016-08-12,2016-08-10,2016-08-12T16:11:12.000Z,,4,,,,4,14123,0,0,xslF345X03/wf-form4_147103265663155.xml,FORM 4
998,0001045810-16-000292,2016-08-11,2016-08-11,2016-08-11T16:22:20.000Z,34,8-K,000-23985,161824893,"2.02,9.01",8-K,843096,0,0,form8-kq2fy17.htm,FORM 8-K
999,0001045810-16-000288,2016-07-15,2016-07-13,2016-07-15T16:43:32.000Z,,4,,,,4,14124,0,0,xslF345X03/wf-form4_146861539906570.xml,FORM 4


In [8]:
# Explore the unique values in the 'form' column
forms_df['form'].unique()

array(['4', '144', '10-Q', '8-K', '13F-HR', '3', 'SC 13G', 'SD', 'ARS',
       'DEFA14A', 'DEF 14A', '10-K', 'SC 13G/A', '4/A', 'UPLOAD',
       'CORRESP', '144/A', 'EFFECT', 'S-3/A', 'S-3', 'S-8', 'PRE 14A',
       '5', '424B5', 'FWP', '5/A', '8-K/A', 'S-3ASR', 'DFAN14A'],
      dtype=object)

In [9]:
# Filter the DataFrame for the '8-K' form
eight_k_filings = forms_df[forms_df['form'] == '8-K']

# Check if there are any '8-K' reports and display the accession numbers
if not eight_k_filings.empty:
    accession_numbers = eight_k_filings['accessionNumber'].tolist()
    print("Accession numbers for '8-K' reports:", accession_numbers)
else:
    print("No '8-K' reports found for the selected CIK number.")

Accession numbers for '8-K' reports: ['0001045810-24-000262', '0001045810-24-000206', '0001045810-24-000144', '0001045810-24-000113', '0001045810-24-000069', '0001045810-24-000028', '0001045810-23-000225', '0001045810-23-000221', '0001045810-23-000217', '0001045810-23-000171', '0001045810-23-000164', '0001045810-23-000146', '0001045810-23-000087', '0001045810-23-000039', '0001045810-23-000014', '0001045810-22-000163', '0001045810-22-000151', '0001045810-22-000146', '0001045810-22-000136', '0001045810-22-000133', '0001045810-22-000088', '0001045810-22-000073', '0001045810-22-000023', '0001045810-22-000008', '0001045810-22-000005', '0001045810-21-000160', '0001045810-21-000128', '0001045810-21-000117', '0001193125-21-192149', '0001045810-21-000075', '0001045810-21-000063', '0001045810-21-000056', '0001045810-21-000047', '0001045810-21-000034', '0001045810-21-000007', '0001045810-20-000187', '0001045810-20-000181', '0001193125-20-244601', '0001045810-20-000145', '0001045810-20-000127', '0

## Scraping and Processing SEC Form 8-K Filings from the EDGAR Site
*This code scrapes Form 8-K filings for a given company using its ticker symbol from the SEC EDGAR site. It generates URLs for the filings, retrieves the content, and extracts disclosure items using BeautifulSoup. The scraped data, including CIK numbers, accession numbers, and disclosure text, is then organized into a DataFrame for analysis.*

In [10]:
# Process filings
ticker_symbol = 'NVDA'
cik_number = get_cik_number_by_ticker(cik_df, ticker_symbol)
filing_metadata = get_filing_metadata(cik_number)

if filing_metadata:
    filings = filing_metadata.get('filings', {}).get('recent', None)

    if filings:
        forms_df = pd.DataFrame.from_dict(filings)
        eight_k_filings = forms_df[forms_df['form'] == '8-K']

        if not eight_k_filings.empty:
            accession_numbers = eight_k_filings['accessionNumber'].tolist()
            filing_urls = generate_form_8k_urls(cik_number, accession_numbers)

            headers = {"User-Agent": "MyDataRetriever/1.0 (Contact: your_email@example.com)"}
            for url in filing_urls:
                response = requests.get(url, headers=headers)
                time.sleep(1)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    disclosure = extract_nvda_disclosures_xml(soup)
                    print(f"{ticker_symbol} Form 8-K Disclosures:")
                    for item, content in disclosure.items():
                        print(f"{item}: {content}\n")
                else:
                    print(f"Failed to fetch content for {url}")

            # Print separator for better readability
            print("\n" + "-" * 40 + "\n")
        else:
            print("No '8-K' reports found.")
    else:
        print("No recent filings found.")
else:
    print("Failed to retrieve filing metadata.")


Successfully retrieved JSON data.
NVDA Form 8-K Disclosures:
false000104581000010458102020-06-092020-06-09UNITED STATESSECURITIES AND EXCHANGE COMMISSIONWASHINGTON, DC 20549______________FORM8-KCURRENT REPORTPURSUANT TO SECTION 13 OR 15(d) OFTHE SECURITIES EXCHANGE ACT OF 1934Date of Report (Date of earliest event reported):June 9, 2020NVIDIA CORPORATION(Exact name of registrant as specified in its charter)Delaware0-2398594-3177549(State or other jurisdiction(Commission(IRS Employerof incorporation)File Number)Identification No.)2788 San Tomas Expressway,Santa Clara,CA95051(Address of principal executive offices)   (Zip Code)Registrant’s telephone number, including area code:(408)486-2000Not Applicable(Former name or former address, if changed since last report)Check the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the following provisions:☐Written communications pursuant to Rule 425 under the Se

In [11]:
# Store results for DataFrame creation
data_list = []

if filing_metadata:
    filings = filing_metadata.get('filings', {}).get('recent', None)

    if filings:
        forms_df = pd.DataFrame.from_dict(filings)
        eight_k_filings = forms_df[forms_df['form'] == '8-K']

        if not eight_k_filings.empty:
            accession_numbers = eight_k_filings['accessionNumber'].tolist()
            filing_urls = generate_form_8k_urls(cik_number, accession_numbers)

            headers = {"User-Agent": "MyDataRetriever/1.0 (Contact: your_email@example.com)"}
            for url in filing_urls:
                response = requests.get(url, headers=headers)
                time.sleep(1)

                if response.status_code == 200:
                    filing_content = response.text
                    soup = BeautifulSoup(filing_content, 'html.parser')
                    disclosure = extract_nvda_disclosures_xml(soup)

                    # Append formatted disclosure data to the list
                    for item, content in disclosure.items():
                        formatted_content = content.replace(' | ', '\n')  # Ensure table data is separated properly
                        data_list.append({
                            'CIK': cik_number,
                            'Accession Number': url.split('/')[-2],
                            'Item': item,
                            'Disclosure Text': formatted_content
                        })
                else:
                    print(f"Failed to fetch filing content for URL: {url}")
        else:
            print("No '8-K' reports found for the selected CIK number.")
    else:
        print("No recent filings found in the JSON data.")
else:
    print("Failed to retrieve filing metadata.")

# Create DataFrame from the results
filing_df = pd.DataFrame(data_list, columns=['CIK', 'Accession Number', 'Item', 'Disclosure Text'])

In [12]:
# Display the full text of the 'Item' column in your DataFrame
filing_df.head()

Unnamed: 0,CIK,Accession Number,Item,Disclosure Text
0,1045810,104581020000103,false000104581000010458102020-06-092020-06-09U...,false000104581000010458102020-06-092020-06-09 ...
1,1045810,104581020000103,Item 5.02. Departure of Directors or Certain O...,Amendment and Restatement of Amended and Resta...
2,1045810,104581020000103,Item 5.07. Submission of Matters to a Vote of ...,"At the 2020 Annual Meeting, the following pro..."
3,1045810,104581020000103,Item 9.01 Financial Statements and Exhibits.,(d) Exhibits Exhibit NumberDescription...
4,1045810,104581020000052,false000104581000010458102020-04-272020-04-27U...,false000104581000010458102020-04-272020-04-27 ...


## Preprocessing SEC Form 8-K Data with Text Cleaning and Tokenization
*This code applies text cleaning, tokenization, and lemmatization functions to preprocess Form 8-K disclosure data from the SEC EDGAR site. It organizes the data by converting report dates, merging columns, and handling missing values. The cleaned and tokenized disclosure text is combined with other filing details into a structured DataFrame for further analysis, ensuring proper date handling and eliminating incomplete entries.*

In [13]:
# Apply preprocessing to the 'Disclosure Text' column in your existing DataFrame
filing_df = preprocess_disclosure_text(filing_df)

# Display the DataFrame with the processed disclosure text
filing_df.head()

Unnamed: 0,CIK,Accession Number,Item,Disclosure Text,Processed Disclosure Text
0,1045810,104581020000103,false000104581000010458102020-06-092020-06-09U...,false000104581000010458102020-06-092020-06-09 ...,false ---- washington dc ______________ -k cur...
1,1045810,104581020000103,Item 5.02. Departure of Directors or Certain O...,Amendment and Restatement of Amended and Resta...,amendment restatement amended restated equity ...
2,1045810,104581020000103,Item 5.07. Submission of Matters to a Vote of ...,"At the 2020 Annual Meeting, the following pro...",annual following proposal adopted margin indic...
3,1045810,104581020000103,Item 9.01 Financial Statements and Exhibits.,(d) Exhibits Exhibit NumberDescription...,exhibit exhibit numberdescription amended rest...
4,1045810,104581020000052,false000104581000010458102020-04-272020-04-27U...,false000104581000010458102020-04-272020-04-27 ...,false ---- washington dc ______________ -k cur...


In [14]:
# Convert 'reportDate' to datetime
filing_df['reportDate'] = pd.to_datetime(forms_df['reportDate'], errors='coerce')

# Reorder columns with 'reportDate' first
columns_order = ['reportDate', 'CIK', 'Accession Number', 'Item', 'Disclosure Text', 'Processed Disclosure Text']
filing_df = filing_df[columns_order]

# Merge 'reportDate' into 'filing_df' using 'Accession Number' with suffixes
filing_df = filing_df.merge(forms_df[['accessionNumber', 'reportDate']],
                            left_on='Accession Number',
                            right_on='accessionNumber',
                            how='left',
                            suffixes=('_filing', '_form'))

# Drop redundant 'accessionNumber' and 'reportDate_form' columns
filing_df.drop(columns=['accessionNumber', 'reportDate_form'], inplace=True)

# Rename 'reportDate_filing' to 'reportDate'
filing_df.rename(columns={'reportDate_filing': 'reportDate'}, inplace=True)

# Display updated DataFrame
filing_df.head()

Unnamed: 0,reportDate,CIK,Accession Number,Item,Disclosure Text,Processed Disclosure Text
0,2024-10-03,1045810,104581020000103,false000104581000010458102020-06-092020-06-09U...,false000104581000010458102020-06-092020-06-09 ...,false ---- washington dc ______________ -k cur...
1,2024-09-27,1045810,104581020000103,Item 5.02. Departure of Directors or Certain O...,Amendment and Restatement of Amended and Resta...,amendment restatement amended restated equity ...
2,2024-09-24,1045810,104581020000103,Item 5.07. Submission of Matters to a Vote of ...,"At the 2020 Annual Meeting, the following pro...",annual following proposal adopted margin indic...
3,NaT,1045810,104581020000103,Item 9.01 Financial Statements and Exhibits.,(d) Exhibits Exhibit NumberDescription...,exhibit exhibit numberdescription amended rest...
4,2024-09-20,1045810,104581020000052,false000104581000010458102020-04-272020-04-27U...,false000104581000010458102020-04-272020-04-27 ...,false ---- washington dc ______________ -k cur...


In [15]:
# Sort by 'Accession Number' and 'reportDate'
filing_df = filing_df.sort_values(['Accession Number', 'reportDate'])

# Fill missing 'reportDate' values using closest dates in each group
filing_df['reportDate'] = filing_df.groupby('Accession Number')['reportDate'].transform(lambda x: x.ffill().bfill())

# Display the updated DataFrame
filing_df.head()

Unnamed: 0,reportDate,CIK,Accession Number,Item,Disclosure Text,Processed Disclosure Text
80,2024-07-02,1045810,104581016000292,Item 2.02 Results of Operations and Financial ...,"On August 11, 2016, NVIDIA Corporation, or th...",august company issued press release announcing...
81,2024-07-02,1045810,104581016000292,Item 9.01 Financial Statements and Exhibits.,(d) Exhibits ExhibitDescription99.1Press Rele...,exhibit exhibitdescription press release dated...
78,NaT,1045810,104581016000343,Item 2.02 Results of Operations and Financial ...,"On November 10, 2016, NVIDIA Corporation, or ...",november company issued press release announci...
79,NaT,1045810,104581016000343,Item 9.01 Financial Statements and Exhibits.,(d) Exhibits ExhibitDescription99.1Press Rele...,exhibit exhibitdescription press release dated...
77,2024-07-05,1045810,104581016000356,Item 9.01. Financial Statements and Exhibits.,(d) Exhibits Exhibit NumberDescription3.1Byl...,exhibit exhibit numberdescription bylaw amende...


In [16]:
# Drop rows where 'reportDate' is NaT
filing_df = filing_df.dropna(subset=['reportDate'])

# Display the updated DataFrame
filing_df.head()

Unnamed: 0,reportDate,CIK,Accession Number,Item,Disclosure Text,Processed Disclosure Text
80,2024-07-02,1045810,104581016000292,Item 2.02 Results of Operations and Financial ...,"On August 11, 2016, NVIDIA Corporation, or th...",august company issued press release announcing...
81,2024-07-02,1045810,104581016000292,Item 9.01 Financial Statements and Exhibits.,(d) Exhibits ExhibitDescription99.1Press Rele...,exhibit exhibitdescription press release dated...
77,2024-07-05,1045810,104581016000356,Item 9.01. Financial Statements and Exhibits.,(d) Exhibits Exhibit NumberDescription3.1Byl...,exhibit exhibit numberdescription bylaw amende...
76,2024-07-05,1045810,104581016000356,Item 5.03. Amendments to Articles of Incorpora...,"On November 29, 2016, the Board of Directors (...",november board director board amended restated...
73,2024-07-09,1045810,104581016000358,Item 3.02. Unregistered Sales of Equity Securi...,,


In [17]:
# Count the number of NaT values in the 'reportDate' column
num_nat = filing_df['reportDate'].isna().sum()

print(f"Number of NaT values in the 'reportDate' column: {num_nat}")

Number of NaT values in the 'reportDate' column: 0
