In [2]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

In [40]:
import requests
import pandas as pd
import time

headers = {
    'User-Agent': 'Brian Lu brian901231@gmail.com'
}

def get_pharma_companies():
    """Get list of pharmaceutical companies with their CIK codes"""
    print("Getting pharmaceutical companies...")
    
    # Get company list
    response = requests.get("https://www.sec.gov/files/company_tickers.json", headers=headers)
    companies_data = response.json()
    
    pharma_companies = []
    pharma_sic_codes = [2833, 2834, 2835, 2836]
    
    for company in companies_data.values():
        cik = str(company['cik_str']).zfill(10)
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                data = response.json()
                sic = data.get('sic')
                if sic and int(sic) in pharma_sic_codes:
                    pharma_companies.append({
                        'company_name': company['title'],
                        'cik': cik
                    })
                    print(f"Found pharma company: {company['title']}")
            time.sleep(0.1)
        except Exception as e:
            print(f"Error processing {company['title']}: {e}")
    
    return pd.DataFrame(pharma_companies)

def get_10k_text(cik, accession_no):
    """Get raw text content of 10-K filing"""
    try:
        url = f"https://www.sec.gov/Archives/edgar/data/{cik.lstrip('0')}/{accession_no}.txt"
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve text for CIK {cik}, accession {accession_no}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error getting text for CIK {cik}, accession {accession_no}: {e}")
        return None

In [None]:
# Get pharmaceutical companies
companies_df = get_pharma_companies()
print(f"\nFound {len(companies_df)} pharmaceutical companies")

# export companies_df to csv
companies_df.to_csv('companies_df.csv', index=False)

Getting pharmaceutical companies...
Found pharma company: ELI LILLY & Co
Found pharma company: JOHNSON & JOHNSON
Found pharma company: NOVO NORDISK A S
Found pharma company: AbbVie Inc.
Found pharma company: ASTRAZENECA PLC
Found pharma company: ABBOTT LABORATORIES
Found pharma company: Merck & Co., Inc.
Found pharma company: NOVARTIS AG
Found pharma company: AMGEN INC
Found pharma company: PFIZER INC
Found pharma company: Sanofi
Found pharma company: GILEAD SCIENCES, INC.
Found pharma company: VERTEX PHARMACEUTICALS INC / MA
Found pharma company: BRISTOL MYERS SQUIBB CO
Found pharma company: GSK plc
Found pharma company: Zoetis Inc.
Found pharma company: REGENERON PHARMACEUTICALS, INC.
Found pharma company: TAKEDA PHARMACEUTICAL CO LTD
Found pharma company: ARGENX SE
Found pharma company: IDEXX LABORATORIES INC /DE
Found pharma company: ALNYLAM PHARMACEUTICALS, INC.
Found pharma company: BioNTech SE
Found pharma company: BeiGene, Ltd.
Error processing MARKEL GROUP INC.: ('Connection a

Note we found that there are 1037 pharama companies

In [41]:
def create_10k_dataset(companies_df, start_year=2010, max_companies=None):
    """Create dataset with 10-K texts for all pharma companies"""
    
    print(f"\nStarting with {len(companies_df)} companies")

    # Limit number of companies if specified
    if max_companies:
        companies_df = companies_df.head(max_companies)
        print(f"Processing first {max_companies} companies as a test")
    

    # Create dataset
    dataset = []
    
    for _, company in companies_df.iterrows():
        print(f"\nProcessing {company['company_name']}...")
        
        # Get company's filings
        url = f"https://data.sec.gov/submissions/CIK{company['cik']}.json"
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                data = response.json()
                recent_filings = data.get('filings', {}).get('recent', {})
                
                if recent_filings:
                    forms = recent_filings.get('form', [])
                    for idx, form in enumerate(forms):
                        if form == '10-K':
                            year = int(recent_filings.get('filingDate', [])[idx][:4])
                            if year >= start_year:
                                accession_no = recent_filings.get('accessionNumber', [])[idx]
                                # Get 10-K text
                                text = get_10k_text(company['cik'], accession_no)
                                if text:
                                    dataset.append({
                                        'company_name': company['company_name'],
                                        'cik': company['cik'],
                                        'year': year,
                                        '10k_text': text
                                    })
                                else:
                                    print("Failed to get text content")
                
            time.sleep(0.1)
        except Exception as e:
            print(f"Error processing filings for {company['company_name']}: {e}")
    
    # Create final DataFrame
    df = pd.DataFrame(dataset)
    df = df.sort_values(['company_name', 'year'])
    
    print(f"\nDataset created with {len(df)} 10-K filings")
    return df

# Create the dataset
pharma_10k_df = create_10k_dataset(companies_df, start_year=2010)

# Save the dataset (using pickle for large text data)
pharma_10k_df.to_pickle('pharma_10k_dataset.pkl')

# Display sample (without the text content)
print("\nSample of the dataset:")
display(pharma_10k_df)


Starting with 1037 companies

Processing ELI LILLY & Co...

Processing JOHNSON & JOHNSON...

Processing NOVO NORDISK A S...

Processing AbbVie Inc....

Processing ASTRAZENECA PLC...

Processing ABBOTT LABORATORIES...

Processing Merck & Co., Inc....

Processing NOVARTIS AG...

Processing AMGEN INC...

Processing PFIZER INC...

Processing Sanofi...

Processing GILEAD SCIENCES, INC....

Processing VERTEX PHARMACEUTICALS INC / MA...

Processing BRISTOL MYERS SQUIBB CO...

Processing GSK plc...

Processing Zoetis Inc....

Processing REGENERON PHARMACEUTICALS, INC....

Processing TAKEDA PHARMACEUTICAL CO LTD...

Processing ARGENX SE...

Processing IDEXX LABORATORIES INC /DE...

Processing ALNYLAM PHARMACEUTICALS, INC....

Processing BioNTech SE...

Processing BeiGene, Ltd....

Processing BIOGEN INC....

Processing TEVA PHARMACEUTICAL INDUSTRIES LTD...

Processing Royalty Pharma plc...

Processing Summit Therapeutics Inc....

Processing UNITED THERAPEUTICS Corp...

Processing INSMED Inc...


Unnamed: 0,company_name,cik,year,10k_text
4940,180 Life Sciences Corp.,0001690080,2018,<SEC-DOCUMENT>0001213900-18-003632.txt : 20180...
6068,180 Life Sciences Corp.,0001690080,2018,<SEC-DOCUMENT>0001213900-18-003632.txt : 20180...
4939,180 Life Sciences Corp.,0001690080,2019,<SEC-DOCUMENT>0001213900-19-005536.txt : 20190...
6067,180 Life Sciences Corp.,0001690080,2019,<SEC-DOCUMENT>0001213900-19-005536.txt : 20190...
4938,180 Life Sciences Corp.,0001690080,2020,<SEC-DOCUMENT>0001213900-20-008783.txt : 20200...
...,...,...,...,...
3199,vTv Therapeutics Inc.,0001641489,2020,<SEC-DOCUMENT>0001564590-20-005627.txt : 20200...
3198,vTv Therapeutics Inc.,0001641489,2021,<SEC-DOCUMENT>0001564590-21-008149.txt : 20210...
3197,vTv Therapeutics Inc.,0001641489,2022,<SEC-DOCUMENT>0001564590-22-012258.txt : 20220...
3196,vTv Therapeutics Inc.,0001641489,2023,<SEC-DOCUMENT>0001641489-23-000008.txt : 20230...


In the following section, we will use the pharma_10k_df dataset to analyze the difference in frequent words before and after the application of GDPR, which is 2018.


In [1]:
# import pharma_10k_dataset as pharma_1ok_data if needed
pharma_10k_df = pd.read_pickle('pharma_10k_dataset.pkl')
display(pharma_10k_df)

Unnamed: 0,company_name,cik,year,10k_text
4940,180 Life Sciences Corp.,0001690080,2018,<SEC-DOCUMENT>0001213900-18-003632.txt : 20180...
6068,180 Life Sciences Corp.,0001690080,2018,<SEC-DOCUMENT>0001213900-18-003632.txt : 20180...
4939,180 Life Sciences Corp.,0001690080,2019,<SEC-DOCUMENT>0001213900-19-005536.txt : 20190...
6067,180 Life Sciences Corp.,0001690080,2019,<SEC-DOCUMENT>0001213900-19-005536.txt : 20190...
4938,180 Life Sciences Corp.,0001690080,2020,<SEC-DOCUMENT>0001213900-20-008783.txt : 20200...
...,...,...,...,...
3199,vTv Therapeutics Inc.,0001641489,2020,<SEC-DOCUMENT>0001564590-20-005627.txt : 20200...
3198,vTv Therapeutics Inc.,0001641489,2021,<SEC-DOCUMENT>0001564590-21-008149.txt : 20210...
3197,vTv Therapeutics Inc.,0001641489,2022,<SEC-DOCUMENT>0001564590-22-012258.txt : 20220...
3196,vTv Therapeutics Inc.,0001641489,2023,<SEC-DOCUMENT>0001641489-23-000008.txt : 20230...


In [6]:
# Clean the 10K text, remove the html tags and special characters
from bs4 import BeautifulSoup
import re

def clean_10k_text(html_text):
    """Clean 10-K text by removing HTML tags and special characters"""
    try:
        # Parse HTML
        soup = BeautifulSoup(html_text, 'html.parser')
        
        # Get text content
        text = soup.get_text()
        
        # Remove special characters and clean up
        text = re.sub(r'&nbsp;', ' ', text)  # Replace &nbsp; with space
        text = re.sub(r'\n+', '\n', text)  # Replace multiple newlines with single
        text = re.sub(r'\s+', ' ', text)   # Replace multiple spaces with single
        text = text.strip()  # Remove leading/trailing whitespace
        
        return text
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return None

# Add new column with cleaned text
pharma_10k_df['clean_text'] = pharma_10k_df['10k_text'].apply(clean_10k_text)

# Display sample of cleaned text
print("\nSample of cleaned text:")
print(pharma_10k_df['clean_text'].iloc[0][:500])  # First 500 characters





Error cleaning text: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: expected name token at "<![`2MZEHD\\@V/WU[K'*"
Error cleaning text: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: unknown status keyword 'GBHK_ ' in marked section
Error cleaning text: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: unknown status keyword 'D8IK' in marked section
Error cleaning text: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: expected name token at '<![2F*KA%J6]?>94BA;D'
Error cleaning text: The markup you provided was rejected by the 

In [7]:
pharma_10k_df.to_pickle('pharma_10k_dataset.pkl')

In [23]:
# filter pharma_10k_df with distinct cik and year
pharma_10k_df = pharma_10k_df.drop_duplicates(subset=['cik', 'year'])
pharma_10k_df.to_pickle('pharma_10k_dataset.pkl')
display(pharma_10k_df)

Unnamed: 0,company_name,cik,year,10k_text,clean_text
4940,180 Life Sciences Corp.,0001690080,2018,<SEC-DOCUMENT>0001213900-18-003632.txt : 20180...,txt hdrsgml accession number conformed submiss...
4939,180 Life Sciences Corp.,0001690080,2019,<SEC-DOCUMENT>0001213900-19-005536.txt : 20190...,txt hdrsgml accession number conformed submiss...
4938,180 Life Sciences Corp.,0001690080,2020,<SEC-DOCUMENT>0001213900-20-008783.txt : 20200...,txt hdrsgml accession number conformed submiss...
4937,180 Life Sciences Corp.,0001690080,2021,<SEC-DOCUMENT>0001213900-21-036392.txt : 20210...,txt hdrsgml accession number conformed submiss...
4936,180 Life Sciences Corp.,0001690080,2022,<SEC-DOCUMENT>0001213900-22-016352.txt : 20220...,txt hdrsgml accession number conformed submiss...
...,...,...,...,...,...
3199,vTv Therapeutics Inc.,0001641489,2020,<SEC-DOCUMENT>0001564590-20-005627.txt : 20200...,0001564590-20-005627.txt : 20200221 0001564590...
3198,vTv Therapeutics Inc.,0001641489,2021,<SEC-DOCUMENT>0001564590-21-008149.txt : 20210...,0001564590-21-008149.txt : 20210224 0001564590...
3197,vTv Therapeutics Inc.,0001641489,2022,<SEC-DOCUMENT>0001564590-22-012258.txt : 20220...,0001564590-22-012258.txt : 20220329 0001564590...
3196,vTv Therapeutics Inc.,0001641489,2023,<SEC-DOCUMENT>0001641489-23-000008.txt : 20230...,0001641489-23-000008.txt : 20230306 0001641489...


In [3]:
# load pharma_10k_df
pharma_10k_df = pd.read_pickle('pharma_10k_dataset.pkl')

In [8]:
# Filter out the rows with None in the clean_text column
pharma_10k_df = pharma_10k_df[pharma_10k_df['clean_text'].notna()]

# Select 

Unnamed: 0,company_name,cik,year,10k_text,clean_text
4940,180 Life Sciences Corp.,0001690080,2018,<SEC-DOCUMENT>0001213900-18-003632.txt : 20180...,txt hdrsgml accession number conformed submiss...
4939,180 Life Sciences Corp.,0001690080,2019,<SEC-DOCUMENT>0001213900-19-005536.txt : 20190...,txt hdrsgml accession number conformed submiss...
4938,180 Life Sciences Corp.,0001690080,2020,<SEC-DOCUMENT>0001213900-20-008783.txt : 20200...,txt hdrsgml accession number conformed submiss...
4937,180 Life Sciences Corp.,0001690080,2021,<SEC-DOCUMENT>0001213900-21-036392.txt : 20210...,txt hdrsgml accession number conformed submiss...
4936,180 Life Sciences Corp.,0001690080,2022,<SEC-DOCUMENT>0001213900-22-016352.txt : 20220...,txt hdrsgml accession number conformed submiss...
...,...,...,...,...,...
3199,vTv Therapeutics Inc.,0001641489,2020,<SEC-DOCUMENT>0001564590-20-005627.txt : 20200...,0001564590-20-005627.txt : 20200221 0001564590...
3198,vTv Therapeutics Inc.,0001641489,2021,<SEC-DOCUMENT>0001564590-21-008149.txt : 20210...,0001564590-21-008149.txt : 20210224 0001564590...
3197,vTv Therapeutics Inc.,0001641489,2022,<SEC-DOCUMENT>0001564590-22-012258.txt : 20220...,0001564590-22-012258.txt : 20220329 0001564590...
3196,vTv Therapeutics Inc.,0001641489,2023,<SEC-DOCUMENT>0001641489-23-000008.txt : 20230...,0001641489-23-000008.txt : 20230306 0001641489...


In the following section, we will clean the company names and only keep those existing in our existing company list

In [18]:
# Get the list of all company names from pharma_10k_df and clean them 
# Get unique company names
company_names = pharma_10k_df['company_name'].unique()

# Clean company names by removing special characters and standardizing format, but keep the numbers
cleaned_company_names = []
for name in company_names:
    # Remove special characters, convert to lowercase
    cleaned_name = re.sub(r'[^a-zA-Z0-9\s]', '', name).lower().strip()
    # Remove common company suffixes
    cleaned_name = re.sub(r'\b(llc|inc|corp|corporation|limited|co|company|ltd)\b', '', cleaned_name).strip()
    cleaned_company_names.append(cleaned_name)

# Create a dictionary mapping original names to cleaned names
company_name_mapping = dict(zip(company_names, cleaned_company_names))
display(company_name_mapping)
display(len(company_name_mapping))



{'180 Life Sciences Corp.': '180 life sciences',
 '23andMe Holding Co.': '23andme holding',
 '2seventy bio, Inc.': '2seventy bio',
 '4D Molecular Therapeutics, Inc.': '4d molecular therapeutics',
 '4Front Ventures Corp.': '4front ventures',
 '60 DEGREES PHARMACEUTICALS, INC.': '60 degrees pharmaceuticals',
 '89bio, Inc.': '89bio',
 'ABBOTT LABORATORIES': 'abbott laboratories',
 'ABEONA THERAPEUTICS INC.': 'abeona therapeutics',
 'ABVC BIOPHARMA, INC.': 'abvc biopharma',
 'ACADIA PHARMACEUTICALS INC': 'acadia pharmaceuticals',
 'ACELYRIN, Inc.': 'acelyrin',
 'ACHIEVE LIFE SCIENCES, INC.': 'achieve life sciences',
 'ACURA PHARMACEUTICALS, INC': 'acura pharmaceuticals',
 'ADC Therapeutics SA': 'adc therapeutics sa',
 'ADIAL PHARMACEUTICALS, INC.': 'adial pharmaceuticals',
 'ADMA BIOLOGICS, INC.': 'adma biologics',
 'AEON Biopharma, Inc.': 'aeon biopharma',
 'AGENUS INC': 'agenus',
 'AGIOS PHARMACEUTICALS, INC.': 'agios pharmaceuticals',
 'AGRO CAPITAL MANAGEMENT CORP.': 'agro capital mana

742

In [None]:
# load the company list from file with path C:\D\work\Health Innovation Lab\sponsor_id_foundedyear_firsttrialyear_0122.csv
existing_company_list = pd.read_csv('C:/D/work/Health Innovation Lab/sponsor_id_foundedyear_firsttrialyear_0122.csv')
display(existing_company_list)

# Get the set of cleaned sponsor names
cleaned_sponsor_names = set(existing_company_list['sponsor_name'])

# Find companies that exist in both datasets
matching_companies = {original: cleaned for original, cleaned in company_name_mapping.items() 
                     if cleaned in cleaned_sponsor_names}

print(f"Found {len(matching_companies)} matching companies")
print("\nMatching companies:")
for original, cleaned in matching_companies.items():
    print(f"{original} -> {cleaned}")

# Filter pharma_10k_df to only include matching companies, which has 455 companies
pharma_10k_df = pharma_10k_df[pharma_10k_df['company_name'].isin(matching_companies.keys())] # 3480 rows
display(pharma_10k_df)




Unnamed: 0.1,Unnamed: 0,sponsor_name,sponsor_id,year_founded,first_trial_year,industry_dummy,no_trial_before_2010,first_p2trial_year,no_p2trial_before_2010,sponsor_name_eu,sponsor_name_aact
0,1,1 frauenklinik der lmu innenstadt,4,,2005,0.0,0,,,1 frauenklinik der lmu innenstadt,
1,2,101 therapeutics,8,,2022,1.0,1,2022.0,1.0,,101 therapeutics
2,3,10xbio,17,2015.0,2019,1.0,1,2019.0,1.0,,10xbio
3,4,1200 pharma,33,2017.0,2021,1.0,1,,,,1200 pharma
4,5,1globe biomedical,51,,2018,1.0,1,2023.0,1.0,,1globe biomedical
...,...,...,...,...,...,...,...,...,...,...,...
28747,28748,zynerba pharmaceuticals,87960,2007.0,2018,1.0,1,2020.0,1.0,,zynerba pharmaceuticals
28748,28749,zynex monitoring solutions,87152,,2011,1.0,1,,,,zynex monitoring solutions
28749,28750,zytoprotec gmbh,87158,2007.0,2014,1.0,1,2014.0,1.0,zytoprotec gmbh,
28750,28751,zyus life sciences,87129,,2010,1.0,1,2011.0,1.0,,zyus life sciences


Found 455 matching companies

Matching companies:
2seventy bio, Inc. -> 2seventy bio
4D Molecular Therapeutics, Inc. -> 4d molecular therapeutics
60 DEGREES PHARMACEUTICALS, INC. -> 60 degrees pharmaceuticals
89bio, Inc. -> 89bio
ABBOTT LABORATORIES -> abbott laboratories
ABEONA THERAPEUTICS INC. -> abeona therapeutics
ACADIA PHARMACEUTICALS INC -> acadia pharmaceuticals
ACELYRIN, Inc. -> acelyrin
ACHIEVE LIFE SCIENCES, INC. -> achieve life sciences
ACURA PHARMACEUTICALS, INC -> acura pharmaceuticals
ADC Therapeutics SA -> adc therapeutics sa
ADIAL PHARMACEUTICALS, INC. -> adial pharmaceuticals
ADMA BIOLOGICS, INC. -> adma biologics
AEON Biopharma, Inc. -> aeon biopharma
AGENUS INC -> agenus
AGIOS PHARMACEUTICALS, INC. -> agios pharmaceuticals
AIM ImmunoTech Inc. -> aim immunotech
ALNYLAM PHARMACEUTICALS, INC. -> alnylam pharmaceuticals
AMGEN INC -> amgen
AMICUS THERAPEUTICS, INC. -> amicus therapeutics
AN2 Therapeutics, Inc. -> an2 therapeutics
ANAPTYSBIO, INC -> anaptysbio
ANAVEX LIF

Unnamed: 0,company_name,cik,year,10k_text,clean_text
2489,"2seventy bio, Inc.",0001860782,2022,<SEC-DOCUMENT>0001860782-22-000005.txt : 20220...,0001860782-22-000005.txt : 20220322 0001860782...
2488,"2seventy bio, Inc.",0001860782,2023,<SEC-DOCUMENT>0001860782-23-000038.txt : 20230...,0001860782-23-000038.txt : 20230316 0001860782...
2487,"2seventy bio, Inc.",0001860782,2024,<SEC-DOCUMENT>0001860782-24-000027.txt : 20240...,0001860782-24-000027.txt : 20240307 0001860782...
2152,"4D Molecular Therapeutics, Inc.",0001650648,2021,<SEC-DOCUMENT>0001564590-21-015502.txt : 20210...,0001564590-21-015502.txt : 20210325 0001564590...
2151,"4D Molecular Therapeutics, Inc.",0001650648,2022,<SEC-DOCUMENT>0000950170-22-004794.txt : 20220...,0000950170-22-004794.txt : 20220328 0000950170...
...,...,...,...,...,...
3199,vTv Therapeutics Inc.,0001641489,2020,<SEC-DOCUMENT>0001564590-20-005627.txt : 20200...,0001564590-20-005627.txt : 20200221 0001564590...
3198,vTv Therapeutics Inc.,0001641489,2021,<SEC-DOCUMENT>0001564590-21-008149.txt : 20210...,0001564590-21-008149.txt : 20210224 0001564590...
3197,vTv Therapeutics Inc.,0001641489,2022,<SEC-DOCUMENT>0001564590-22-012258.txt : 20220...,0001564590-22-012258.txt : 20220329 0001564590...
3196,vTv Therapeutics Inc.,0001641489,2023,<SEC-DOCUMENT>0001641489-23-000008.txt : 20230...,0001641489-23-000008.txt : 20230306 0001641489...


In [26]:
pharma_10k_df.to_pickle('pharma_10k_dataset.pkl')

In [None]:
# import pharma_10k_df
pharma_10k_df = pd.read_pickle('pharma_10k_dataset.pkl')
display(pharma_10k_df)

#### Pick up here


Now, we want to clean the 10k text data of the pharma_10k_df

In [1]:
# Load stop word from nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt') 
stop_words = set(stopwords.words('english'))
stop_words = set(re.sub(r'[^a-zA-Z\s]', '', word) for word in stop_words) # remove non-alphabetic characters from stop words


# change the clean_text column by removing all non-alphabetic characters, turn them into lowercase, also remove stop words
pharma_10k_df['clean_text'] = pharma_10k_df['clean_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x).lower())
pharma_10k_df['clean_text'] = pharma_10k_df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 're' is not defined

Now, we perfrom the analysis

In [None]:
#pharma_10k_df.to_pickle('pharma_10k_dataset.pkl')
display(pharma_10k_df)



NameError: name 'pharma_10k_df' is not defined

In [2]:
# p

1234
