In [1]:
#!/usr/bin/env python3
import argparse
from pathlib import Path

import pandas as pd

def concat_csvs(folder: str) -> pd.DataFrame:
    """
    Concatenate every ``*.csv`` directly inside `folder` into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    is reproducible across runs and machines (``Path.glob`` itself yields
    entries in arbitrary, filesystem-dependent order). The columns
    ``html_content`` and ``html_content_cleaned`` are dropped from each file
    when present.

    Parameters
    ----------
    folder : str
        Directory to scan. Only its immediate children are matched; the
        search is not recursive.

    Returns
    -------
    pd.DataFrame
        Rows of all CSVs stacked with a fresh 0..n-1 index, or an empty
        DataFrame when no CSV file is found.
    """
    unwanted_columns = ["html_content", "html_content_cleaned"]

    # sorted() pins the read order; without it the concatenated row order
    # depends on how the filesystem happens to enumerate the directory.
    csv_paths = sorted(Path(folder).glob("*.csv"))

    frames = [
        # errors="ignore" lets files lacking these columns pass through as-is
        pd.read_csv(csv_path).drop(columns=unwanted_columns, errors="ignore")
        for csv_path in csv_paths
    ]

    if not frames:
        return pd.DataFrame()  # pd.concat raises on an empty list
    return pd.concat(frames, ignore_index=True)

# Load all CSVs found under intermediate-outputs/ into a single frame.
df = concat_csvs("intermediate-outputs")

In [2]:
# Display the combined frame as this cell's output (the rendering below is abbreviated with "..." rows).
df

Unnamed: 0,file_name,url,Has_Coffee_Service,Has_Coffee_Service_Explanation,Has_Vending_Machine_Service,Has_Vending_Machine_Service_Explanation,Has_Micro_Market_Service,Has_Micro_Market_Service_Explanation,Decision_Maker_Name,Decision_Maker_Title,Decision_Maker_Phone,Decision_Maker_Email_Address,Business_Name,Business_Location,Business_Summary,Canteen_Value_Proposition
0,American-Cold-Storage-6DRYQTNL2M.html,https://www.brewersassociation.org/member-news...,undetermined,The scraped text does not contain information ...,undetermined,The scraped text does not contain information ...,undetermined,The scraped text does not contain information ...,Patrick Smith,Regional Sales Manager,(877) 224-8674,undetermined,"Polar King International, Inc.","Fort Wayne, IN","Polar King International, Inc. is an industry ...",Canteen can offer Polar King International sol...
1,American-Cold-Storage-6HBTKIMMSO.html,https://www.defensemwr.com/columbus/food-servi...,True,"The scraped text mentions 'Cappuccino, Latte, ...",True,The text explicitly mentions that all food ser...,True,The scraped text mentions multiple micro marke...,undetermined,undetermined,undetermined,undetermined,Defense Supply Center Columbus,"Whitehall, OH",The Defense Supply Center Columbus (DSCC) is a...,Canteen can provide a comprehensive refreshmen...
2,American-Cold-Storage-7C54A33PL5.html,https://www.gcca.org/resource/2023-gcca-north-...,undetermined,The scraped text does not provide any informat...,undetermined,The scraped text does not provide any informat...,undetermined,The scraped text does not provide any informat...,undetermined,undetermined,undetermined,undetermined,Global Cold Chain Alliance (GCCA),"Arlington, Virginia, USA",The Global Cold Chain Alliance (GCCA) is an or...,GCCA members require solutions to maintain pro...
3,American-Cold-Storage-7SGA62TLNP.html,https://www.verticalcold.com/,undetermined,The scraped text does not mention any informat...,undetermined,The scraped text does not mention any informat...,undetermined,The scraped text does not mention any informat...,Jim Henderson,Chief Commercial Officer,undetermined,undetermined,Vertical Cold Storage,"Kansas City, MO",Vertical Cold Storage is a cold storage compan...,Canteen can provide Vertical Cold Storage with...
4,American-Cold-Storage-AJ35OZCFEU.html,https://www.vendingtimes.com/blogs/revolutioni...,undetermined,The article discusses micro markets and unatte...,undetermined,The article discusses micro markets and unatte...,True,The article extensively discusses micro market...,Randy Skyba,Vice President of Sales and Marketing,undetermined,undetermined,Due North,undetermined,"The article discusses Due North, a company tha...",Canteen can leverage its expertise in unattend...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,North-Plaza-VJHFDJUNUG.html,https://www.livewellsd.org/i-want-to/get-invol...,undetermined,The scraped text does not contain any informat...,undetermined,The scraped text does not contain any informat...,undetermined,The scraped text does not contain any informat...,undetermined,undetermined,undetermined,HHSA.Communities@sdcounty.ca.gov,Live Well San Diego - North Central Region,"5469 Kearny Villa Road, San Diego, CA 92123",The Live Well San Diego - North Central Region...,Canteen can partner with Live Well San Diego -...
58,North-Plaza-VRNXJDFS2Q.html,https://www.youngerpartners.com/property-type/...,undetermined,The scraped text does not contain any specific...,undetermined,The scraped text does not contain any specific...,undetermined,The scraped text does not contain any specific...,undetermined,undetermined,undetermined,undetermined,Younger Partners,"14643 Dallas Pkwy., Ste. 950, LB#58 Dallas, TX...",Younger Partners is a commercial real estate f...,Canteen can provide a tailored refreshment pro...
59,North-Plaza-YAJZ7AFNDS.html,https://www.nashvillegi.com/,undetermined,The scraped text '403 Unauthorized' does not p...,undetermined,The scraped text '403 Unauthorized' does not p...,undetermined,The scraped text '403 Unauthorized' does not p...,undetermined,undetermined,undetermined,undetermined,Unknown,Unknown,The provided text indicates an unauthorized ac...,Due to the lack of information about the busin...
60,North-Plaza-YG6YQ4BTMY.html,https://www.freshfarm.org/markets/downtown-sil...,False,The text mentions Zeke's Coffee as a vendor at...,False,There is no mention of vending machines in the...,False,"The text describes a farmers market, not a mic...",undetermined,undetermined,undetermined,undetermined,FRESHFARM,"Washington, DC",FRESHFARM is a non-profit organization that op...,"FRESHFARM, as an organization focused on local..."
