In [4]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# NOTE: `auth` is imported from `google.colab`, so this cell depends on a Colab runtime.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials rather than running its own auth flow
# (presumably these are the credentials established by authenticate_user() — confirm).
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the Drive client object; the helper functions defined later take it as a parameter.
drive = GoogleDrive(gauth)

In [5]:
# Column names used for the business-activity CSV data
csv_columns = [
    "Business Activity Description",
    "Business Activity Vendor",
    "Business Activity Cost USD",
    "Business Activity Comment",
    "2017 NAICS Title",
]

# Project folder: filesystem path (looks like a Colab Drive mount path — confirm)
# and the Google Drive folder ID used for uploads
file_folder = '/content/drive/My Drive/224_project/'
folder_id = "1RJDHqFP2jlMrWUsWP0KmsDwW8hPCzx_6"

# CSV file names: complete dataset, training split, and test split
total_file_name = 'sustainability_business_activities.csv'
train_file_name = 'sustainability_business_activities_training.csv'
test_file_name = 'sustainability_business_activities_test.csv'


In [None]:
import pandas as pd
from io import StringIO

def _escape_drive_query_value(value):
    """Escape a value for embedding inside a single-quoted Drive query literal."""
    # Escape backslashes first so the backslash added in front of quotes is not doubled.
    return str(value).replace("\\", "\\\\").replace("'", "\\'")


def find_file_id_by_name(drive, folder_id, file_name):
    """Search for a file by name in the specified Google Drive folder.

    Args:
        drive: Drive client exposing ``ListFile({'q': ...}).GetList()``.
        folder_id: ID of the parent folder to search in.
        file_name: Title of the file to look for. Single quotes and backslashes
            are escaped so names such as ``"Valentine's.csv"`` yield a valid query.

    Returns:
        The ``'id'`` of the first matching, non-trashed file, or ``None`` when
        nothing matches. If several files share the title, only the first one
        returned by the listing is reported.
    """
    query = (
        f"'{_escape_drive_query_value(folder_id)}' in parents"
        " and trashed=false"
        f" and title='{_escape_drive_query_value(file_name)}'"
    )
    matches = drive.ListFile({'q': query}).GetList()
    return matches[0]['id'] if matches else None

# Assumes `gauth` and `drive` have already been initialized (see the authentication cell).
# The DataFrames to upload (e.g. train_dataset and test_dataset) are passed in as `df`.

def upload_dataframe(drive, folder_id, df, file_name):
    """Upload a DataFrame as a CSV file to a Google Drive folder.

    If a file titled ``file_name`` already exists in the folder (as reported by
    ``find_file_id_by_name``), its content is overwritten; otherwise a new file
    is created under ``folder_id``. A one-line status message is printed.

    Args:
        drive: Drive client exposing ``CreateFile(metadata)``.
        folder_id: ID of the destination Drive folder.
        df: Object with a ``to_csv(index=False)`` method (a pandas DataFrame).
        file_name: Title of the file on Drive.

    Returns:
        None.
    """
    # Serialize without the index column
    csv_string = df.to_csv(index=False)

    # Decide between overwriting an existing file and creating a new one;
    # only the metadata and the status word differ between the two cases.
    file_id = find_file_id_by_name(drive, folder_id, file_name)
    if file_id:
        metadata = {'id': file_id}
        action = "Updated"
    else:
        metadata = {'title': file_name, 'parents': [{'id': folder_id}]}
        action = "Created"

    drive_file = drive.CreateFile(metadata)
    drive_file.SetContentString(csv_string)
    drive_file.Upload()
    print(f"{action}: {file_name}")

In [None]:
# Define NAICS Titles and their associated business activities, vendors, and comments
naics_info = {
    "Support Activities for Metal Mining": {
        "activities": {
            "Drilling services for zinc exploration": {
                "vendors": ["ZincDrill Corp", "MetalExplorer Inc.", "DrillZinc Solutions", "ZincQuest Services", "ExploreZinc Co."],
                "comments": [
                    "Precision drilling for accurate zinc location.",
                    "Advanced geological analysis ensured.",
                    "Minimized environmental footprint during exploration."
                ]
            },
            "Environmental impact assessment for mining": {
                "vendors": ["EcoImpact Assessors", "GreenMine Consultants", "SustainAssess Ltd.", "EcoMine Services", "EnviroImpact Analytics"],
                "comments": [
                    "Comprehensive ecosystem evaluation conducted.",
                    "Strategies for water conservation implemented.",
                    "Biodiversity preservation plans developed."
                ]
            },
            "Ore transportation logistics": {
                "vendors": ["HeavyLoad Transport", "OreMovers Inc.", "MetalFreight Co.", "RockShip Logistics", "MineCart Express"],
                "comments": [
                    "Efficient ore transport solutions.",
                    "State-of-the-art logistics management.",
                    "Eco-friendly transportation methods."
                ]
            },
            "Mine site reclamation services": {
                "vendors": ["EcoReclaim Ltd.", "LandRestore Co.", "ReGreening Services", "TerraHeal Solutions", "NatureMend Tech"],
                "comments": [
                    "Restoration of mining sites to natural state.",
                    "Promoting biodiversity in reclaimed areas.",
                    "Innovative land rehabilitation techniques."
                ]
            },
            "Mineral processing consultancy": {
                "vendors": ["MineralTech Advisors", "OreRefine Partners", "ProcessMax Consultants", "ExtractWell Analytics", "BeneficiatePro Services"],
                "comments": [
                    "Optimization of mineral processing operations.",
                    "Implementation of cost-saving techniques.",
                    "Maximizing ore recovery rates."
                ]
            },
            "Mining equipment maintenance": {
                "vendors": ["MineGear Service Co.", "DigTech Repair Solutions", "EquipFix Services", "HeavyDuty Maintenance", "DrillAndGrind Engineers"],
                "comments": [
                    "Ensuring operational efficiency and safety.",
                    "Rapid response for equipment repairs.",
                    "Comprehensive maintenance plans."
                ]
            },
            "Safety training for mine workers": {
                "vendors": ["SafeMine Training Academy", "MinerGuard Education Services", "SafetyFirst Workshops", "ProtectWell Training Programs", "HazardAware Courses"],
                "comments": [
                    "Empowering workers with safety knowledge.",
                    "Compliance with mining safety regulations.",
                    "Advanced risk management training."
                ]
            },
            "Environmental monitoring services": {
                "vendors": ["EcoWatch Services", "EnviroSense Monitoring", "GreenScan Analytics", "BioTrack Environmental", "NatureGuard Surveillance"],
                "comments": [
                    "Continuous environmental impact assessment.",
                    "Utilizing advanced monitoring technologies.",
                    "Data-driven environmental protection strategies."
                ]
            },
            "Heavy machinery leasing": {
                "vendors": ["MegaMach Leasing Co.", "HeavyLift Rental Services", "IronHorse Machinery", "PowerPlant Equipment Leasing", "ToughTrack Leases"],
                "comments": [
                    "Providing flexible leasing options.",
                    "Access to latest mining machinery models.",
                    "Cost-effective solutions for equipment needs."
                ]
            },
            "Rock blasting and removal": {
                "vendors": ["BlastAway Co.", "RockClear Services", "ExploTech Blasting", "RubbleManage Inc.", "DetonateRock Solutions"],
                "comments": [
                    "Precision blasting for site preparation.",
                    "Safe removal of blasted rock material.",
                    "Minimizing vibrations and environmental impact."
                ]
            }
        }
    },
    "Sewage Treatment Facilities": {
        "activities": {
            "Wastewater treatment process optimization": {
                "vendors": ["WaterClean Solutions", "PureFlow Tech", "EcoWater Systems", "AquaPurify Innovations", "CleanStream Technologies"],
                "comments": [
                    "Enhanced filtration techniques applied.",
                    "Energy-efficient treatment methods introduced.",
                    "Significant reduction in chemical usage achieved."
                ]
            },
            "Sludge dewatering and disposal": {
                "vendors": ["SludgeAway Technologies", "DryMatter Solutions", "EcoSludge Services", "Solidify Systems", "WasteNo Sludge Co."],
                "comments": [
                    "Advanced centrifugation for moisture reduction.",
                    "Eco-friendly disposal methods utilized.",
                    "Conversion of sludge to renewable energy explored."
                ]
            },
            "Chemical treatment of wastewater": {
                "vendors": ["ChemClean Water Solutions", "AquaChem Tech", "PureTreat Chemicals", "WasteNeutralize Enterprises", "EcoChem Services"],
                "comments": [
                    "Advanced chemical treatments for purification.",
                    "Reduction of harmful pathogens in effluent.",
                    "Eco-friendly chemical usage policies."
                ]
            },
            "Maintenance of treatment plant infrastructure": {
                "vendors": ["AquaMaintain Ltd.", "PlantCare Engineering", "FacilityFix Services", "WaterWorks Maintenance", "SewageSustain Co."],
                "comments": [
                    "Regular inspection and maintenance of facilities.",
                    "Preventative maintenance to avoid downtime.",
                    "Upgrading infrastructure for efficiency improvements."
                ]
            },
            "Odor control in sewage facilities": {
                "vendors": ["ScentGuard Technologies", "OdorShield Solutions", "FreshAir Environmental", "SmellBusters Co.", "AromaControl Systems"],
                "comments": [
                    "Implementing natural odor control measures.",
                    "Advanced filtration systems to reduce smells.",
                    "Improving community relations with odor management."
                ]
            },
            "Grease and fat removal services": {
                "vendors": ["GreaseGone Solutions", "FatFree Services", "ClearGrease Co.", "LipidLifters Inc.", "EcoFatClean Tech"],
                "comments": [
                    "Preventing blockages in sewage systems.",
                    "Eco-friendly disposal of grease and fats.",
                    "Maintaining cleanliness and flow efficiency."
                ]
            },
            "Renewable energy generation from sewage": {
                "vendors": ["BioEnergy Water Solutions", "RenewFlow Tech", "AquaPower Renewable", "WasteToWatt Co.", "EcoVolt Energy"],
                "comments": [
                    "Harnessing biogas from sewage treatment.",
                    "Contributing to sustainability goals.",
                    "Innovative energy recovery techniques."
                ]
            },
            "Sewage system digital monitoring": {
                "vendors": ["FlowSense Monitoring", "AquaTrack Technologies", "DigitalWater Watch", "EcoMonitor Solutions", "Waterwise Surveillance"],
                "comments": [
                    "Real-time data on sewage treatment operations.",
                    "Predictive maintenance with digital tools.",
                    "Enhancing system efficiency through technology."
                ]
            },
            "Public health impact assessments": {
                "vendors": ["HealthWater Consultancy", "PublicSafe Water Services", "EcoHealth Assessments", "WaterWell Public Health", "AquaHealth Analytics"],
                "comments": [
                    "Assessing the impact of sewage treatment on public health.",
                    "Strategies for mitigating health risks.",
                    "Collaboration with public health authorities."
                ]
            },
            "Stormwater runoff management": {
                "vendors": ["StormSafe Solutions", "RunoffGuard Co.", "RainFlow Management", "EcoStorm Services", "WaterRunoff Solutions"],
                "comments": [
                    "Innovative solutions for stormwater treatment.",
                    "Reducing pollution from runoff.",
                    "Improving water quality in local water bodies."
                ]
            }
        }
    },
    "New Single-Family Housing Construction (except For-Sale Builders)": {
        "activities": {
            "Architectural design for family homes": {
                "vendors": ["DesignBuild Architects", "EcoHomes Design Co.", "FutureNest Architecture", "GreenBlueprints Studio", "HarmonyDwell Designs"],
                "comments": [
                    "Custom designs to fit client lifestyles.",
                    "Incorporation of green building practices.",
                    "Focus on energy efficiency and sustainability."
                ]
            },
            "Foundation and framing services": {
                "vendors": ["SolidBase Constructors", "FrameRight Solutions", "EcoFoundation Systems", "SecureFrame Builders", "GroundUp Framing Co."],
                "comments": [
                    "Utilizing durable and sustainable materials.",
                    "Advanced techniques for structural integrity.",
                    "Timely completion of foundational and framing stages."
                ]
            },
            "Plumbing installation and fixtures": {
                "vendors": ["FlowTech Plumbing", "AquaSafe Installations", "EcoWater Plumbing Co.", "PureFlow Systems", "Streamline Plumbing Solutions"],
                "comments": [
                    "High-quality fixtures to reduce water usage.",
                    "Innovative solutions for water heating and recycling.",
                    "Reliable installation ensuring long-term service."
                ]
            },
            "Electrical wiring and smart home integration": {
                "vendors": ["BrightFuture Electric", "SmartWatt Solutions", "EcoElectrics Co.", "PowerSafe Wiring", "InnovateHome Systems"],
                "comments": [
                    "Energy-efficient lighting and appliances.",
                    "Integration of smart home technologies.",
                    "Focus on safety and energy savings."
                ]
            },
            "Roofing and insulation services": {
                "vendors": ["EcoRoof Systems", "SecureCover Roofing", "WarmthWrap Insulation", "ShieldTop Roofs", "InsulateRight Services"],
                "comments": [
                    "Sustainable roofing materials for durability.",
                    "Advanced insulation for climate control.",
                    "Options for solar panel installation."
                ]
            },
            "Landscaping and outdoor living spaces": {
                "vendors": ["GreenEscape Landscaping", "OutdoorOasis Designs", "NatureNest Landscape Co.", "EcoYard Landscaping", "LivingSpaces Outdoors"],
                "comments": [
                    "Designs that complement natural surroundings.",
                    "Creating functional and beautiful outdoor spaces.",
                    "Incorporation of native plants and eco-friendly practices."
                ]
            },
            "Interior finishing and painting": {
                "vendors": ["InsideStyle Finishers", "ColorCraft Painters", "EcoInterior Coatings", "FinishLine Interiors", "BrightWalls Solutions"],
                "comments": [
                    "Use of low-VOC paints for indoor air quality.",
                    "Attention to detail in finishing work.",
                    "Custom interior designs to match homeowner preferences."
                ]
            },
            "Flooring installation and treatments": {
                "vendors": ["EcoFloors Installations", "GroundArt Flooring", "NatureWalk Surfaces", "StepWell Flooring Co.", "DuraFloor Solutions"],
                "comments": [
                    "Eco-friendly flooring options available.",
                    "Durable treatments for high-traffic areas.",
                    "Wide range of materials and finishes."
                ]
            },
            "Window and door installation": {
                "vendors": ["ClearView Windows", "SecureEntrance Doors", "EcoPane Solutions", "BrightAccess Installations", "OpenWay Doors & Windows"],
                "comments": [
                    "Energy-efficient windows for thermal control.",
                    "Secure and durable doors for safety.",
                    "Custom designs to enhance aesthetic appeal."
                ]
            },
            "Heating, Ventilation, and Air Conditioning (HVAC) systems": {
                "vendors": ["ClimateControl HVAC Co.", "EcoVent Systems", "PureAir Solutions", "ComfortZone HVAC", "ThermalTech Installations"],
                "comments": [
                    "High-efficiency systems for energy savings.",
                    "Advanced air filtration for health and comfort.",
                    "Smart systems for climate control and monitoring."
                ]
            }
        }
    },
    "Timber Tract Operations": {
        "activities": {
            "Tree planting and reforestation": {
                "vendors": ["GreenSeed Reforestation", "EcoGrow Forestry", "TreeRevive Operations", "ForestRenew Planters", "SaplingStart Co."],
                "comments": [
                    "Ensuring sustainability of timber resources.",
                    "Specializing in native species to promote biodiversity.",
                    "Utilizing GPS mapping for optimal planting strategies."
                ]
            },
            "Timber harvesting operations": {
                "vendors": ["LogCutters Inc.", "TimberYield Harvesting", "WoodFell Solutions", "ForestExtract Co.", "ClearCut Logging"],
                "comments": [
                    "Precision cutting to minimize waste.",
                    "Adhering to sustainable harvest practices.",
                    "Implementing low-impact logging techniques."
                ]
            },
            "Forest management and conservation": {
                "vendors": ["EcoForest Management", "GreenCanopy Consultants", "BioDiverse Forestry", "ConservForest Services", "SustainWood Group"],
                "comments": [
                    "Balancing timber production with ecosystem health.",
                    "Developing long-term forest management plans.",
                    "Monitoring health and growth of forest tracts."
                ]
            },
            "Pest and disease control": {
                "vendors": ["PestAway TimberCare", "TreeGuardians Co.", "ForestShield Services", "EcoPest Forestry", "TimberHealth Solutions"],
                "comments": [
                    "Employing environmentally friendly pest control methods.",
                    "Regular monitoring for early disease detection.",
                    "Implementing integrated pest management (IPM) strategies."
                ]
            },
            "Timber valuation and appraisal": {
                "vendors": ["WoodWorth Appraisals", "TimberValue Consultants", "EstateLogs Valuation", "TreeEquity Services", "LumberGrade Assessors"],
                "comments": [
                    "Accurate valuation for sales and acquisitions.",
                    "Expert appraisals based on market trends.",
                    "Providing comprehensive forestry investment analysis."
                ]
            },
            "Road construction and maintenance for logging": {
                "vendors": ["LogPath Engineers", "ForestRoads Construction", "TimberTrail Co.", "EcoAccess Routes", "WoodWay Builders"],
                "comments": [
                    "Building access roads with minimal environmental impact.",
                    "Maintaining roads for safety and sustainability.",
                    "Utilizing erosion control measures in road design."
                ]
            },
            "Timber marketing and sales": {
                "vendors": ["WoodMarket Solutions", "LogSales Network", "TimberTrade Co.", "ForestProducts Exchange", "EcoTimber Sales"],
                "comments": [
                    "Connecting sellers with global markets.",
                    "Promoting sustainable timber products.",
                    "Implementing traceability for timber origin."
                ]
            },
            "Wildlife habitat management": {
                "vendors": ["WildHabitat Forestry", "EcoFauna Management", "BioHaven Wildlife Services", "NatureBalance Co.", "HabitatHarmony Solutions"],
                "comments": [
                    "Integrating wildlife conservation into timber operations.",
                    "Creating buffer zones to protect sensitive species.",
                    "Monitoring biodiversity as a key management metric."
                ]
            },
            "Water resource management on timber lands": {
                "vendors": ["AquaForest Management", "StreamGuard Services", "EcoWatershed Co.", "TimberSpring WaterCare", "ForestRivers Solutions"],
                "comments": [
                    "Protecting water quality through best management practices.",
                    "Restoring stream and river habitats affected by logging.",
                    "Implementing rainwater harvesting for drought resilience."
                ]
            },
            "Fire prevention and control measures": {
                "vendors": ["FlameWard Forestry", "FireBreak Solutions", "EcoFireGuard Co.", "BlazeControl Services", "WildfirePrevent Consultants"],
                "comments": [
                    "Strategic creation of firebreaks and buffer zones.",
                    "Utilizing controlled burns for underbrush management.",
                    "Deploying early detection systems for rapid response."
                ]
            }
        }
    },
    "Uranium-Radium-Vanadium Ore Mining": {
        "activities": {
            "Uranium ore extraction": {
                "vendors": ["UraniumExtract Ltd.", "EcoUranium Solutions", "PureUranium Co.", "RadMine Operations", "VanadoUranium Group"],
                "comments": [
                    "High-grade uranium extraction with minimal environmental impact.",
                    "Implementing advanced safety measures for workers.",
                    "Utilizing innovative technologies for efficient ore processing."
                ]
            },
            "Radium isolation processes": {
                "vendors": ["RadiumTech Innovations", "EcoRad Solutions", "RadPurity Extractors", "IsolateRadium Co.", "RadiantRadium Services"],
                "comments": [
                    "Specialized techniques for radium isolation from ore.",
                    "Ensuring environmental safety in radium processing.",
                    "Conducting thorough radiation safety assessments."
                ]
            },
            "Vanadium mining and processing": {
                "vendors": ["VanadiumValley Co.", "EcoVan Mining", "VanaTech Processing", "PureVan Extracts", "InnoVanadium Solutions"],
                "comments": [
                    "Extracting vanadium with high purity levels.",
                    "Focusing on sustainable mining practices.",
                    "Applying state-of-the-art processing technology."
                ]
            },
            "Ore crushing and milling": {
                "vendors": ["CrushMill Ore Services", "EcoCrush Technologies", "MegaMillers Ltd.", "OreGrind Solutions", "FineCrush Co."],
                "comments": [
                    "Efficient ore crushing for optimal extraction.",
                    "Reducing energy consumption in milling operations.",
                    "Innovating in dust control techniques during processing."
                ]
            },
            "Leaching process for uranium extraction": {
                "vendors": ["LeachPro Uranium Services", "EcoLeach Solutions", "UraLeach Tech", "PureExtract Leaching", "LeachWell Uranium"],
                "comments": [
                    "Maximizing uranium yield through advanced leaching.",
                    "Minimizing environmental footprint of leaching operations.",
                    "Enhancing safety protocols in chemical leaching processes."
                ]
            },
            "Radiation safety and environmental monitoring": {
                "vendors": ["RadSafe Monitoring Co.", "EnviroRad Services", "SafeRad Technologies", "EcoMonitor Radiation", "RadGuard Analytics"],
                "comments": [
                    "Comprehensive radiation safety measures in place.",
                    "Continuous environmental monitoring for public safety.",
                    "Implementing best practices in radiation control."
                ]
            },
            "Tailings management and disposal": {
                "vendors": ["TailingsCare Solutions", "EcoTails Management", "SafeDisposal Systems", "TailingsTech Co.", "EnviroTailings Services"],
                "comments": [
                    "Innovative tailings management for sustainability.",
                    "Implementing secure disposal methods.",
                    "Focus on reducing tailings pond footprint."
                ]
            },
            "Water treatment in mining operations": {
                "vendors": ["AquaMine Solutions", "CleanWater Mining Co.", "EcoWater Treatment", "WaterPure Systems", "MineWaterTech"],
                "comments": [
                    "Advanced water treatment for mining effluents.",
                    "Ensuring water quality exceeds environmental standards.",
                    "Recycling and reusing water within mining operations."
                ]
            },
            "Transport and logistics for mined ore": {
                "vendors": ["OreTrans Logistics", "EcoTrans Mining Co.", "SafeCargo Solutions", "MineMovers Transport", "RadVan Logistics"],
                "comments": [
                    "Efficient transport solutions for uranium ore.",
                    "Implementing eco-friendly logistics practices.",
                    "Enhanced safety measures for radioactive material transport."
                ]
            },
            "Mine reclamation and ecosystem restoration": {
                "vendors": ["ReclaimEarth Services", "EcoRestore Mining", "GreenMine Reclamation", "RestoreLand Co.", "MineHeal Environmental"],
                "comments": [
                    "Restoring mined lands to their natural state.",
                    "Promoting biodiversity in mine reclamation projects.",
                    "Utilizing native plants in ecosystem restoration."
                ]
            }
        }
    },
    "Automobile Driving Schools": {
        "activities": {
            "Beginner driving lessons": {
                "vendors": ["StartRight Driving School", "FirstGear Lessons", "NewDriver Academy", "RoadBasics School", "DriveStart Educators"],
                "comments": [
                    "Tailored to new drivers for a solid foundation.",
                    "Emphasizes safety and road awareness from day one.",
                    "Instructors certified with a focus on patience and clarity."
                ]
            },
            "Advanced driving techniques": {
                "vendors": ["ProDrive Techniques", "AdvancedWheel School", "SkillShift Academy", "EliteDrivers Course", "PrecisionPilots Lessons"],
                "comments": [
                    "Covers defensive driving, evasive maneuvers, and more.",
                    "Designed for experienced drivers to enhance skills.",
                    "Utilizes simulation and on-road training for mastery."
                ]
            },
            "Defensive driving courses": {
                "vendors": ["SafeGuard Driving", "DefendDrive School", "ShieldOn Driving Academy", "FortressWheel Training", "DefensiveRoads School"],
                "comments": [
                    "Focuses on anticipating and avoiding road hazards.",
                    "Incorporates latest safety techniques and technologies.",
                    "Reduces risk of accidents through proactive training."
                ]
            },
            "Driving test preparation": {
                "vendors": ["TestReady Driving School", "PassFirst Go", "LicenseQuest Prep", "DriveTest Success Academy", "ExamGear Drivers"],
                "comments": [
                    "Comprehensive review and mock tests for confidence.",
                    "Highlights common test pitfalls and how to avoid them.",
                    "Personalized coaching based on student's weaknesses."
                ]
            },
            "Eco-driving lessons": {
                "vendors": ["EcoWheel Education", "GreenDrive School", "SustainaRide Lessons", "EcoPilot Academy", "CleanDrive Instructors"],
                "comments": [
                    "Teaches fuel-efficient driving habits.",
                    "Contributes to environmental protection and cost savings.",
                    "Incorporates hybrid and electric vehicle training."
                ]
            },
            "Vehicle maintenance workshops": {
                "vendors": ["AutoUpkeep Workshop", "MaintainMasters Class", "CarCare Clinic", "VehicleVitals Lessons", "TuneUp Teach"],
                "comments": [
                    "Hands-on learning for basic vehicle upkeep and emergency repairs.",
                    "Empowers drivers with knowledge to minimize maintenance costs.",
                    "Courses tailored for different vehicle types and models."
                ]
            },
            "Motorcycle driving lessons": {
                "vendors": ["TwoWheels Training Academy", "MotoLearn School", "BikeBasics Instructors", "CycleSafe Lessons", "RiderCraft Education"],
                "comments": [
                    "Specialized curriculum for aspiring motorcyclists.",
                    "Emphasizes balance, control, and road safety.",
                    "Includes gear selection and maintenance advice."
                ]
            },
            "Commercial driving license (CDL) training": {
                "vendors": ["CDLPro Academy", "HeavyHaulers School", "TruckMasters Training", "BigRig Educators", "CommercialDrivers Course"],
                "comments": [
                    "Prepares students for CDL exams with intensive training.",
                    "Covers commercial vehicle laws, log keeping, and cargo safety.",
                    "Includes behind-the-wheel practice with various truck types."
                ]
            },
            "Teen driving safety programs": {
                "vendors": ["YouthDrive Initiative", "TeenWheelers School", "SafeStart Young Drivers", "NextGen Drivers Academy", "GuardianRoad Teens"],
                "comments": [
                    "Designed to instill responsible driving habits early.",
                    "Interactive sessions with emphasis on peer influence.",
                    "Partners with schools and communities for outreach."
                ]
            },
            "Driver rehabilitation and retraining": {
                "vendors": ["DriveAgain Center", "WheelRecovery Services", "BackOnRoad Academy", "ReDrive Clinic", "SteerClear Program"],
                "comments": [
                    "Support for drivers recovering from injuries or disabilities.",
                    "Adapts teaching methods and vehicles to individual needs.",
                    "Focuses on restoring confidence and independence on the road."
                ]
            }
        }
    },
    "Offices of Dentists": {
        "activities": {
            "General dental check-ups": {
                "vendors": ["DentalCare Associates", "BrightSmile Clinics", "HealthyTeeth Dental", "OralWellness Providers", "SmileFirst Services"],
                "comments": [
                    "Emphasis on preventative care and oral hygiene.",
                    "Routine examinations to maintain dental health.",
                    "Utilizing the latest diagnostic technology for comprehensive check-ups."
                ]
            },
            "Cosmetic dentistry services": {
                "vendors": ["AestheticDental Arts", "CosmoSmile Dental Studio", "BrightenDent Aesthetics", "SmileDesign Experts", "GlamourSmiles Clinic"],
                "comments": [
                    "Specializing in smile makeovers and aesthetic improvements.",
                    "Offering a range of services from whitening to veneers.",
                    "Personalized cosmetic plans to enhance patients' smiles."
                ]
            },
            "Orthodontic treatments": {
                "vendors": ["AlignOrtho Care", "BraceBright Orthodontics", "StraightPath Dental", "HarmonyOrtho Solutions", "PerfectSmile Braces"],
                "comments": [
                    "Expertise in teeth alignment and bite correction.",
                    "Utilizing modern braces and clear aligner technologies.",
                    "Tailored orthodontic plans for all age groups."
                ]
            },
            "Pediatric dentistry": {
                "vendors": ["LittleSmiles Pediatric", "KidsDental Zone", "TinyTeeth Specialists", "YouthfulBites Dental", "ChildCare Dentistry"],
                "comments": [
                    "Dedicated care for children's dental health.",
                    "Creating a friendly and reassuring environment for young patients.",
                    "Focusing on preventive care and oral health education."
                ]
            },
            "Periodontal disease treatment": {
                "vendors": ["GumGuardians Clinic", "PerioProtect Services", "DeepClean Dental", "GumHealth Specialists", "BeneathTheGums Care"],
                "comments": [
                    "Advanced treatment options for gum disease.",
                    "Personalized care plans to halt disease progression.",
                    "Emphasis on minimally invasive procedures for gum health."
                ]
            },
            "Dental implant services": {
                "vendors": ["ImplantInnovations Dental", "AnchorDent Implants", "FoundationTeeth Solutions", "PermanentSmiles Clinic", "ToothRoot Systems"],
                "comments": [
                    "State-of-the-art dental implant solutions.",
                    "Restoring functionality with a natural look and feel.",
                    "Comprehensive care from consultation to post-operative follow-up."
                ]
            },
            "Emergency dental services": {
                "vendors": ["RapidRelief Dental Care", "EmergencyDent Assist", "24Hour Dental Clinic", "UrgentCare ToothSavers", "ImmediateHelp Dentists"],
                "comments": [
                    "Prompt care for dental emergencies and injuries.",
                    "Available around the clock for urgent dental needs.",
                    "Equipped to handle a wide range of dental emergencies."
                ]
            },
            "Dental hygiene services": {
                "vendors": ["CleanBite Hygiene", "FreshMouth Dental Care", "HygieneMasters Clinic", "PlaqueFighters Services", "BrightGleam Cleanings"],
                "comments": [
                    "Professional cleaning to prevent cavities and gum disease.",
                    "Educating patients on effective oral hygiene practices.",
                    "Utilizing gentle techniques for a comfortable experience."
                ]
            },
            "Endodontic (root canal) therapy": {
                "vendors": ["RootCare Specialists", "EndoHeal Dental", "PainFreeRoots Clinic", "CanalCure Endodontics", "InsideTooth Care"],
                "comments": [
                    "Expert care for root canal therapy and tooth pain relief.",
                    "Utilizing advanced techniques for successful outcomes.",
                    "Focus on patient comfort and preserving natural teeth."
                ]
            },
            "Dental X-ray and imaging services": {
                "vendors": ["ImageDent Diagnostics", "RadiantSmile Imaging", "XrayVision Dental", "OralScan Services", "DentoGraphix Clinic"],
                "comments": [
                    "High-definition imaging for accurate diagnostics.",
                    "Minimally invasive X-ray techniques for patient safety.",
                    "Comprehensive imaging services to support dental treatments."
                ]
            }
        }
    },
    "Kidney Dialysis Centers": {
        "activities": {
            "Hemodialysis treatment": {
                "vendors": ["CleanFilter Dialysis", "HemoCare Services", "PureBlood Solutions", "LifeStream Dialysis", "RenalTech Centers"],
                "comments": [
                    "State-of-the-art hemodialysis equipment for efficient blood filtering.",
                    "Personalized care plans tailored to individual patient needs.",
                    "Continuous monitoring and adjustment for optimal treatment outcomes."
                ]
            },
            "Peritoneal dialysis support": {
                "vendors": ["HomeDialysis Plus", "PeriCare Solutions", "InnerCleanse Therapy", "GentleDialysis Supplies", "StayPure Systems"],
                "comments": [
                    "Comprehensive support for at-home peritoneal dialysis patients.",
                    "Training and ongoing assistance for patients and families.",
                    "Supply of high-quality dialysis fluids and equipment."
                ]
            },
            "Kidney health education": {
                "vendors": ["RenalEd Partners", "KidneyWise Academy", "HealthStream Education", "NephroKnow Institute", "RenalAware Program"],
                "comments": [
                    "Empowering patients with knowledge about kidney health maintenance.",
                    "Workshops on diet, lifestyle, and disease management for renal patients.",
                    "Collaborations with healthcare professionals for comprehensive education."
                ]
            },
            "Dialysis access management": {
                "vendors": ["AccessPoint Care", "VascularAccess Solutions", "FlowGuard Management", "DialyAccess Clinics", "VeinCare Dialysis Services"],
                "comments": [
                    "Expert care in creating and maintaining dialysis access sites.",
                    "Minimally invasive procedures to maximize treatment efficacy.",
                    "Regular assessments to ensure access site health and function."
                ]
            },
            "Transplant coordination services": {
                "vendors": ["TransplantLink Coordination", "NewLife Transplant Services", "MatchOrgan Network", "HopeTransplant Connect", "LifeGift Transplant Support"],
                "comments": [
                    "Guidance through the kidney transplant process.",
                    "Support for patients on the transplant waiting list.",
                    "Collaboration with transplant centers for seamless patient care."
                ]
            },
            "Nutritional counseling for dialysis patients": {
                "vendors": ["NutriRenal Advisors", "DietWellness Renal", "KidneyNutri Care", "BalancedBites Counseling", "RenalDiet Solutions"],
                "comments": [
                    "Tailored dietary plans to support kidney health and dialysis treatment.",
                    "Professional advice on managing fluid and mineral intake.",
                    "Regular follow-ups to adjust dietary plans as needed."
                ]
            },
            "Psychological support services": {
                "vendors": ["MindRenal Support", "DialyzeMind Wellness", "RenalSpirit Counseling", "PsycheRenal Health", "EmbraceWellness Therapy"],
                "comments": [
                    "Emotional and psychological support tailored for renal patients.",
                    "Counseling services to help cope with the challenges of chronic kidney disease.",
                    "Group therapy sessions and peer support networks."
                ]
            },
            "Anemia management in dialysis": {
                "vendors": ["HemoBoost Therapies", "IronFlow Treatments", "AnemiaCare Dialysis Support", "BloodHealth Solutions", "VitalHem Solutions"],
                "comments": [
                    "Comprehensive anemia management protocols for dialysis patients.",
                    "Monitoring and treatment with iron supplements and EPO therapy.",
                    "Individualized care plans to address the root causes of anemia."
                ]
            },
            "Fluid management technology": {
                "vendors": ["HydraBalance Tech", "FluidWise Systems", "AquaControl Dialysis", "FlowRegulate Innovations", "Liquidus Management Devices"],
                "comments": [
                    "Advanced technologies for precise fluid removal during dialysis.",
                    "Customizable treatment settings to meet patient-specific needs.",
                    "Continuous innovation in fluid management for enhanced patient safety."
                ]
            },
            "Infection prevention and control": {
                "vendors": ["CleanDialysis Environments", "InfectoGuard Protocols", "SteriRenal Systems", "SafeDialyze Practices", "PathoShield Measures"],
                "comments": [
                    "Rigorous infection control measures to ensure patient safety.",
                    "Implementation of CDC guidelines in all treatment and common areas.",
                    "Regular training for staff on hygiene and infection prevention."
                ]
            }
        }
    },
    "Historical Sites": {
        "activities": {
            "Preservation and restoration projects": {
                "vendors": ["HeritagePreserve Constructors", "PastRenew Partners", "EraRestore Solutions", "TimeKeepers Restoration", "LegacyBuilders Co."],
                "comments": [
                    "Dedicated to maintaining the authenticity of historical structures.",
                    "Utilizing traditional materials and techniques for restoration.",
                    "Collaborating with historians to ensure historical accuracy."
                ]
            },
            "Educational programs and tours": {
                "vendors": ["HistoryWalks Education", "TimeTales Tours", "PastPaths Guides", "EduJourneys Historical", "HeritageExplorers Co."],
                "comments": [
                    "Engaging and informative tours for all age groups.",
                    "Customized educational programs for schools and universities.",
                    "Utilizing technology to enhance the learning experience."
                ]
            },
            "Exhibit design and installation": {
                "vendors": ["EraDisplays Solutions", "ExhibitPast Creators", "TimelineDesigns Co.", "HistoriCraft Installations", "ShowcaseHistory Partners"],
                "comments": [
                    "Creating immersive and interactive exhibits.",
                    "Highlighting significant historical events and figures.",
                    "Incorporating multimedia elements for dynamic presentations."
                ]
            },
            "Conservation research and studies": {
                "vendors": ["HeritageResearch Lab", "ConservaStudies Group", "AncientAnalytics Co.", "PastProbe Researchers", "TimeTested Science"],
                "comments": [
                    "Advanced research for preserving historical artifacts.",
                    "Developing new methods for long-term conservation.",
                    "Collaborating with academic institutions for comprehensive studies."
                ]
            },
            "Cultural heritage events": {
                "vendors": ["CulturaFest Organizers", "HeritageHappenings Events", "EpochEvents Planning", "AncestralCelebrations Co.", "TraditionsAlive LLC"],
                "comments": [
                    "Showcasing traditional crafts, music, and dances.",
                    "Promoting understanding and appreciation of cultural diversity.",
                    "Annual events to celebrate and preserve local heritage."
                ]
            },
            "Archaeological excavation support": {
                "vendors": ["DigDeep Archaeology", "PastLayers Excavations", "ArtifactFinders Co.", "HistoryUnearthed Services", "GroundStories LLC"],
                "comments": [
                    "Expert teams for sensitive archaeological digs.",
                    "Uncovering and documenting historical artifacts.",
                    "Working closely with historians and archaeologists for site preservation."
                ]
            },
            "Historical documentation and archiving": {
                "vendors": ["ArchiveMasters Services", "EraDocs Solutions", "PastRecords Co.", "HeritageFiles Organization", "TimeCapsule Archives"],
                "comments": [
                    "Digitizing and preserving historical documents.",
                    "Creating accessible archives for research and education.",
                    "Ensuring the longevity of valuable historical records."
                ]
            },
            "Landscape restoration and maintenance": {
                "vendors": ["HeritageGrounds Landscaping", "HistoricSites Gardening", "PastScapes Management", "EraGardens Care", "LegacyLawns Services"],
                "comments": [
                    "Restoring historical site landscapes to their original state.",
                    "Maintaining the natural beauty and historical integrity of sites.",
                    "Employing eco-friendly practices for sustainable site management."
                ]
            },
            "Historical site marketing and promotion": {
                "vendors": ["HistoryHighlights Marketing", "PastPromos Co.", "EraAttractions Advertising", "HeritageBuzz Solutions", "LegacyLure Campaigns"],
                "comments": [
                    "Raising awareness and interest in historical sites.",
                    "Developing engaging marketing campaigns for increased visitation.",
                    "Utilizing social media to reach a broader audience."
                ]
            },
            "Visitor services and amenities": {
                "vendors": ["TimeTraveler Amenities", "VisitorVault Services", "HeritageHosts Co.", "PastComforts Facilities", "HistoryHaven Concessions"],
                "comments": [
                    "Enhancing the visitor experience with quality amenities.",
                    "Providing informative and friendly services for guests.",
                    "Offering refreshments and merchandise inspired by historical themes."
                ]
            }
        }
    },
    "Bowling Centers": {
        "activities": {
            "League play organization": {
                "vendors": ["StrikeMasters Leagues", "PinChampions Coordinators", "BowlLeague Creators", "AlleyChamps Organizers", "KingpinLeagues Services"],
                "comments": [
                    "Hosting competitive leagues for all skill levels.",
                    "Creating a community atmosphere with regular play.",
                    "Organizing seasonal and themed bowling tournaments."
                ]
            },
            "Youth bowling programs": {
                "vendors": ["JuniorRollers Academy", "KidsBowl Club", "FutureStrikers Youth", "PinPals Junior Leagues", "LittleBowler Coaching"],
                "comments": [
                    "Introducing children to the sport in a fun, supportive environment.",
                    "Teaching fundamentals and sportsmanship through structured programs.",
                    "Hosting family-friendly events and youth tournaments."
                ]
            },
            "Bowling equipment sales and rental": {
                "vendors": ["ProShop Gear", "StrikeZone Equipment", "AlleyEquip Sales", "BowlEssentials Store", "PinGear Rentals"],
                "comments": [
                    "Offering a wide range of bowling balls, shoes, and accessories.",
                    "Providing expert advice for equipment selection.",
                    "Featuring the latest technology in bowling gear for optimal performance."
                ]
            },
            "Corporate and group events": {
                "vendors": ["GroupBowl Events", "CorporateStrikes Planners", "TeamPin Gatherings", "EventBowl Solutions", "StrikeTeam Organizing"],
                "comments": [
                    "Tailoring packages for team-building and corporate outings.",
                    "Offering private lanes and meeting spaces for groups.",
                    "Customizing events with catering and entertainment options."
                ]
            },
            "Bowling instruction and coaching": {
                "vendors": ["BowlCoach Institute", "StrikeSkill Trainers", "PinPros Lessons", "AlleyMasters Coaching", "FrameUp Instruction"],
                "comments": [
                    "Improving skills through personalized coaching sessions.",
                    "Catering to both beginners and advanced players.",
                    "Utilizing video analysis for technique improvement."
                ]
            },
            "Food and beverage services": {
                "vendors": ["LaneSnacks Cafe", "PinBites Grill", "BowlBar Beverages", "StrikeEats Restaurant", "GutterGourmet Catering"],
                "comments": [
                    "Enhancing the bowling experience with quality dining options.",
                    "Featuring a menu of favorite foods and craft beverages.",
                    "Hosting themed food nights and happy hour specials."
                ]
            },
            "Bowling alley maintenance and technology": {
                "vendors": ["PinSet Mechanics", "LaneTech Solutions", "FrameFix Services", "AlleyCare Maintenance", "BowlSys Tech"],
                "comments": [
                    "Ensuring optimal lane conditions with regular maintenance.",
                    "Incorporating the latest scoring and pinsetter technology.",
                    "Offering a seamless experience with well-maintained equipment."
                ]
            },
            "Special events and themed nights": {
                "vendors": ["ThemeBowl Nights", "AlleyFest Events", "CosmicBowl Parties", "RetroRoll Back", "GlowPin Evenings"],
                "comments": [
                    "Hosting unique themed nights for memorable experiences.",
                    "Attracting diverse crowds with costume, music, and decade nights.",
                    "Creating a vibrant and entertaining atmosphere for all ages."
                ]
            },
            "Membership and loyalty programs": {
                "vendors": ["BowlClub Memberships", "PinPerks Loyalty", "StrikeSavers Club", "FrameRewards Program", "AlleyAdvantage Benefits"],
                "comments": [
                    "Offering exclusive benefits and discounts to members.",
                    "Building a loyal community with rewards and recognition.",
                    "Providing special offers and early access to events."
                ]
            },
            "Arcade and additional entertainment": {
                "vendors": ["ArcadeStrike Zone", "PinPlay Arcade", "AlleyGames Entertainment", "Bowl&Play Centers", "GameFrame Arcade"],
                "comments": [
                    "Complementing bowling with a variety of arcade games.",
                    "Catering to families and younger guests with diverse entertainment.",
                    "Regularly updating game selections for fresh experiences."
                ]
            }
        }
    },
    "Political Organizations": {
        "activities": {
            "Voter education and registration drives": {
                "vendors": ["VoteReady Campaigns", "ElectAware Initiatives", "DemocracyBoost Org", "RegisterNow Networks", "CivicDuty Partners"],
                "comments": [
                    "Enhancing public awareness on the importance of voting.",
                    "Facilitating easy access to voter registration resources.",
                    "Organizing community workshops for informed electoral participation."
                ]
            },
            "Campaign strategy and management": {
                "vendors": ["StrategyPol Consultants", "CampaignEdge Solutions", "ElectionWin Advisors", "PolManage Experts", "VoteCraft Strategies"],
                "comments": [
                    "Developing comprehensive campaign strategies for candidates.",
                    "Providing end-to-end campaign management services.",
                    "Utilizing data analytics for targeted voter outreach."
                ]
            },
            "Political advocacy and lobbying": {
                "vendors": ["AdvocateVoice Group", "PolicyPush Lobbyists", "ChangeMakers Coalition", "ActionAgenda Advocates", "CivicInfluence Partners"],
                "comments": [
                    "Representing constituent interests at legislative levels.",
                    "Engaging in policy advocacy for social and political change.",
                    "Building coalitions for broader impact on public policy."
                ]
            },
            "Fundraising and donor management": {
                "vendors": ["FundFuture Campaigns", "DonateWell Services", "ElevateFunds Platform", "PledgeProspect Initiative", "SupporterSync Solutions"],
                "comments": [
                    "Implementing effective fundraising strategies for political causes.",
                    "Managing donor relations and recurring contribution programs.",
                    "Leveraging digital platforms for campaign financing."
                ]
            },
            "Social media and digital campaigning": {
                "vendors": ["DigitalVoice Campaigns", "SocialSphere Strategies", "NetInfluence Marketing", "OnlineElect Solutions", "WebCampaign Creators"],
                "comments": [
                    "Maximizing online presence for political campaigns.",
                    "Engaging voters through targeted social media content.",
                    "Analyzing digital outreach efforts for optimized engagement."
                ]
            },
            "Issue-based advocacy campaigns": {
                "vendors": ["CauseChampion Org", "IssueAdvocates Network", "RightFocus Campaigns", "UnityVoice Coalition", "ActionForChange Group"],
                "comments": [
                    "Focusing on key social, environmental, and economic issues.",
                    "Raising public awareness and support for legislative action.",
                    "Mobilizing grassroots movements for policy advocacy."
                ]
            },
            "Political event planning and coordination": {
                "vendors": ["EventPolitik Planners", "RallyReady Coordinators", "DebateStage Organizers", "TownHall Productions", "ElectEvents Company"],
                "comments": [
                    "Organizing rallies, debates, and town hall meetings.",
                    "Coordinating logistics for large-scale political events.",
                    "Creating impactful experiences for candidate support and advocacy."
                ]
            },
            "Public relations and media outreach": {
                "vendors": ["MediaMatters Relations", "PressPoint Communications", "PublicVoice PR", "CampaignSpotlight PR", "ElectorateEngage Media"],
                "comments": [
                    "Managing media relations to shape public perception.",
                    "Crafting press releases and managing news coverage.",
                    "Strategizing media outreach to elevate political profiles."
                ]
            },
            "Research and policy analysis": {
                "vendors": ["PolicyInsight Research", "ElectAnalytica Thinktank", "VoteMetrics Analysis", "CivicStudies Institute", "StrategySphere Analytics"],
                "comments": [
                    "Conducting research to inform policy positions and advocacy.",
                    "Analyzing electoral trends and voter behavior.",
                    "Providing strategic insights for campaign and policy development."
                ]
            },
            "Grassroots organizing and mobilization": {
                "vendors": ["GrassrootsEmpower Network", "MobilizeAction Group", "CommunityVoice Organizers", "RootsRise Movements", "CivicMobilize Collective"],
                "comments": [
                    "Building community networks for political engagement.",
                    "Empowering volunteers through training and resources.",
                    "Organizing door-to-door campaigns and local events for voter outreach."
                ]
            }
        }
    }
}



In [None]:
import pandas as pd
import numpy as np

def generate_record(activity, vendor, cost, comment, naics_title, csv_columns):
    """
    Build one dataset row keyed by the names in csv_columns.

    Values are matched to columns purely by position, so csv_columns must list
    at least five names in this order: activity description, vendor, cost,
    comment, NAICS title. Any additional columns are left as empty strings.

    Parameters:
    - activity: The business activity description.
    - vendor: The vendor name.
    - cost: The cost in USD.
    - comment: The comment associated with the activity.
    - naics_title: The 2017 NAICS Title.
    - csv_columns: List of column names.

    Returns:
    - A dictionary representing a single record.
    """
    positional_values = (activity, vendor, cost, comment, naics_title)

    # Every column starts out blank; the first five are then filled by position.
    record = {column: "" for column in csv_columns}
    for position, value in enumerate(positional_values):
        record[csv_columns[position]] = value

    return record

def generate_dataset(naics_info, csv_columns, repeats=10):
    """
    Expand the nested NAICS description into a flat DataFrame of synthetic records.

    One record is produced for every (NAICS title, activity, vendor, comment)
    combination, and the whole sweep is repeated `repeats` times so that each
    combination appears with several different random costs.

    Parameters:
    - naics_info: Mapping of NAICS title -> {'activities': {activity:
      {'vendors': [...], 'comments': [...]}}}.
    - csv_columns: List of column names; passed through to generate_record and
      used as the DataFrame columns.
    - repeats: Number of full passes over naics_info. Defaults to 10, the value
      that was previously hard-coded.

    Returns:
    - A pandas DataFrame with one row per generated record.
    """
    records = []
    for _ in range(repeats):
        for naics_title, details in naics_info.items():
            for activity, info in details['activities'].items():
                for vendor in info['vendors']:
                    for comment in info['comments']:
                        # Cost is drawn from [0, 1000) using NumPy's global random state.
                        cost_usd = np.random.random() * 1000
                        records.append(
                            generate_record(activity, vendor, cost_usd, comment, naics_title, csv_columns)
                        )
    return pd.DataFrame(records, columns=csv_columns)

# Build the full synthetic dataset from naics_info
dataset = generate_dataset(naics_info, csv_columns)

# Split into training and test sets (frac=0.833 goes to training);
# the fixed random_state keeps the row selection repeatable for a given dataset
train_dataset = dataset.sample(frac=0.833, random_state=42)
test_dataset = dataset.drop(train_dataset.index)

# Upload (or update in place) the complete dataset and both splits.
# The "total" file must contain the complete dataset, not only the training split.
upload_dataframe(drive, folder_id, dataset, total_file_name)
upload_dataframe(drive, folder_id, train_dataset, train_file_name)
upload_dataframe(drive, folder_id, test_dataset, test_file_name)

# Alternative: write the splits to the mounted Drive folder instead of uploading
# train_dataset.to_csv(file_folder+'sustainability_business_activities_training.csv', index=False)
# test_dataset.to_csv(file_folder+'sustainability_business_activities_test.csv', index=False)

print(f"Dataset generated with {len(train_dataset)} training records, {len(test_dataset)} test records for {len(naics_info)} titles.")


NameError: name 'naics_info' is not defined

In [None]:
# Names of the CSV files looked up in (and downloaded from) the Google Drive folder
train_file_name = 'sustainability_business_activities_training.csv'
test_file_name = 'sustainability_business_activities_test.csv'
EPA_file_name = 'EPA_EmissionData.csv'


# Define the fields to potentially introduce errors
fields = ['Business Activity Description', 'Business Activity Vendor', 'Business Activity Comment']
# Default upper bound on how many of `fields` may be selected for an error per row
# (used as the default max_errors of apply_errors_with_limit)
num_max_error_columns = 1
# Column whose distinct values are turned into integer labels
naics_title_column_name = '2017 NAICS Title'
activity_cost_column_name = 'Business Activity Cost USD'
# NOTE(review): presumably a column of the EPA factor CSV — confirm against the file
epa_emission_factor_column_name = 'Supply Chain Emission Factors without Margins'
# Delimiter placed between the per-field parts built by combine_text_fields
separator_string = " [SEP] "
# Suffix appended to a field name to form the column written by apply_errors_with_limit
error_column_append_text = " Error"
combined_text_column_name = 'combined_text'
# Column that receives the integer-encoded NAICS title labels
encoded_label_column_name = 'encoded_labels'

demo_mode = True

def find_file_id_by_name(drive, folder_id, file_name):
    """Search for a file by name in the specified Google Drive folder.

    Parameters:
    - drive: Client object exposing ListFile({'q': ...}).GetList().
    - folder_id: ID of the Drive folder to search in.
    - file_name: Exact title of the file to look for.

    Returns:
    - The 'id' of the first matching file, or None when nothing matches.
    """
    def _quote(value):
        # Backslashes and single quotes have to be backslash-escaped inside a
        # single-quoted Drive query literal; otherwise a title such as
        # "it's.csv" produces a malformed query.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    query = f"'{_quote(folder_id)}' in parents and trashed=false and title='{_quote(file_name)}'"
    file_list = drive.ListFile({'q': query}).GetList()
    return file_list[0]['id'] if file_list else None

def _fetch_drive_csv(drive, folder_id, file_name):
    """Download `file_name` from the Drive folder to the working directory and load it.

    Returns:
    - (file_id, drive_file, dataframe): the Drive file id, the Drive file object
      used for the download, and the CSV parsed with pandas.

    Raises:
    - FileNotFoundError: when the folder has no file with that title, instead of
      passing a None id on to the Drive client.
    """
    file_id = find_file_id_by_name(drive, folder_id, file_name)
    if file_id is None:
        raise FileNotFoundError(f"'{file_name}' was not found in Drive folder '{folder_id}'")
    drive_file = drive.CreateFile({'id': file_id})
    drive_file.GetContentFile(file_name)  # stored locally under the same name
    return file_id, drive_file, pd.read_csv(file_name)


# Load the EPA emission factor data
epa_file_id, epa_data, factor_df = _fetch_drive_csv(drive, folder_id, EPA_file_name)
print(f"{len(factor_df)} of rows in factor csv")
# print(factor_df.head())

# Load the training data
train_file_id, activity_data, activity_df = _fetch_drive_csv(drive, folder_id, train_file_name)
print(f"{len(activity_df)} of rows in training csv")
# print(activity_df.head())

# Load the test data
test_file_id, activity_test_data, test_df = _fetch_drive_csv(drive, folder_id, test_file_name)
print(f"{len(test_df)} of rows in test csv")
# print(test_df.head())

NameError: name 'drive' is not defined

In [None]:
# Give every distinct NAICS title an integer label, numbered in the order
# the titles are returned by unique(), then store the label next to each row.
label_dict = dict(
    (title, position)
    for position, title in enumerate(activity_df[naics_title_column_name].unique())
)
activity_df[encoded_label_column_name] = activity_df[naics_title_column_name].map(label_dict)

In [None]:
import pandas as pd
import numpy as np
import random

# Helper functions
def introduce_minor_errors(text):
    """Return `text` with two or three small typos (substitution, omission or swap).

    Edits stop early once the text is four characters or shorter. A substitution
    attempt on a character that has no entry in the typo table does not count
    as an edit and is simply retried with a new random choice.
    """
    # Typo table: each key is replaced by the character it maps to.
    typo_map = {'a': 's', 's': 'a', 'd': 'f', 'i': 'o', 'o': 'p', 'e': 'r', 'r': 't'}

    target_edits = random.randint(2, 3)
    edits_done = 0

    while edits_done < target_edits and len(text) > 4:
        kind = random.choice(['substitute', 'omit', 'swap'])
        # Positions 0 and len(text) - 1 are never picked as the edit position.
        pos = random.randint(1, len(text) - 2)

        before, current, after = text[:pos], text[pos], text[pos + 1:]

        if kind == 'substitute':
            replacement = typo_map.get(current)
            if replacement is None:
                continue
            text = before + replacement + after
        elif kind == 'omit':
            text = before + after
        elif kind == 'swap':
            if not after:
                continue
            # Exchange the picked character with its right-hand neighbour.
            text = before + after[0] + current + after[1:]
        edits_done += 1

    return text

def introduce_major_errors(text):
    """Scramble the text, optionally prefixing irrelevant words, to introduce a major error.

    With equal probability the result is either a random permutation of the
    characters of `text`, or that permutation prefixed with "Irrelevant text ".

    Parameters:
    - text: The string to corrupt.

    Returns:
    - The corrupted string (same characters as `text`, shuffled, plus the
      optional prefix).
    """
    # The random draws happen in a fixed order: first the branch, then the shuffle.
    add_prefix = random.random() >= 0.5
    scrambled = ''.join(random.sample(text, len(text)))
    if add_prefix:
        return "Irrelevant text " + scrambled
    return scrambled

# Function to randomly apply either minor or major errors to a text
def apply_random_error(text, error_probability=0.20):
    """Return `text` unchanged, or corrupted with probability `error_probability`.

    Parameters:
    - text: The string to possibly corrupt.
    - error_probability: Chance, between 0 and 1, that an error is introduced.
      Defaults to 0.20 (20%).

    Returns:
    - The original text, or the result of introduce_major_errors(text).
      Only major errors are applied here; introduce_minor_errors is not used.
    """
    if random.random() < error_probability:
        return introduce_major_errors(text)
    return text


def apply_errors_with_limit(row, fields, max_errors=num_max_error_columns):
    """
    Randomly apply errors to a limited number of fields in a row.

    For every name in ``fields`` a companion entry named
    ``field + error_column_append_text`` is written into ``row``. Selected
    fields receive ``apply_random_error(row[field])`` (which may still leave
    the text unchanged); all other fields receive a plain copy of the original.

    Parameters:
    - row: The DataFrame row to apply errors to. It is modified in place.
    - fields: A list of field names to potentially apply errors to.
    - max_errors: Maximum number of fields to apply errors to. Values larger
      than ``len(fields)`` are treated as ``len(fields)``.

    Returns:
    - The same ``row`` object with the companion entries added.
    """
    # random.sample raises ValueError when asked for more items than the
    # population holds, so never draw a count above the number of fields.
    upper_bound = min(max_errors, len(fields))

    # Randomly decide how many fields to apply errors to (0 to upper_bound)
    errors_to_apply = random.randint(0, upper_bound)

    # Randomly select the fields where errors will be applied
    fields_with_errors = set(random.sample(fields, errors_to_apply))

    # Write the companion entry for every field, corrupted or not
    for field in fields:
        original_value = row[field]
        if field in fields_with_errors:
            row[field + error_column_append_text] = apply_random_error(original_value)
        else:
            row[field + error_column_append_text] = original_value

    return row

def combine_text_fields(row, fields):
    """
    Build one labelled string from the error-column versions of ``fields``.

    Each field contributes a ``"<label>: <value>"`` part, where the label is
    the field name with spaces replaced by underscores and the value is read
    from the column named ``field + error_column_append_text``. Fields whose
    error column is not present in ``row`` are left out. The parts are joined
    with ``separator_string``.

    Parameters:
    - row: A DataFrame row containing the error columns.
    - fields: A list of field names to be combined.

    Returns:
    - The combined text string.
    """
    # Pair each field name with the name of its error column.
    name_and_column = ((name, f"{name}{error_column_append_text}") for name in fields)
    return separator_string.join(
        f"{name.replace(' ', '_')}: {row[column]}"
        for name, column in name_and_column
        if column in row
    )

In [None]:
from datetime import datetime

current_date = datetime.now()
# Format as 3-letter month abbreviation followed by the zero-padded day, e.g. 'Mar12'
# NOTE(review): formatted_date is not used anywhere else in this cell.
formatted_date = current_date.strftime('%b%d')

# Training data: add an error column for every field; apply_errors_with_limit
# corrupts at most `num_max_error_columns` (its default max_errors) fields per row
activity_df = activity_df.apply(lambda row: apply_errors_with_limit(row, fields), axis=1)
# Apply the function to each row of the DataFrame to create the combined-text column
activity_df[combined_text_column_name] = activity_df.apply(lambda row: combine_text_fields(row, fields), axis=1)
# The combined-text column now holds the concatenated error-column texts
# (apply_random_error currently injects major errors only)
temp_train_file_name = 'sustainability_business_activities_training_with_bad_errors.csv'

# Test data: same error injection as for the training data above
test_df = test_df.apply(lambda row: apply_errors_with_limit(row, fields), axis=1)
# Combine the possibly altered text fields into the combined-text column
test_df[combined_text_column_name] = test_df.apply(lambda row: combine_text_fields(row, fields), axis=1)
temp_test_file_name = 'sustainability_business_activities_test_with_bad_errors.csv'

# Upload both frames as CSV files into the configured Google Drive folder
upload_dataframe(drive, folder_id, activity_df, temp_train_file_name)
upload_dataframe(drive, folder_id, test_df, temp_test_file_name)

Updated: train_Mar12.csv
Updated: test_Mar12.csv


In [None]:
# Convert the online CoNLL datasets into one txt file that we can use for fine-tuning GPT-2.
# Might need to edit this to skip certain tweets/lines.
# Content that may currently be hurting the fine-tuning data:
# - links (check whether strings longer than 4 characters start with "http")
# - usernames starting with @
# - any lines made up of non-alphabetical characters (e.g. # symbols or emojis)


# Function to find file IDs by names
def find_file_ids_by_names(drive, folder_id, file_names):
    """Look up the Drive file ID for each name in ``file_names``.

    Parameters:
    - drive: The PyDrive client passed through to find_file_id_by_name.
    - folder_id: ID of the Google Drive folder to search.
    - file_names: Iterable of file names to look up.

    Returns:
    - A list with one entry per name, in the same order; each entry is
      whatever find_file_id_by_name returns for that name (None when the
      file is not found).
    """
    return [find_file_id_by_name(drive, folder_id, name) for name in file_names]

# Function to upload text content to Google Drive
def upload_text_content(drive, folder_id, text_content, file_name):
    """Write ``text_content`` to ``file_name`` in a Google Drive folder.

    If a file with that name already exists in the folder its content is
    replaced; otherwise a new file is created inside the folder. A one-line
    "Updated: ..." or "Created: ..." message is printed afterwards.

    Parameters:
    - drive: The PyDrive client used for the lookup and the upload.
    - folder_id: ID of the target Google Drive folder.
    - text_content: The string to store as the file's content.
    - file_name: Title of the file inside the folder.
    """
    existing_id = find_file_id_by_name(drive, folder_id, file_name)

    # Only the file metadata and the status message differ between the
    # "update" and "create" cases; the upload steps themselves are shared.
    if existing_id:
        metadata = {'id': existing_id}
        status_message = f"Updated: {file_name}"
    else:
        metadata = {'title': file_name, 'parents': [{'id': folder_id}]}
        status_message = f"Created: {file_name}"

    drive_file = drive.CreateFile(metadata)
    drive_file.SetContentString(text_content)
    drive_file.Upload()
    print(status_message)

# Function to read content from Google Drive
def read_text_content_from_drive(drive, file_id):
    """Return the content of the Drive file with ID ``file_id`` as a string.

    Parameters:
    - drive: The PyDrive client used to open the file.
    - file_id: ID of the Google Drive file to read.
    """
    return drive.CreateFile({'id': file_id}).GetContentString()

# Function to process content from Google Drive
def process_and_upload_conll_content(drive, folder_id, file_ids, output_file_name):
    """Collect the first column of several CoNLL files into one text file on Drive.

    Every non-blank line contributes its first whitespace-separated column (the
    token); every blank line contributes a newline character. All collected
    items are joined with single spaces and uploaded as ``output_file_name``.

    Parameters:
    - drive: The PyDrive client used for reading and uploading.
    - folder_id: ID of the Google Drive folder that receives the output file.
    - file_ids: Drive file IDs of the CoNLL files to read. Falsy entries
      (e.g. the None that a failed name lookup produces) are skipped with a
      message instead of being passed on to Drive.
    - output_file_name: Title of the text file to create or update.
    """
    collected_tokens = []

    # Process each CoNLL file by file ID
    for file_id in file_ids:
        if not file_id:
            # A missing ID means the file was not found in the folder;
            # there is nothing to download for it.
            print("Skipping a CoNLL file that has no Drive file ID (file not found).")
            continue

        file_content = read_text_content_from_drive(drive, file_id)
        for line in file_content.splitlines():
            columns = line.split()
            # Blank lines become a newline token in the output.
            collected_tokens.append(columns[0] if columns else '\n')

    # Convert list to string
    text_content = ' '.join(collected_tokens)

    # Upload collected tweets to Google Drive
    upload_text_content(drive, folder_id, text_content, output_file_name)
    print(f"Uploaded {output_file_name} to Google Drive.")



# Names of the CoNLL files (inside the Drive folder `folder_id`) to process
conll_file_names = ['a.conll', 'b.conll', 'c.conll', 'd.conll', 'e.conll', 'f.conll']
# Resolve every file name to its Drive file ID (None for names that are not found)
conll_file_ids = find_file_ids_by_names(drive, folder_id, conll_file_names)

# Title of the combined text file written back to the same Drive folder
output_file_name = "collected_tweets.txt"

# Build the combined text from all CoNLL files and upload it
process_and_upload_conll_content(drive, folder_id, conll_file_ids, output_file_name)

Updated: collected_tweets.txt
Uploaded collected_tweets.txt to Google Drive.


In [None]:
# GPT-2 fine-tuning. Takes less than 10 minutes.
# Also tried GPT-Neo 2.7B, but it was too compute heavy.


!pip install torch torchvision
!pip install sentence-transformers accelerate -U
!pip install transformers[torch] -U
!pip install transformers -U
!pip install accelerate -U

import transformers
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
import accelerate
from transformers import Trainer, TrainingArguments
from google.colab import drive as dr


# Log library versions so the run can be reproduced
print("Accelerate version:", accelerate.__version__)
print("Transformers version:", transformers.__version__)


# Mount Google Drive; the training text is read from it and the model is saved to it
dr.mount('/content/drive')
finetune_path = '/content/drive/My Drive/224_project/collected_tweets.txt'
# NOTE(review): this path contains a doubled slash ('My Drive//224_project')
output_dir = '/content/drive/My Drive//224_project/gpt2_finetuned'

# Load the pretrained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


## GPT-Neo 2.7B is closer to GPT-3, but fine-tuning it is too compute heavy and always fails
# tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
# model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B')


# Prepare the dataset from the collected tweets text file
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=finetune_path,
    block_size=128)  # Block size is a tunable; 128 was the value tried

# mlm=False: collate for plain (non-masked) language modelling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,  # Tunable: number of epochs
    per_device_train_batch_size=4,  # Tunable: batch size per device
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model and its tokenizer to the same directory so that
# both can later be loaded from `output_dir`
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Inject errors using GPT-2. Tried both the fine-tuned and the non-fine-tuned model.
# Also tried GPT-Neo 2.7B. Experimented with parameters, prompts, and no prompts,
# but the model is not smart enough to recreate the original cell
# content with errors. Given the cell "hello world", regardless of whether a good
# prompt is prepended or not, it first generates "hello world" matching identically, and
# then it either generates errors/language that does not make sense (if fine-tuned)
# or just continues the sentence (if not fine-tuned).


import pandas as pd
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, pipeline
# import openai
from google.colab import drive as dr


dr.mount('/content/drive')

# openai.api_key = os.environ["OPENAI_API_KEY"]  # SECURITY: a literal API key used to be written here; revoke that key and load it from the environment instead


# Text columns that will be rewritten with generated errors
fields = ['Business Activity Description', 'Business Activity Vendor', 'Business Activity Comment']
# Name of the column that receives the joined text of all `fields`
combined_text_column_name = 'combined_text'

# Load the fine-tuned GPT-2 model and tokenizer for text generation
model_path = '/content/drive/My Drive/224_project/gpt2_finetuned'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)


# Alternative that was tried: GPT-2 without fine-tuning
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# model = GPT2LMHeadModel.from_pretrained('gpt2')



## Alternative that was tried: GPT-Neo 2.7B without fine-tuning
# tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
# model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B')


# Text-generation pipeline built from the model and tokenizer loaded above
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)


# Few-shot prompt; the text of the cell to corrupt is appended directly after it
prompt = """Below are sentences followed by versions with common typing errors,
grammatical mistakes, or word misuses that people often make.
Please transform the provided sentence in a similar manner by introducing errors:

Correct: The quick brown fox jumps over the lazy dog.
With errors: Teh quikc brwon fxo jumps oevr teh lazi doog.

Correct: I will meet you at the library at 3 PM today.
With errors: I wil met yu at teh libary at 3 PM tdoay.

Correct: Ensure all documents are organized and submitted by the deadline.
With errors: Esnure al documetns are orgnaized and sbmitted by teh deadline.

Now, introduce errors into the following sentence: """

# Prompt length in tokens (not characters); used to size the generation budget
prompt_length = len(tokenizer.encode(prompt))
# prompt_chars = len(prompt)


def combine_text_fields(row, fields):
    """Join the values of ``fields`` in ``row`` into one ':'-separated string.

    Note: an earlier cell of this notebook defines a helper with the same
    name (it labels each field and joins with ``separator_string``); running
    this cell replaces that definition.

    Parameters:
    - row: Mapping or DataFrame row holding a string value for every field.
    - fields: Field names whose values are joined, in this order.

    Returns:
    - The joined string.
    """
    selected_values = [row[name] for name in fields]
    return ':'.join(selected_values)

def introduce_errors_with_gpt2(input_csv_path, output_csv_path, fields):
    """Rewrite the text fields of a CSV with GPT-2 generated text and save the result.

    For every row and every name in ``fields``, the few-shot ``prompt`` followed
    by the original cell text is sent to ``text_generator``, and the generated
    continuation replaces the cell. Afterwards the combined-text column is
    built from the rewritten fields and the frame is written to
    ``output_csv_path``.

    Relies on the notebook-level names ``tokenizer``, ``text_generator``,
    ``prompt``, ``prompt_length``, ``combined_text_column_name`` and
    ``combine_text_fields``.

    Parameters:
    - input_csv_path: Path of the CSV file to read.
    - output_csv_path: Path the modified CSV is written to (without the index).
    - fields: Names of the text columns to rewrite.
    """
    df = pd.read_csv(input_csv_path)
    total_rows = len(df)
    print(f"Total rows in file: {total_rows}")

    for index, row in df.iterrows():
        print(f"Processing row {index+1}/{total_rows}...")
        for field in fields:
            original_text = row[field]
            text_length = len(tokenizer.encode(original_text))
            # Token budget: the input (prompt + text) plus about as many new
            # tokens as the text itself has, capped at the model's limit.
            max_length = min((2 * text_length) + prompt_length, tokenizer.model_max_length)

            # Sampling parameters below are the ones that were experimented with.
            generated_text_with_errors = text_generator(
                prompt + original_text,
                max_length=max_length,  # Consider reducing max_length
                temperature=1,
                top_k=40,  # Adjust for diversity
                top_p=0.9,  # Nucleus sampling for coherent yet diverse output
                num_return_sequences=1,
                # Return only the newly generated tokens. By default the
                # pipeline echoes the input, which stored the whole prompt and
                # the original text in every cell.
                return_full_text=False,
            )[0]['generated_text'].strip()

            # The OpenAI completion API was also tried at this point, but it is not free.

            df.at[index, field] = generated_text_with_errors

    # Apply the function to each row of the DataFrame to create the 'combined_text' column
    df[combined_text_column_name] = df.apply(lambda row: combine_text_fields(row, fields), axis=1)

    df.to_csv(output_csv_path, index=False)
    print(f"Finished processing {input_csv_path}. Saved to {output_csv_path}.")


# Debug run on the small example file; the test and training runs further
# down in this cell are commented out
input_csv_path = '/content/drive/My Drive/224_project/example.csv'
errors_csv_path = '/content/drive/My Drive/224_project/example_errors.csv'
introduce_errors_with_gpt2(input_csv_path, errors_csv_path, fields)

# test
# input_csv_path_test = '/content/drive/My Drive/224_project/sustainability_business_activities_test.csv'
# errors_csv_path_test = '/content/drive/My Drive/224_project/sustainability_business_activities_test_with_errors.csv'
# introduce_errors_with_gpt2(input_csv_path_test, errors_csv_path_test, fields)

# training
# input_csv_path_train = '/content/drive/My Drive/224_project/sustainability_business_activities_training.csv'
# errors_csv_path_train = '/content/drive/My Drive/224_project/sustainability_business_activities_training_with_errors.csv'
# introduce_errors_with_gpt2(input_csv_path_train, errors_csv_path_train, fields)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Total rows in file: 9
Processing row 1/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 2/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 3/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 4/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 5/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 6/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 7/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 8/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing row 9/9...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Finished processing /content/drive/My Drive/224_project/example.csv. Saved to /content/drive/My Drive/224_project/example_errors.csv.


In [16]:
# Model to score how similar the error files are to the original file before errors.
# Can compare good errors, bad errors, and the original. A good score is a high
# similarity score: it reflects how close the errors are to the original,
# i.e. human-like errors that stay similar to the originally intended text.
# I was not able to run this yet; never got to this part.

# !pip install sentence-transformers

from sentence_transformers import SentenceTransformer, util
import pandas as pd

from google.colab import drive



!ls -l "/content/drive/My Drive/224n_project/"



# Sentence-embedding model read (as the global `model`) by
# score_text_similarity_and_save_summary below.
# NOTE(review): this rebinds the notebook-level name `model`, which the GPT-2
# cells also assign; re-run those cells before generating text again.
model = SentenceTransformer('all-MiniLM-L6-v2')

def score_text_similarity_and_save_summary(original_csv_path, errors_csv_path_gpt2, errors_csv_path_probability, output_csv_path, fields):
    """Score how close two error-injected CSVs are to the original and save the results.

    For every row and every name in ``fields`` the original text and the two
    modified texts are embedded with the notebook-level ``model`` and compared
    with cosine similarity. Per-cell scores are written to ``output_csv_path``;
    totals and averages are written to a sibling file whose name has
    ``_summary`` inserted before the extension.

    Parameters:
    - original_csv_path: CSV with the unmodified text.
    - errors_csv_path_gpt2: CSV with the GPT-2 generated errors.
    - errors_csv_path_probability: CSV with the probability-based errors.
    - output_csv_path: Destination of the detailed per-cell scores.
    - fields: Names of the text columns to compare.

    The three input files are read by position: row ``i`` of each error file
    is compared with row ``i`` of the original.
    """
    import os  # local import: only needed to derive the summary file name

    original_df = pd.read_csv(original_csv_path)
    errors_df_gpt2 = pd.read_csv(errors_csv_path_gpt2)
    errors_df_probability = pd.read_csv(errors_csv_path_probability)

    # Drop original fields to avoid confusion in the results
    results_df = original_df.drop(fields, axis=1)

    total_scores_gpt2, total_scores_probability = [], []

    for index in range(len(original_df)):
        for field in fields:
            original_vector = model.encode(original_df.at[index, field], convert_to_tensor=True)
            gpt2_vector = model.encode(errors_df_gpt2.at[index, field], convert_to_tensor=True)
            probability_vector = model.encode(errors_df_probability.at[index, field], convert_to_tensor=True)

            score_gpt2 = util.pytorch_cos_sim(original_vector, gpt2_vector).item()
            score_probability = util.pytorch_cos_sim(original_vector, probability_vector).item()

            total_scores_gpt2.append(score_gpt2)
            total_scores_probability.append(score_probability)

            results_df.at[index, f'{field}_GPT2_SimilarityScore'] = score_gpt2
            results_df.at[index, f'{field}_Probability_SimilarityScore'] = score_probability

    # Save detailed similarity scores
    results_df.to_csv(output_csv_path, index=False)

    def _average(scores):
        # An empty input (no rows or no fields) has no meaningful average;
        # report NaN instead of raising ZeroDivisionError.
        return sum(scores) / len(scores) if scores else float('nan')

    # Calculate and save summary statistics
    summary_data = {
        'Total_GPT2_SimilarityScore': sum(total_scores_gpt2),
        'Average_GPT2_SimilarityScore': _average(total_scores_gpt2),
        'Total_Probability_SimilarityScore': sum(total_scores_probability),
        'Average_Probability_SimilarityScore': _average(total_scores_probability)
    }

    summary_df = pd.DataFrame([summary_data])
    # Insert '_summary' before the extension. Unlike str.replace('.csv', ...),
    # this always yields a path different from output_csv_path, so the summary
    # can never overwrite the detailed results.
    path_root, path_ext = os.path.splitext(output_csv_path)
    summary_path = f"{path_root}_summary{path_ext}"
    summary_df.to_csv(summary_path, index=False)
    print(f"Summary statistics saved to {summary_path}")


# Text columns whose similarity to the original is scored
fields = ['Business Activity Description', 'Business Activity Vendor', 'Business Activity Comment']

# NOTE(review): these paths point at the '224n_project' folder, whereas the
# earlier cells read and write '224_project' — confirm which folder is current.
original_csv_path_test = '/content/drive/My Drive/224n_project/sustainability_business_activities_test.csv'
errors_csv_path_gpt2_test = '/content/drive/My Drive/224n_project/sustainability_business_activities_test_with_errors.csv'
errors_csv_path_probability_test = '/content/drive/My Drive/224n_project/sustainability_business_activities_test_with_bad_errors.csv'
output_csv_path_test = '/content/drive/My Drive/224n_project/similarity_scores_test.csv'

# Score the test split: GPT-2 errors vs. probability-based ("bad") errors
score_text_similarity_and_save_summary(original_csv_path_test, errors_csv_path_gpt2_test, errors_csv_path_probability_test, output_csv_path_test, fields)

# original_csv_path_training = '/content/drive/My Drive/224n_project/sustainability_business_activities_training.csv'
# errors_csv_path_probability_training = '/content/drive/My Drive/224n_project/sustainability_business_activities_training_with_bad_errors.csv'
# errors_csv_path_gpt2_training = '/content/drive/My Drive/224n_project/sustainability_business_activities_training_with_errors.csv'
# output_csv_path_training = '/content/drive/My Drive/224n_project/similarity_scores_training.csv'
# score_text_similarity_and_save_summary(original_csv_path_training, errors_csv_path_gpt2_training, errors_csv_path_probability_training, output_csv_path_training, fields)

total 18869
-rw------- 1 root root   59961 Mar 17 02:22  accuracy_vs_error_rate_20240317.png
-rw------- 1 root root  121889 Mar 12 20:56  a.conll
-rw------- 1 root root  235519 Mar 12 20:36  b.conll
-rw------- 1 root root  758474 Mar 12 21:42  cached_lm_GPT2Tokenizer_128_collected_tweets.txt
-rw------- 1 root root       0 Mar 13 19:12  cached_lm_GPT2Tokenizer_128_collected_tweets.txt.lock
-rw------- 1 root root   34631 Mar 12 20:36  c.conll
-rw------- 1 root root  875834 Mar 12 21:27  collected_tweets.txt
-rw------- 1 root root  320339 Mar 12 20:36  d.conll
-rw------- 1 root root  294166 Mar 12 20:36  e.conll
-rw------- 1 root root  123247 Mar 12 04:06  EPA_EmissionData.csv
-rw------- 1 root root    1512 Mar 13 05:00  example.csv
-rw------- 1 root root   38804 Mar 13 20:27  example_errors.csv
-rw------- 1 root root  235158 Mar 12 20:36  f.conll
drwx------ 2 root root    4096 Mar 12 22:25  gpt2_finetuned
-rw------- 1 root root 2858223 Mar 17 01:31  movies.csv
-rw------- 1 root root  123

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/224n_project/sustainability_business_activities_training_with_errors.csv'

In [None]:
import numpy as np
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Split data into features and labels
# activity_df['combined_text'] = activity_df['Business Activity Description'] + " " + activity_df['Vendor'] + " " + activity_df['Comment']
X = activity_df[combined_text_column_name]  # Feature
y = activity_df[encoded_label_column_name]  # Assuming 'label' is already encoded as numeric labels