In [1]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# Order matters: the Colab login runs first, then the application-default
# credentials are fetched and attached to the GoogleAuth object before the
# GoogleDrive client is built from it.
auth.authenticate_user()
gauth = GoogleAuth()
# NOTE(review): presumably these are the credentials produced by the Colab
# login above — confirm against the google.colab / oauth2client docs.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client handle used to fetch files by Drive ID.
drive = GoogleDrive(gauth)

In [2]:
import pandas as pd

def load_drive_csv(drive_client, file_id, local_path):
    """Download a Drive file to disk and read it as a CSV.

    Args:
        drive_client: Object exposing ``CreateFile({'id': ...})`` whose result
            has ``GetContentFile(path)`` (here, the notebook's ``drive`` client).
        file_id: ID of the Drive file to download.
        local_path: Local filename the content is written to and then read from.

    Returns:
        tuple: ``(drive_file, dataframe)`` — the Drive file handle and the
        ``pandas.DataFrame`` parsed from ``local_path``.
    """
    drive_file = drive_client.CreateFile({'id': file_id})
    drive_file.GetContentFile(local_path)
    return drive_file, pd.read_csv(local_path)


# Training data: download, load, and preview the first rows.
activity_data, activity_df = load_drive_csv(
    drive,
    '1smUkjfTvvHmx2QiCfDImKOOtdCSb4dTk',
    'business_activities_training_data.csv',
)
print(activity_df.head())

# Test data: same download-and-read steps, different Drive file.
activity_test_data, test_df = load_drive_csv(
    drive,
    '1Q6Sm-lxT-cpOJpFeLg_Leyj3w28Ff9GY',
    'business_activities_test_data.csv',
)


                       business_activity_description      Cost       Vendor  \
0  Activities related to finfish farming and fish...  19824.14      CropGen   
1  Activities related to dairy cattle and milk pr...  36390.68      CropGen   
2               Activities related to orange groves.  18685.28   BioHarvest   
3  Activities related to finfish farming and fish...  23104.34      CropGen   
4           Activities related to shellfish farming.  18025.01  FreshFields   

                                             Comment  \
0  Procured for finfish farming and fish hatcheri...   
1  Procured for dairy cattle and milk production ...   
2             Procured for orange groves operations.   
3  Procured for finfish farming and fish hatcheri...   
4         Procured for shellfish farming operations.   

                      2017 NAICS Title  
0  Finfish Farming and Fish Hatcheries  
1     Dairy Cattle and Milk Production  
2                        Orange Groves  
3  Finfish Farming and F

In [5]:
import pandas as pd
import numpy as np
import random

# NAICS titles used as the pool of label values for the synthetic records.
# Each generated record picks one title at random; the title is stored in the
# "2017 NAICS Title" column and its lower-cased form is embedded in the
# description and comment text. The list covers crop-farming titles followed
# by animal-production titles. Element order affects which title a seeded
# random draw selects, so do not reorder casually.
naics_titles = [
    "Soybean Farming", "Oilseed (except Soybean) Farming", "Dry Pea and Bean Farming", "Wheat Farming",
    "Corn Farming", "Rice Farming", "Oilseed and Grain Combination Farming", "All Other Grain Farming",
    "Potato Farming", "Other Vegetable (except Potato) and Melon Farming", "Orange Groves", "Citrus (except Orange) Groves",
    "Apple Orchards", "Grape Vineyards", "Strawberry Farming", "Berry (except Strawberry) Farming",
    "Tree Nut Farming", "Fruit and Tree Nut Combination Farming", "Other Noncitrus Fruit Farming",
    "Mushroom Production", "Other Food Crops Grown Under Cover", "Nursery and Tree Production",
    "Floriculture Production", "Tobacco Farming", "Cotton Farming", "Sugarcane Farming", "Hay Farming",
    "Sugar Beet Farming", "Peanut Farming", "All Other Miscellaneous Crop Farming",
    "Beef Cattle Ranching and Farming", "Cattle Feedlots", "Dairy Cattle and Milk Production",
    "Dual-Purpose Cattle Ranching and Farming", "Hog and Pig Farming", "Chicken Egg Production",
    "Broilers and Other Meat Type Chicken Production", "Turkey Production", "Poultry Hatcheries",
    "Other Poultry Production", "Sheep Farming", "Goat Farming", "Finfish Farming and Fish Hatcheries",
    "Shellfish Farming", "Other Aquaculture", "Apiculture", "Horses and Other Equine Production",
    "Fur-Bearing Animal and Rabbit Production", "All Other Animal Production"
]

# Sample vendor names for the simulation; each record's "Vendor" value is
# drawn uniformly at random from this list, independently of the NAICS title.
vendors = ["AgriCorp", "BioHarvest", "CropGen", "EcoFarms", "FreshFields"]

# Function to generate a random cost
def generate_cost(min_cost=1000, max_cost=50000):
    """Draw a random cost rounded to two decimal places.

    Args:
        min_cost: Lower bound of the sampling range (default 1000).
        max_cost: Upper bound of the sampling range (default 50000).

    Returns:
        float: A value from ``random.uniform(min_cost, max_cost)`` rounded to
        2 decimals. The draw uses the module-level ``random`` generator, so
        call ``random.seed(...)`` beforehand for reproducible values.
    """
    return round(random.uniform(min_cost, max_cost), 2)

# Build the free-text comment attached to a record
def generate_comment(naics_title):
    """Return the procurement comment sentence for a NAICS title.

    The title is lower-cased and inserted into a fixed sentence template.
    """
    lowered_title = naics_title.lower()
    return f"Procured for {lowered_title} operations."

# Generate synthetic data
def generate_data(naics_titles, num_records=5000, seed=None):
    """Build a DataFrame of synthetic business-activity records.

    Each record picks a random NAICS title and a random vendor, and derives
    the description and comment text from the lower-cased title.

    Args:
        naics_titles: Sequence of title strings to sample from.
        num_records: Number of rows to generate (default 5000).
        seed: Optional seed. When given, the module-level ``random`` generator
            is re-seeded before sampling so the output is reproducible. This
            affects later ``random`` calls too. ``None`` (default) leaves the
            generator state untouched.

    Returns:
        pandas.DataFrame: One row per record with columns
        ``business_activity_description``, ``Cost``, ``Vendor``, ``Comment``
        and ``2017 NAICS Title``. The columns are present even when
        ``num_records`` is 0.
    """
    columns = [
        "business_activity_description",
        "Cost",
        "Vendor",
        "Comment",
        "2017 NAICS Title",
    ]
    if seed is not None:
        random.seed(seed)

    records = []
    for _ in range(num_records):
        naics_title = random.choice(naics_titles)
        records.append({
            "business_activity_description": f"Activities related to {naics_title.lower()}.",
            "Cost": generate_cost(),
            "Vendor": random.choice(vendors),
            "Comment": generate_comment(naics_title),
            "2017 NAICS Title": naics_title
        })
    # Explicit columns keep the schema (and CSV header) when no rows are produced.
    return pd.DataFrame(records, columns=columns)

# Generate the datasets. Seed the module-level RNG first so the synthetic
# records, and therefore the saved CSV files, are identical on every run.
random.seed(42)
data = generate_data(naics_titles, 6000)

# Split into training and test sets. An explicit row count gives exactly
# 5000/1000; a fractional split such as frac=0.833 rounds to 4998 rows.
train_data = data.sample(n=5000, random_state=42)  # 5000 records for training
test_data = data.drop(train_data.index)  # Remaining 1000 records for testing

# Guard the split sizes so a change to the record count is caught immediately.
assert len(train_data) == 5000 and len(test_data) == 1000
assert len(train_data) + len(test_data) == len(data)

train_data.to_csv('business_activities_training_data_enhanced.csv', index=False)
test_data.to_csv('business_activities_test_data_enhanced.csv', index=False)
