In [11]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# The statements below are order-dependent: the Colab auth call runs first,
# then credentials are attached to the GoogleAuth object, and only then is the
# GoogleDrive client built from it.
auth.authenticate_user()  # google.colab helper; presumably triggers the Colab sign-in flow -- TODO confirm
gauth = GoogleAuth()
# Attach the application-default credentials obtained via oauth2client.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the data-loading cell uses to fetch files by id.
drive = GoogleDrive(gauth)

In [12]:
import pandas as pd

# Training split: look up the Drive file by id, save its content under a local
# filename, then parse that file with pandas.
train_csv_name = 'business_activities_training_data.csv'
activity_data = drive.CreateFile({'id': '1smUkjfTvvHmx2QiCfDImKOOtdCSb4dTk'})
activity_data.GetContentFile(train_csv_name)
activity_df = pd.read_csv(train_csv_name)
print(activity_df.head())  # preview of the first training rows

# Test split: same three steps; no preview is printed for this frame.
test_csv_name = 'business_activities_test_data.csv'
activity_test_data = drive.CreateFile({'id': '1Q6Sm-lxT-cpOJpFeLg_Leyj3w28Ff9GY'})
activity_test_data.GetContentFile(test_csv_name)
test_df = pd.read_csv(test_csv_name)


                       business_activity_description      Cost       Vendor  \
0           Activities related to other aquaculture.  23657.15   BioHarvest   
1  Activities related to fruit and tree nut combi...  15475.83     AgriCorp   
2           Activities related to sugarcane farming.  37953.61  FreshFields   
3               Activities related to sheep farming.  14592.37      CropGen   
4     Activities related to floriculture production.   1879.17     AgriCorp   

                                             Comment  \
0         Procured for other aquaculture operations.   
1  Procured for fruit and tree nut combination fa...   
2         Procured for sugarcane farming operations.   
3             Procured for sheep farming operations.   
4   Procured for floriculture production operations.   

                         2017 NAICS Title  
0                       Other Aquaculture  
1  Fruit and Tree Nut Combination Farming  
2                       Sugarcane Farming  
3           

In [13]:
import pandas as pd
import numpy as np
import random

# Label vocabulary: every title that can appear in the "2017 NAICS Title"
# column of the generated data. Order is preserved from the original list.
naics_titles = [
    # Crop-related titles
    "Soybean Farming",
    "Oilseed (except Soybean) Farming",
    "Dry Pea and Bean Farming",
    "Wheat Farming",
    "Corn Farming",
    "Rice Farming",
    "Oilseed and Grain Combination Farming",
    "All Other Grain Farming",
    "Potato Farming",
    "Other Vegetable (except Potato) and Melon Farming",
    "Orange Groves",
    "Citrus (except Orange) Groves",
    "Apple Orchards",
    "Grape Vineyards",
    "Strawberry Farming",
    "Berry (except Strawberry) Farming",
    "Tree Nut Farming",
    "Fruit and Tree Nut Combination Farming",
    "Other Noncitrus Fruit Farming",
    "Mushroom Production",
    "Other Food Crops Grown Under Cover",
    "Nursery and Tree Production",
    "Floriculture Production",
    "Tobacco Farming",
    "Cotton Farming",
    "Sugarcane Farming",
    "Hay Farming",
    "Sugar Beet Farming",
    "Peanut Farming",
    "All Other Miscellaneous Crop Farming",
    # Animal-related titles
    "Beef Cattle Ranching and Farming",
    "Cattle Feedlots",
    "Dairy Cattle and Milk Production",
    "Dual-Purpose Cattle Ranching and Farming",
    "Hog and Pig Farming",
    "Chicken Egg Production",
    "Broilers and Other Meat Type Chicken Production",
    "Turkey Production",
    "Poultry Hatcheries",
    "Other Poultry Production",
    "Sheep Farming",
    "Goat Farming",
    "Finfish Farming and Fish Hatcheries",
    "Shellfish Farming",
    "Other Aquaculture",
    "Apiculture",
    "Horses and Other Equine Production",
    "Fur-Bearing Animal and Rabbit Production",
    "All Other Animal Production",
]

# Made-up vendor names; one is picked at random for each generated record.
vendors = [
    "AgriCorp",
    "BioHarvest",
    "CropGen",
    "EcoFarms",
    "FreshFields",
]

def generate_cost():
    """Return a random cost.

    The value is drawn uniformly between 1000 and 50000 and rounded to
    two decimal places.
    """
    raw_cost = random.uniform(1000, 50000)
    return round(raw_cost, 2)

def generate_comment(naics_title):
    """Return the procurement comment for `naics_title`.

    The title is lower-cased and embedded in a fixed sentence, so the
    comment always agrees with the record's label.
    """
    lowered_title = naics_title.lower()
    return f"Procured for {lowered_title} operations."

def generate_data(naics_titles, num_records=5000):
    """Build a DataFrame of synthetic, error-free activity records.

    Args:
        naics_titles: sequence of title strings to draw labels from.
        num_records: number of rows to generate (default 5000).

    Returns:
        A pandas DataFrame with one row per record and the columns
        "business_activity_description", "Cost", "Vendor", "Comment" and
        "2017 NAICS Title".
    """
    def _one_record():
        # Pick the label first; the description and comment are derived from it.
        title = random.choice(naics_titles)
        return {
            "business_activity_description": f"Activities related to {title.lower()}.",
            "Cost": generate_cost(),
            "Vendor": random.choice(vendors),
            "Comment": generate_comment(title),
            "2017 NAICS Title": title,
        }

    return pd.DataFrame([_one_record() for _ in range(num_records)])

# Seed Python's RNG so the generated records are identical on every run.
# Without this, the fixed random_state on the split below does not make the
# saved CSVs reproducible, because the underlying data changes each time.
random.seed(42)

# Generate the datasets
data = generate_data(naics_titles, 6000)

# Split into training and test sets.
# n=5000 yields exactly 5000 training rows and 1000 test rows; the previous
# frac=0.833 rounded to 4998 / 1002, contradicting the intended sizes.
train_data = data.sample(n=5000, random_state=42)  # 5000 records for training
test_data = data.drop(train_data.index)  # Remaining 1000 records for testing

# Save to CSV files (row index omitted).
train_data.to_csv('business_activities_training_data_enhanced.csv', index=False)
test_data.to_csv('business_activities_test_data_enhanced.csv', index=False)


In [15]:
import pandas as pd
import numpy as np
import random

# Helper functions
def introduce_minor_errors(text):
    """Occasionally sprinkle a few typo-like edits into `text`.

    With probability 0.15 the text is altered by 2 or 3 small edits, each
    chosen at random from: substituting a character via a fixed lookup table,
    omitting a character, or swapping a character with its right neighbour.
    The first and last characters are never selected as the edit position.
    Editing stops early once the text has 4 or fewer characters. Otherwise
    the text is returned unchanged.
    """
    # Most calls fall through here and leave the text untouched.
    if random.random() >= 0.15:
        return text

    # Lookup table for the 'substitute' edit (mimics common typing slips).
    neighbour_keys = {'a': 's', 's': 'a', 'd': 'f', 'i': 'o', 'o': 'p', 'e': 'r', 'r': 't'}

    target_edits = random.randint(2, 3)
    applied_edits = 0

    while applied_edits < target_edits and len(text) > 4:
        edit_kind = random.choice(['substitute', 'omit', 'swap'])
        # Position is in [1, len - 2], so a right-hand neighbour always exists.
        pos = random.randint(1, len(text) - 2)
        head, char, tail = text[:pos], text[pos], text[pos + 1:]

        if edit_kind == 'substitute':
            replacement = neighbour_keys.get(char)
            if replacement is None:
                # Character has no mapping: nothing changes and the attempt
                # does not count towards the edit total.
                continue
            text = head + replacement + tail
        elif edit_kind == 'omit':
            text = head + tail
        else:
            # 'swap': exchange the chosen character with the one after it.
            text = head + tail[0] + char + tail[1:]

        applied_edits += 1

    return text

def introduce_major_errors(text):
    """Occasionally replace `text` with a scrambled version of itself.

    With probability 0.15 the characters of `text` are shuffled; half of
    those times the shuffled string is also prefixed with "Irrelevant text ".
    Otherwise the text is returned unchanged.
    """
    # Most calls fall through here and leave the text untouched.
    if random.random() >= 0.15:
        return text

    # Coin flip: plain scramble, or scramble with an unrelated prefix.
    prefix = "" if random.random() < 0.5 else "Irrelevant text "
    shuffled_chars = random.sample(text, len(text))
    return prefix + ''.join(shuffled_chars)

def generate_data(naics_titles, num_records=5000):
    """Build a DataFrame of synthetic activity records with noisy descriptions.

    Note: this redefines the `generate_data` from the earlier cell; after this
    cell runs, calls to `generate_data` use this noisy variant.

    Args:
        naics_titles: sequence of title strings to draw labels from.
        num_records: number of rows to generate (default 5000).

    Returns:
        A pandas DataFrame with the columns "business_activity_description",
        "Cost", "Vendor", "Comment" and "2017 NAICS Title". Only the
        description is passed through an error routine; the comment and label
        are always derived from the clean title.
    """
    rows = []
    for _ in range(num_records):
        title = random.choice(naics_titles)
        clean_description = f"Activities related to {title.lower()}."

        # A fair coin decides which error routine sees this description; the
        # routine itself decides whether to actually alter the text.
        if random.random() < 0.5:
            corrupt = introduce_minor_errors
        else:
            corrupt = introduce_major_errors
        noisy_description = corrupt(clean_description)

        rows.append({
            "business_activity_description": noisy_description,
            "Cost": generate_cost(),
            "Vendor": random.choice(vendors),
            "Comment": generate_comment(title),
            "2017 NAICS Title": title,
        })
    return pd.DataFrame(rows)

# Seed Python's RNG so the generated records (and the errors injected into
# them) are identical on every run. Without this, the fixed random_state on
# the split below does not make the saved CSVs reproducible.
random.seed(42)

# Generate the datasets with errors introduced
data_with_errors = generate_data(naics_titles, 6000)

# Split into training and test sets.
# n=5000 yields exactly 5000 training rows and 1000 test rows; the previous
# frac=0.833 rounded to 4998 / 1002, contradicting the intended sizes.
train_data_with_errors = data_with_errors.sample(n=5000, random_state=42)  # 5000 records for training
test_data_with_errors = data_with_errors.drop(train_data_with_errors.index)  # Remaining 1000 records for testing

# Save to CSV files (row index omitted).
train_data_with_errors.to_csv('business_activities_training_data_with_errors.csv', index=False)
test_data_with_errors.to_csv('business_activities_test_data_with_errors.csv', index=False)
