In [1]:

import datasets
import json
import os
import pandas as pd

# Module-level logger obtained from the `datasets` logging helpers;
# get_XBRL_tags uses it to warn about tag strings missing from its tag list.
logger = datasets.logging.get_logger(__name__)

# Dataset metadata for FiNER-139 (citation, description, archive name, homepage, version).
# NOTE(review): none of these constants is referenced by the notebook cells visible
# here; they look like leftovers from a dataset loading script — confirm before removing.

# BibTeX entry for the FiNER paper. Every author, including the last one, is written
# "Last, First" so that BibTeX parsers split family and given names correctly.
_CITATION = """@inproceedings{loukas-etal-2022-finer,
    title = "{FiNER: Financial Numeric Entity Recognition for XBRL Tagging}",
    author = "Loukas, Lefteris  and
      Fergadiotis, Manos  and
      Chalkidis, Ilias and
      Spyropoulou, Eirini and
      Malakasiotis, Prodromos  and
      Androutsopoulos, Ion and
      Paliouras, George",
    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics",
    month = "may",
    year = "2022",
    publisher = "Association for Computational Linguistics",
}"""

# Human-readable summary of the corpus.
_DESCRIPTION = """
FiNER-139 is a named entity recognition dataset consisting of 10K annual 
and quarterly English reports (filings) of publicly traded companies 
downloaded from the U.S. Securities and Exchange Commission (SEC) 
annotated with 139 XBRL tags in the IOB2 format.
"""

# Archive file name of the raw data; the loading cell visible here reads JSONL files instead.
_DATA_URL = "finer139.zip"

# Homepage string recorded for the dataset.
_HOMEPAGE = "http://nlp.cs.aueb.gr/"

# Dataset version string.
_VERSION = "1.0.0"

_LABELS = [
    "O",
    "B-AccrualForEnvironmentalLossContingencies",
    "B-AcquiredFiniteLivedIntangibleAssetsWeightedAverageUsefulLife",
    "I-AcquiredFiniteLivedIntangibleAssetsWeightedAverageUsefulLife",
    "B-AllocatedShareBasedCompensationExpense",
    "B-AmortizationOfFinancingCosts",
    "B-AmortizationOfIntangibleAssets",
    "I-AmortizationOfIntangibleAssets",
    "B-AntidilutiveSecuritiesExcludedFromComputationOfEarningsPerShareAmount",
    "I-AntidilutiveSecuritiesExcludedFromComputationOfEarningsPerShareAmount",
    "B-AreaOfRealEstateProperty",
    "I-AreaOfRealEstateProperty",
    "B-AssetImpairmentCharges",
    "B-BusinessAcquisitionEquityInterestsIssuedOrIssuableNumberOfSharesIssued",
    "B-BusinessAcquisitionPercentageOfVotingInterestsAcquired",
    "I-BusinessAcquisitionPercentageOfVotingInterestsAcquired",
    "B-BusinessCombinationAcquisitionRelatedCosts",
    "B-BusinessCombinationConsiderationTransferred1",
    "B-BusinessCombinationContingentConsiderationLiability",
    "B-BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedIntangibleAssetsOtherThanGoodwill",
    "B-BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedIntangibles",
    "B-CapitalizedContractCostAmortization",
    "B-CashAndCashEquivalentsFairValueDisclosure",
    "B-ClassOfWarrantOrRightExercisePriceOfWarrantsOrRights1",
    "B-CommonStockCapitalSharesReservedForFutureIssuance",
    "B-CommonStockDividendsPerShareDeclared",
    "B-CommonStockParOrStatedValuePerShare",
    "B-CommonStockSharesAuthorized",
    "I-CommonStockSharesAuthorized",
    "B-CommonStockSharesOutstanding",
    "B-ConcentrationRiskPercentage1",
    "B-ContractWithCustomerLiability",
    "B-ContractWithCustomerLiabilityRevenueRecognized",
    "B-CumulativeEffectOfNewAccountingPrincipleInPeriodOfAdoption",
    "B-DebtInstrumentBasisSpreadOnVariableRate1",
    "B-DebtInstrumentCarryingAmount",
    "B-DebtInstrumentConvertibleConversionPrice1",
    "B-DebtInstrumentFaceAmount",
    "I-DebtInstrumentFaceAmount",
    "B-DebtInstrumentFairValue",
    "B-DebtInstrumentInterestRateEffectivePercentage",
    "B-DebtInstrumentInterestRateStatedPercentage",
    "B-DebtInstrumentMaturityDate",
    "I-DebtInstrumentMaturityDate",
    "B-DebtInstrumentRedemptionPricePercentage",
    "B-DebtInstrumentTerm",
    "I-DebtInstrumentTerm",
    "B-DebtInstrumentUnamortizedDiscount",
    "B-DebtWeightedAverageInterestRate",
    "B-DeferredFinanceCostsGross",
    "B-DeferredFinanceCostsNet",
    "B-DefinedBenefitPlanContributionsByEmployer",
    "B-DefinedContributionPlanCostRecognized",
    "B-Depreciation",
    "B-DerivativeFixedInterestRate",
    "B-DerivativeNotionalAmount",
    "B-DisposalGroupIncludingDiscontinuedOperationConsideration",
    "B-EffectiveIncomeTaxRateContinuingOperations",
    "B-EffectiveIncomeTaxRateReconciliationAtFederalStatutoryIncomeTaxRate",
    "B-EmployeeServiceShareBasedCompensationNonvestedAwardsTotalCompensationCostNotYetRecognized",
    "B-EmployeeServiceShareBasedCompensationNonvestedAwardsTotalCompensationCostNotYetRecognizedPeriodForRecognition1",
    "I-EmployeeServiceShareBasedCompensationNonvestedAwardsTotalCompensationCostNotYetRecognizedPeriodForRecognition1",
    "B-EmployeeServiceShareBasedCompensationNonvestedAwardsTotalCompensationCostNotYetRecognizedShareBasedAwardsOtherThanOptions",
    "B-EmployeeServiceShareBasedCompensationTaxBenefitFromCompensationExpense",
    "B-EquityMethodInvestmentOwnershipPercentage",
    "I-EquityMethodInvestmentOwnershipPercentage",
    "B-EquityMethodInvestments",
    "B-FiniteLivedIntangibleAssetUsefulLife",
    "I-FiniteLivedIntangibleAssetUsefulLife",
    "B-GainsLossesOnExtinguishmentOfDebt",
    "B-Goodwill",
    "B-GoodwillImpairmentLoss",
    "B-GuaranteeObligationsMaximumExposure",
    "B-IncomeLossFromEquityMethodInvestments",
    "B-IncomeTaxExpenseBenefit",
    "B-InterestExpense",
    "B-InterestExpenseDebt",
    "B-LeaseAndRentalExpense",
    "B-LesseeOperatingLeaseRenewalTerm",
    "I-LesseeOperatingLeaseRenewalTerm",
    "B-LesseeOperatingLeaseTermOfContract",
    "I-LesseeOperatingLeaseTermOfContract",
    "B-LettersOfCreditOutstandingAmount",
    "B-LineOfCredit",
    "B-LineOfCreditFacilityCommitmentFeePercentage",
    "B-LineOfCreditFacilityCurrentBorrowingCapacity",
    "B-LineOfCreditFacilityInterestRateAtPeriodEnd",
    "B-LineOfCreditFacilityMaximumBorrowingCapacity",
    "B-LineOfCreditFacilityRemainingBorrowingCapacity",
    "B-LineOfCreditFacilityUnusedCapacityCommitmentFeePercentage",
    "B-LongTermDebt",
    "B-LongTermDebtFairValue",
    "B-LossContingencyAccrualAtCarryingValue",
    "B-LossContingencyDamagesSoughtValue",
    "B-LossContingencyEstimateOfPossibleLoss",
    "B-LossContingencyPendingClaimsNumber",
    "I-LossContingencyPendingClaimsNumber",
    "B-MinorityInterestOwnershipPercentageByNoncontrollingOwners",
    "B-MinorityInterestOwnershipPercentageByParent",
    "B-NumberOfOperatingSegments",
    "B-NumberOfRealEstateProperties",
    "I-NumberOfRealEstateProperties",
    "B-NumberOfReportableSegments",
    "B-OperatingLeaseCost",
    "B-OperatingLeaseExpense",
    "B-OperatingLeaseLiability",
    "B-OperatingLeasePayments",
    "B-OperatingLeaseRightOfUseAsset",
    "B-OperatingLeaseWeightedAverageDiscountRatePercent",
    "B-OperatingLeaseWeightedAverageRemainingLeaseTerm1",
    "I-OperatingLeaseWeightedAverageRemainingLeaseTerm1",
    "B-OperatingLeasesRentExpenseNet",
    "B-OperatingLossCarryforwards",
    "B-PaymentsToAcquireBusinessesGross",
    "B-PaymentsToAcquireBusinessesNetOfCashAcquired",
    "B-PreferredStockDividendRatePercentage",
    "B-PreferredStockSharesAuthorized",
    "I-PreferredStockSharesAuthorized",
    "B-ProceedsFromIssuanceOfCommonStock",
    "B-PropertyPlantAndEquipmentUsefulLife",
    "I-PropertyPlantAndEquipmentUsefulLife",
    "B-PublicUtilitiesRequestedRateIncreaseDecreaseAmount",
    "B-RelatedPartyTransactionAmountsOfTransaction",
    "I-RelatedPartyTransactionAmountsOfTransaction",
    "B-RelatedPartyTransactionExpensesFromTransactionsWithRelatedParty",
    "I-RelatedPartyTransactionExpensesFromTransactionsWithRelatedParty",
    "B-RepaymentsOfDebt",
    "B-RestructuringAndRelatedCostExpectedCost1",
    "B-RestructuringCharges",
    "B-RevenueFromContractWithCustomerExcludingAssessedTax",
    "B-RevenueFromContractWithCustomerIncludingAssessedTax",
    "B-RevenueFromRelatedParties",
    "B-RevenueRemainingPerformanceObligation",
    "B-Revenues",
    "B-SaleOfStockNumberOfSharesIssuedInTransaction",
    "I-SaleOfStockNumberOfSharesIssuedInTransaction",
    "B-SaleOfStockPricePerShare",
    "B-ShareBasedCompensation",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardAwardVestingPeriod1",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardAwardVestingPeriod1",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsGrantsInPeriod",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsGrantsInPeriod",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsGrantsInPeriodWeightedAverageGrantDateFairValue",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsNonvestedNumber",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsVestedInPeriodTotalFairValue",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardNumberOfSharesAuthorized",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardNumberOfSharesAuthorized",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardNumberOfSharesAvailableForGrant",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardOptionsExercisesInPeriodTotalIntrinsicValue",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardOptionsGrantsInPeriodGross",
    "B-ShareBasedCompensationArrangementByShareBasedPaymentAwardOptionsGrantsInPeriodWeightedAverageGrantDateFairValue",
    "B-SharePrice",
    "B-SharebasedCompensationArrangementBySharebasedPaymentAwardAwardVestingRightsPercentage",
    "I-SharebasedCompensationArrangementBySharebasedPaymentAwardAwardVestingRightsPercentage",
    "B-SharebasedCompensationArrangementBySharebasedPaymentAwardExpirationPeriod",
    "I-SharebasedCompensationArrangementBySharebasedPaymentAwardExpirationPeriod",
    "B-StockIssuedDuringPeriodSharesNewIssues",
    "I-StockIssuedDuringPeriodSharesNewIssues",
    "B-StockRepurchaseProgramAuthorizedAmount1",
    "B-StockRepurchaseProgramRemainingAuthorizedRepurchaseAmount1",
    "B-StockRepurchasedAndRetiredDuringPeriodShares",
    "B-StockRepurchasedDuringPeriodShares",
    "I-StockRepurchasedDuringPeriodShares",
    "B-SupplementalInformationForPropertyCasualtyInsuranceUnderwritersPriorYearClaimsAndClaimsAdjustmentExpense",
    "B-TreasuryStockAcquiredAverageCostPerShare",
    "B-TreasuryStockSharesAcquired",
    "I-TreasuryStockSharesAcquired",
    "B-TreasuryStockValueAcquiredCostMethod",
    "B-UnrecognizedTaxBenefits",
    "B-UnrecognizedTaxBenefitsThatWouldImpactEffectiveTaxRate",
    "I-DeferredFinanceCostsGross",
    "I-CommonStockParOrStatedValuePerShare",
    "I-LossContingencyEstimateOfPossibleLoss",
    "I-DefinedContributionPlanCostRecognized",
    "I-DebtInstrumentFairValue",
    "I-ContractWithCustomerLiabilityRevenueRecognized",
    "I-RevenueRemainingPerformanceObligation",
    "I-EmployeeServiceShareBasedCompensationNonvestedAwardsTotalCompensationCostNotYetRecognized",
    "I-DebtInstrumentInterestRateStatedPercentage",
    "I-OperatingLossCarryforwards",
    "I-MinorityInterestOwnershipPercentageByNoncontrollingOwners",
    "I-InterestExpense",
    "I-LongTermDebt",
    "I-ShareBasedCompensation",
    "I-DebtWeightedAverageInterestRate",
    "I-DebtInstrumentCarryingAmount",
    "I-DebtInstrumentConvertibleConversionPrice1",
    "I-IncomeTaxExpenseBenefit",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardOptionsGrantsInPeriodWeightedAverageGrantDateFairValue",
    "I-EmployeeServiceShareBasedCompensationNonvestedAwardsTotalCompensationCostNotYetRecognizedShareBasedAwardsOtherThanOptions",
    "I-EquityMethodInvestments",
    "I-DebtInstrumentUnamortizedDiscount",
    "I-GainsLossesOnExtinguishmentOfDebt",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardNumberOfSharesAvailableForGrant",
    "I-BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedIntangibleAssetsOtherThanGoodwill",
    "I-PreferredStockDividendRatePercentage",
    "I-RevenueFromContractWithCustomerIncludingAssessedTax",
    "I-OperatingLeaseWeightedAverageDiscountRatePercent",
    "I-LineOfCredit",
    "I-LineOfCreditFacilityMaximumBorrowingCapacity",
    "I-EffectiveIncomeTaxRateReconciliationAtFederalStatutoryIncomeTaxRate",
    "I-LineOfCreditFacilityCommitmentFeePercentage",
    "I-BusinessCombinationConsiderationTransferred1",
    "I-CommonStockDividendsPerShareDeclared",
    "I-DebtInstrumentBasisSpreadOnVariableRate1",
    "I-DisposalGroupIncludingDiscontinuedOperationConsideration",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardOptionsGrantsInPeriodGross",
    "I-CommonStockSharesOutstanding",
    "I-AmortizationOfFinancingCosts",
    "I-LineOfCreditFacilityCurrentBorrowingCapacity",
    "I-TreasuryStockValueAcquiredCostMethod",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsNonvestedNumber",
    "I-DebtInstrumentInterestRateEffectivePercentage",
    "I-SaleOfStockPricePerShare",
    "I-CapitalizedContractCostAmortization",
    "I-RestructuringCharges",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsVestedInPeriodTotalFairValue",
    "I-AccrualForEnvironmentalLossContingencies",
    "I-CashAndCashEquivalentsFairValueDisclosure",
    "I-ProceedsFromIssuanceOfCommonStock",
    "I-Revenues",
    "I-BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedIntangibles",
    "I-LettersOfCreditOutstandingAmount",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardEquityInstrumentsOtherThanOptionsGrantsInPeriodWeightedAverageGrantDateFairValue",
    "I-OperatingLeasePayments",
    "I-LineOfCreditFacilityRemainingBorrowingCapacity",
    "I-PaymentsToAcquireBusinessesGross",
    "I-TreasuryStockAcquiredAverageCostPerShare",
    "I-DeferredFinanceCostsNet",
    "I-StockRepurchaseProgramAuthorizedAmount1",
    "I-InterestExpenseDebt",
    "I-ContractWithCustomerLiability",
    "I-OperatingLeaseExpense",
    "I-Depreciation",
    "I-AllocatedShareBasedCompensationExpense",
    "I-LossContingencyAccrualAtCarryingValue",
    "I-LineOfCreditFacilityUnusedCapacityCommitmentFeePercentage",
    "I-SupplementalInformationForPropertyCasualtyInsuranceUnderwritersPriorYearClaimsAndClaimsAdjustmentExpense",
    "I-OperatingLeaseLiability",
    "I-RevenueFromRelatedParties",
    "I-PaymentsToAcquireBusinessesNetOfCashAcquired",
    "I-BusinessCombinationContingentConsiderationLiability",
    "I-LossContingencyDamagesSoughtValue",
    "I-NumberOfOperatingSegments",
    "I-BusinessAcquisitionEquityInterestsIssuedOrIssuableNumberOfSharesIssued",
    "I-OperatingLeaseRightOfUseAsset",
    "I-BusinessCombinationAcquisitionRelatedCosts",
    "I-UnrecognizedTaxBenefits",
    "I-GuaranteeObligationsMaximumExposure",
    "I-RestructuringAndRelatedCostExpectedCost1",
    "I-DefinedBenefitPlanContributionsByEmployer",
    "I-OperatingLeaseCost",
    "I-DerivativeFixedInterestRate",
    "I-Goodwill",
    "I-GoodwillImpairmentLoss",
    "I-CommonStockCapitalSharesReservedForFutureIssuance",
    "I-StockRepurchasedAndRetiredDuringPeriodShares",
    "I-EmployeeServiceShareBasedCompensationTaxBenefitFromCompensationExpense",
    "I-IncomeLossFromEquityMethodInvestments",
    "I-NumberOfReportableSegments",
    "I-LongTermDebtFairValue",
    "I-RepaymentsOfDebt",
    "I-ConcentrationRiskPercentage1",
    "I-DebtInstrumentRedemptionPricePercentage",
    "I-CumulativeEffectOfNewAccountingPrincipleInPeriodOfAdoption",
    "I-SharePrice",
    "I-UnrecognizedTaxBenefitsThatWouldImpactEffectiveTaxRate",
    "I-ShareBasedCompensationArrangementByShareBasedPaymentAwardOptionsExercisesInPeriodTotalIntrinsicValue",
    "I-EffectiveIncomeTaxRateContinuingOperations",
    "I-RevenueFromContractWithCustomerExcludingAssessedTax",
    "I-StockRepurchaseProgramRemainingAuthorizedRepurchaseAmount1",
    "I-LineOfCreditFacilityInterestRateAtPeriodEnd",
    "I-ClassOfWarrantOrRightExercisePriceOfWarrantsOrRights1",
    "I-OperatingLeasesRentExpenseNet",
    "I-LeaseAndRentalExpense",
    "I-PublicUtilitiesRequestedRateIncreaseDecreaseAmount",
    "I-MinorityInterestOwnershipPercentageByParent",
    "I-AssetImpairmentCharges",
    "I-DerivativeNotionalAmount",
]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three FiNER-139 splits from JSON Lines files (lines=True: one record per line).
# The directory is spelled once so the three paths cannot drift apart.
FINER_DATA_DIR = "assets/datasets/finer-139"

train_set = pd.read_json(f"{FINER_DATA_DIR}/train.jsonl", lines=True)
test_set = pd.read_json(f"{FINER_DATA_DIR}/test.jsonl", lines=True)
val_set = pd.read_json(f"{FINER_DATA_DIR}/validation.jsonl", lines=True)


KeyboardInterrupt: 

In [12]:
# Record, in a constant "label_split" column, which split each frame holds.
for split_label, split_frame in (
    ("train", train_set),
    ("test", test_set),
    ("validation", val_set),
):
    split_frame["label_split"] = split_label

def get_XBRL_tags(ner_tags_list, tag_list=_LABELS):
    """Collect the non-'O' tags of one example together with their positions.

    Args:
        ner_tags_list: Iterable of tag strings, one per token.
        tag_list: Collection of accepted tag strings (defaults to ``_LABELS``).

    Returns:
        A ``(tags, positions)`` pair of equally long lists: every entry of
        ``ner_tags_list`` that is not ``'O'`` and occurs in ``tag_list``, in
        order of appearance, and its index within ``ner_tags_list``.

    Tags that are neither ``'O'`` nor contained in ``tag_list`` are skipped
    after a warning is logged.
    """
    non_zero_tags = []
    tag_indices = []
    for index, tag in enumerate(ner_tags_list):
        if tag == 'O':
            continue
        if tag not in tag_list:
            logger.warning(f"Tag {tag} not found in tag list.")
            continue
        non_zero_tags.append(tag)
        tag_indices.append(index)

    return non_zero_tags, tag_indices


# For each split, run get_XBRL_tags on every row's ner_tags and store the two
# results as the "tag_names" and "tag_indices" columns.
for split_frame in (train_set, test_set, val_set):
    per_row_results = split_frame["ner_tags"].apply(get_XBRL_tags)
    # zip(*...) transposes the per-row (tags, positions) pairs into two columns.
    names_column, indices_column = zip(*per_row_results)
    split_frame["tag_names"] = names_column
    split_frame["tag_indices"] = indices_column

In [28]:
# Convert the pandas DataFrames to HuggingFace Datasets (this cell does not upload).

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 60,
    "Converting to HuggingFace Dataset Format",
    "=" * 60,
    sep="\n",
)

from datasets import Dataset, DatasetDict, Features, Sequence, Value

# Extract the export columns of a split as plain Python lists.
def df_to_hf_dataset(
    df,
    columns=("id", "tokens", "ner_tags", "tag_names", "tag_indices", "label_split"),
):
    """Return the selected columns of ``df`` as a dict of lists.

    Despite its name, this helper does not build a ``datasets.Dataset``: it
    returns the column-oriented dict that the caller passes to
    ``Dataset.from_dict``.

    Args:
        df: DataFrame holding one row per example.
        columns: Names of the columns to export, in output order. Defaults to
            the six columns this notebook publishes.

    Returns:
        ``{column_name: list_of_values}`` with one entry per name in ``columns``.

    Raises:
        KeyError: If ``df`` lacks one of the requested columns.
    """
    return {column: df[column].tolist() for column in columns}

# Create datasets: turn each split's frame into column lists, then into a Dataset.
converted_columns = {}
hf_splits = {}
for split_name, split_frame in (
    ("train", train_set),
    ("test", test_set),
    ("validation", val_set),
):
    print(f"Converting {split_name} set...")
    converted_columns[split_name] = df_to_hf_dataset(split_frame)
    hf_splits[split_name] = Dataset.from_dict(converted_columns[split_name])

# Bind the per-split names that the rest of the notebook refers to.
train_data = converted_columns["train"]
test_data = converted_columns["test"]
val_data = converted_columns["validation"]
train_dataset = hf_splits["train"]
test_dataset = hf_splits["test"]
val_dataset = hf_splits["validation"]

# Combine into DatasetDict (keys: "train", "test", "validation")
dataset_dict = DatasetDict(hf_splits)

# Report the split sizes and column names. The check marks are written as real
# "✓" characters (they had been stored as mis-decoded byte sequences).
print("\n✓ Dataset conversion complete!")
print(f"  Train: {len(train_dataset)} examples")
print(f"  Test: {len(test_dataset)} examples")
print(f"  Validation: {len(val_dataset)} examples")
print(f"\n✓ Features: {dataset_dict['train'].column_names}")

# Display sample: the first training example, with tokens and tags cut to 15 entries.
print("\n" + "="*60)
print("Sample from dataset:")
print("="*60)
sample = train_dataset[0]
print(f"ID: {sample['id']}")
print(f"Tokens: {sample['tokens'][:15]}")
print(f"NER Tags: {sample['ner_tags'][:15]}")
print(f"Tag Names: {sample['tag_names']}")
print(f"Tag Indices: {sample['tag_indices']}")


Converting to HuggingFace Dataset Format
Converting train set...
Converting test set...
Converting validation set...

✓ Dataset conversion complete!
  Train: 900384 examples
  Test: 108378 examples
  Validation: 112494 examples

✓ Features: ['id', 'tokens', 'ner_tags', 'tag_names', 'tag_indices', 'label_split']

Sample from dataset:
ID: 0
Tokens: ['ITEM', '1', 'Financial', 'Statements', 'Lennar', 'Corporation', 'and', 'Subsidiaries', 'Condensed', 'Consolidated', 'Balance', 'Sheets', '(', 'Dollars', 'in']
NER Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tag Names: ['B-EquityMethodInvestments', 'B-EquityMethodInvestments']
Tag Indices: [180, 293]


In [29]:
# Upload guide for the HuggingFace Hub.
# This cell only PRINTS instructions and a push snippet; it uploads nothing itself.

# CONFIGURATION - Update these before running
# Defined first so that every printed instruction below shows the same repository.
REPO_ID = "volavion/finer-139-std"  # Replace with your actual repo
PRIVATE = False  # Set to True for private dataset

print("\n" + "="*60)
print("Upload to HuggingFace Hub")
print("="*60)

print("""
STEP 1: Install and authenticate HuggingFace CLI
======================================================
# Install if not already installed
pip install huggingface-hub

# Authenticate with your HuggingFace token
huggingface-cli login
# or programmatically:
from huggingface_hub import login
login()
""")

from huggingface_hub import login

print("\nSTEP 2: Configure your dataset repo name")
print("=" * 60)
print(f"""
Choose a repo name format:
  - finer-139-standardized (recommended)
  - financial-ner-xbrl-139
  - finer139-with-tags

Update REPO_ID at the top of this cell with your username:
  REPO_ID = "{REPO_ID}"
""")

print(f"\nConfigured repository: {REPO_ID}")
print(f"Privacy: {'Private' if PRIVATE else 'Public'}")

print("\nSTEP 3: Push dataset to Hub")
print("=" * 60)

# The snippet is an f-string: {REPO_ID} and {PRIVATE} are filled in now, while the
# doubled braces around "e" survive as a literal placeholder in the printed code.
push_code = f'''
# Push to HuggingFace Hub
try:
    # Uncomment after setting REPO_ID with your username
    # login()  # Authenticate first

    dataset_dict.push_to_hub(
        repo_id="{REPO_ID}",
        private={PRIVATE},
        commit_message="Upload standardized FiNER-139 dataset with XBRL tags"
    )
    print("✓ Dataset uploaded successfully!")
    print(f"✓ View at: https://huggingface.co/datasets/{REPO_ID}")

except Exception as e:
    print(f"✗ Upload failed: {{e}}")
    print("Please check:")
    print("  1. REPO_ID is set correctly (<username>/<dataset-name>)")
    print("  2. You are authenticated: huggingface-cli login")
    print("  3. Your token has write permissions")
'''

print(push_code)

print("\nSTEP 4: Verify upload")
print("=" * 60)
# Show the configured repository so the check loads what STEP 3 pushed.
print(f"""
# Test loading your dataset
from datasets import load_dataset
dataset = load_dataset("{REPO_ID}")
print(dataset)
""")

print("\nDATASET STATISTICS:")
print("=" * 60)
print(f"  Total examples: {len(train_dataset) + len(test_dataset) + len(val_dataset)}")
print(f"  Train split: {len(train_dataset)} examples")
print(f"  Test split: {len(test_dataset)} examples")
print(f"  Validation split: {len(val_dataset)} examples")
print(f"  Features: {', '.join(dataset_dict['train'].column_names)}")
# _LABELS holds "O" plus B-/I- variants, so count distinct tag names separately
# from the raw number of labels.
xbrl_tag_names = {label[2:] for label in _LABELS if label != "O"}
print(f"  XBRL tags: {len(xbrl_tag_names)} tag types ({len(_LABELS)} IOB2 labels including 'O')")

print("\n✓ Ready to upload! Execute the code above with your HuggingFace token.")


Upload to HuggingFace Hub

STEP 1: Install and authenticate HuggingFace CLI
# Install if not already installed
pip install huggingface-hub

# Authenticate with your HuggingFace token
huggingface-cli login
# or programmatically:


STEP 2: Configure your dataset repo name

Choose a repo name format:
  - finer-139-standardized (recommended)
  - financial-ner-xbrl-139
  - finer139-with-tags

Update the repo_id below with your username:
  repo_id = "volavion/finer-139-std"


Configured repository: volavion/finer-139-std
Privacy: Public

STEP 3: Push dataset to Hub

# Push to HuggingFace Hub
try:
    # Uncomment after setting REPO_ID with your username
    # login()  # Authenticate first

    dataset_dict.push_to_hub(
        repo_id="volavion/finer-139-std",
        private=False,
        commit_message="Upload standardized FiNER-139 dataset with XBRL tags"
    )
    print("✓ Dataset uploaded successfully!")
    print(f"✓ View at: https://huggingface.co/datasets/volavion/finer-139-std"

In [30]:
# Execute: Upload Dataset to HuggingFace Hub using Token
import os
from pathlib import Path

# Load KEY=VALUE pairs from the project's .env file into os.environ without
# printing them; the upload code in this cell reads its HuggingFace token from there.
def load_env_file(path, environ=None):
    """Copy ``KEY=VALUE`` lines of a dotenv-style file into ``environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored. A
    leading ``export `` is dropped, and one pair of matching single or double
    quotes around the value is removed. Keys that already exist in ``environ``
    are left untouched.

    Args:
        path: Location of the file; a missing file is silently skipped.
        environ: Mutable mapping to update. Defaults to ``os.environ``.
    """
    if environ is None:
        environ = os.environ
    path = Path(path)
    if not path.exists():
        return
    for raw_line in path.read_text().splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        if line.startswith("export "):
            # Shell-style "export KEY=VALUE" would otherwise yield the key "export KEY".
            line = line[len("export "):]
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            # KEY="secret" must yield secret, not "secret" with the quotes attached.
            value = value[1:-1]
        if key and key not in environ:
            environ[key] = value


# The .env file sits in the parent directory when the kernel runs from "notebooks/",
# otherwise in the current working directory.
env_path = Path.cwd().parent / ".env" if Path.cwd().name == "notebooks" else Path.cwd() / ".env"
load_env_file(env_path)

print("\n" + "="*70)
print("UPLOADING DATASET TO HUGGINGFACE HUB")
print("="*70)

from huggingface_hub import login
import getpass

# Token lookup: keep this notebook's own variable name first, then fall back to the
# names huggingface_hub documents (HF_TOKEN and the older HUGGING_FACE_HUB_TOKEN).
hf_token = (
    os.environ.get("HUGGINGFACE_HUB_TOKEN")
    or os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or ""
).strip()

# Configuration
REPO_ID = "Volavion/finer-139-std"
PRIVATE = False

print("\n📊 Dataset Details:")
print(f"  Repo ID: {REPO_ID}")
print(f"  Privacy: {'🔒 Private' if PRIVATE else '🌐 Public'}")
print(f"  Train examples: {len(train_dataset)}")
print(f"  Test examples: {len(test_dataset)}")
print(f"  Validation examples: {len(val_dataset)}")
print(f"  Total: {len(train_dataset) + len(test_dataset) + len(val_dataset)} examples")

# No separate login call is made; the token (if any) is handed to push_to_hub.
# Only the source of the credentials is reported, never the token itself.
print("\n🔐 Step 1: Authenticating with HuggingFace...")
if hf_token:
    print("  Using the token found in the environment")
else:
    print("  No token in the environment; relying on credentials stored by `huggingface-cli login`")


try:
    # Push dataset to hub
    dataset_dict.push_to_hub(
        repo_id=REPO_ID,
        private=PRIVATE,
        commit_message="Upload standardized FiNER-139 dataset with XBRL tags\n\n- 139 XBRL financial tags\n- Train/Test/Validation splits\n- Token labels and tag names included",
        token=hf_token or None
    )

    print("\n✅ SUCCESS! Dataset uploaded to HuggingFace Hub")
    print(f"\n🔗 Dataset URL: https://huggingface.co/datasets/{REPO_ID}")
    print("\n📝 Dataset card has been created. You can edit it on HuggingFace Hub.")

except Exception as e:
    # Deliberately best-effort: the failure is reported with hints and the
    # notebook keeps running instead of re-raising.
    print("\n❌ Upload failed with error:")
    print(f"  {type(e).__name__}: {e}")
    print("\n💡 Troubleshooting:")
    print("  1. Check your HuggingFace token is valid")
    print("  2. Ensure you have write permissions")
    print(f"  3. The repo_id '{REPO_ID}' must match your username")
    print("  4. Try: huggingface-cli login")

print("\n" + "="*70)


UPLOADING DATASET TO HUGGINGFACE HUB

📊 Dataset Details:
  Repo ID: Volavion/finer-139-std
  Privacy: 🌐 Public
  Train examples: 900384
  Test examples: 108378
  Validation examples: 112494
  Total: 1121256 examples

🔐 Step 1: Authenticating with HuggingFace...


Creating parquet from Arrow format: 100%|██████████| 4/4 [00:02<00:00,  1.77ba/s]
Processing Files (1 / 1): 100%|██████████| 38.2MB / 38.2MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:02<00:00,  1.75ba/s]
Processing Files (1 / 1): 100%|██████████| 38.1MB / 38.1MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 2/2 [00:07<00:00,  3.89s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.84ba/s]
Processing Files (1 / 1): 100%|██████████| 9.77MB / 9.77MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.48s/ shards]
Creating parquet from Arrow format: 100%|‚ñà‚


✅ SUCCESS! Dataset uploaded to HuggingFace Hub

🔗 Dataset URL: https://huggingface.co/datasets/Volavion/finer-139-std

📝 Dataset card has been created. You can edit it on HuggingFace Hub.



In [31]:
# Keep only the rows whose tag_names list is non-empty, i.e. rows with at least one tag.
train_set_shortened = train_set[train_set["tag_names"].map(len) > 0]
test_set_shortened = test_set[test_set["tag_names"].map(len) > 0]
val_set_shortened = val_set[val_set["tag_names"].map(len) > 0]

# Convert pandas DataFrames to HuggingFace Dataset and upload to Hub

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 60,
    "Converting to HuggingFace Dataset Format",
    "=" * 60,
    sep="\n",
)

from datasets import Dataset, DatasetDict, Features, Sequence, Value

# Extract the export columns of a split as plain Python lists.
def df_to_hf_dataset(
    df,
    columns=("id", "tokens", "ner_tags", "tag_names", "tag_indices", "label_split"),
):
    """Return the selected columns of ``df`` as a dict of lists.

    Despite its name, this helper does not build a ``datasets.Dataset``: it
    returns the column-oriented dict that the caller passes to
    ``Dataset.from_dict``.

    Args:
        df: DataFrame holding one row per example.
        columns: Names of the columns to export, in output order. Defaults to
            the six columns this notebook publishes.

    Returns:
        ``{column_name: list_of_values}`` with one entry per name in ``columns``.

    Raises:
        KeyError: If ``df`` lacks one of the requested columns.
    """
    return {column: df[column].tolist() for column in columns}

# Create datasets from the filtered frames: column lists first, then a Dataset per split.
# NOTE: this rebinds train_dataset / test_dataset / val_dataset / dataset_dict,
# replacing whatever an earlier cell stored under those names.
converted_columns = {}
hf_splits = {}
for split_name, split_frame in (
    ("train", train_set_shortened),
    ("test", test_set_shortened),
    ("validation", val_set_shortened),
):
    print(f"Converting {split_name} set...")
    converted_columns[split_name] = df_to_hf_dataset(split_frame)
    hf_splits[split_name] = Dataset.from_dict(converted_columns[split_name])

# Bind the per-split names that the rest of the notebook refers to.
train_data = converted_columns["train"]
test_data = converted_columns["test"]
val_data = converted_columns["validation"]
train_dataset = hf_splits["train"]
test_dataset = hf_splits["test"]
val_dataset = hf_splits["validation"]

# Combine into DatasetDict (keys: "train", "test", "validation")
dataset_dict = DatasetDict(hf_splits)


Converting to HuggingFace Dataset Format
Converting train set...
Converting test set...
Converting validation set...


In [None]:
# Upload standardized dataset to HuggingFace Hub
# CONFIGURATION - Update these before running
# Target dataset repository; the push_to_hub call later in this cell reads it.
REPO_ID = "Volavion/finer-139-xbrl-nonempty"  # Replace with your actual repo
# Forwarded to push_to_hub(private=...) later in this cell.
PRIVATE = False  # Set to True for private dataset

# Execute: Upload Dataset to HuggingFace Hub using Token
import os
from pathlib import Path

# Load KEY=VALUE pairs from the project's .env file into os.environ without
# printing them; the upload code in this cell reads its HuggingFace token from there.
def load_env_file(path, environ=None):
    """Copy ``KEY=VALUE`` lines of a dotenv-style file into ``environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored. A
    leading ``export `` is dropped, and one pair of matching single or double
    quotes around the value is removed. Keys that already exist in ``environ``
    are left untouched.

    Args:
        path: Location of the file; a missing file is silently skipped.
        environ: Mutable mapping to update. Defaults to ``os.environ``.
    """
    if environ is None:
        environ = os.environ
    path = Path(path)
    if not path.exists():
        return
    for raw_line in path.read_text().splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        if line.startswith("export "):
            # Shell-style "export KEY=VALUE" would otherwise yield the key "export KEY".
            line = line[len("export "):]
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            # KEY="secret" must yield secret, not "secret" with the quotes attached.
            value = value[1:-1]
        if key and key not in environ:
            environ[key] = value


# The .env file sits in the parent directory when the kernel runs from "notebooks/",
# otherwise in the current working directory.
env_path = Path.cwd().parent / ".env" if Path.cwd().name == "notebooks" else Path.cwd() / ".env"
load_env_file(env_path)

print("\n" + "="*70)
print("UPLOADING DATASET TO HUGGINGFACE HUB")
print("="*70)

from huggingface_hub import login
import getpass

# Token lookup: keep this notebook's own variable name first, then fall back to the
# names huggingface_hub documents (HF_TOKEN and the older HUGGING_FACE_HUB_TOKEN).
hf_token = (
    os.environ.get("HUGGINGFACE_HUB_TOKEN")
    or os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or ""
).strip()


# NOTE(review): train_dataset / test_dataset / val_dataset / dataset_dict are whatever
# the most recently executed conversion cell produced — presumably the filtered
# (non-empty) splits here; confirm the execution order before uploading.
print("\n📊 Dataset Details:")
print(f"  Repo ID: {REPO_ID}")
print(f"  Privacy: {'🔒 Private' if PRIVATE else '🌐 Public'}")
print(f"  Train examples: {len(train_dataset)}")
print(f"  Test examples: {len(test_dataset)}")
print(f"  Validation examples: {len(val_dataset)}")
print(f"  Total: {len(train_dataset) + len(test_dataset) + len(val_dataset)} examples")

# No separate login call is made; the token (if any) is handed to push_to_hub.
# Only the source of the credentials is reported, never the token itself.
print("\n🔐 Step 1: Authenticating with HuggingFace...")
if hf_token:
    print("  Using the token found in the environment")
else:
    print("  No token in the environment; relying on credentials stored by `huggingface-cli login`")


try:
    # Push dataset to hub; the commit message names the filtered subset so the
    # repository history is not confused with the full-dataset upload.
    dataset_dict.push_to_hub(
        repo_id=REPO_ID,
        private=PRIVATE,
        commit_message="Upload FiNER-139 subset limited to examples with at least one XBRL tag\n\n- 139 XBRL financial tags\n- Train/Test/Validation splits\n- Token labels and tag names included",
        token=hf_token or None
    )

    print("\n✅ SUCCESS! Dataset uploaded to HuggingFace Hub")
    print(f"\n🔗 Dataset URL: https://huggingface.co/datasets/{REPO_ID}")
    print("\n📝 Dataset card has been created. You can edit it on HuggingFace Hub.")

except Exception as e:
    # Deliberately best-effort: the failure is reported with hints and the
    # notebook keeps running instead of re-raising.
    print("\n❌ Upload failed with error:")
    print(f"  {type(e).__name__}: {e}")
    print("\n💡 Troubleshooting:")
    print("  1. Check your HuggingFace token is valid")
    print("  2. Ensure you have write permissions")
    print(f"  3. The repo_id '{REPO_ID}' must match your username")
    print("  4. Try: huggingface-cli login")

print("\n" + "="*70)



UPLOADING DATASET TO HUGGINGFACE HUB

📊 Dataset Details:
  Repo ID: Volavion/finer-139-xbrl-nonempty
  Privacy: 🌐 Public
  Train examples: 179195
  Test examples: 18789
  Validation examples: 21603
  Total: 219587 examples

🔐 Step 1: Authenticating with HuggingFace...


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:01<00:00,  1.86ba/s]
Processing Files (1 / 1): 100%|██████████| 17.4MB / 17.4MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.47s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  6.24ba/s]
Processing Files (1 / 1): 100%|██████████| 2.03MB / 2.03MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.43 shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.62ba/s]
Processing Files (1 / 1): 100%|██████████| 2.33MB / 2.33MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|‚ñà‚ñà‚ñà‚


✅ SUCCESS! Dataset uploaded to HuggingFace Hub

🔗 Dataset URL: https://huggingface.co/datasets/Volavion/finer-139-xbrl-nonempty

📝 Dataset card has been created. You can edit it on HuggingFace Hub.

