In [33]:
import pandas as pd
import re

def extract_experience(text):
    # Mapping words to numbers for 0-20 and some common multiples of ten
    words_to_numbers = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7,
        'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen':14,
        'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19, 'twenty': 20,
        'thirty': 30, 'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90
    }

    # Helper function to convert word to number
    def word_to_num(word):
        return words_to_numbers.get(word.lower(), None)

    # Define regex patterns for various ways of expressing years of experience
    number_or_word = r'\b(\d+|' + '|'.join(words_to_numbers.keys()) + r')\b'

    patterns = [
        r'\b(\d+|' + '|'.join(words_to_numbers.keys()) + r')\s*(?:years?|year)\s+of\s+(?:relevant\s+)?(?:work\s+)?experience\b',
        r"\b(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:years?|year)\s+of\s+(?:relevant\s+)?(?:work\s+)?experience\b",
        r"\b(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years)\s+experience\s+required\b",
        r"\b(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years)\s+minimum\s+experience\b",
        r"\b(?:at\s+least|minimum)\s+(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years)\s+of\s+(?:relevant\s+)?(?:work\s+)?experience\b",
        #r"\b(?:more\s+than|\over)\s+(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years)\s+of\s+(?:relevant\s+)?(?:work\s+)?experience\b",
        r"\b(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years)\s+of\s+professional\s+experience\b",
        r"\b(?:experience\s+of\s+(\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years))\b",  # Added a new pattern
        r"\b(?:\d+|" + "|".join(words_to_numbers.keys()) + r")\s*(?:year|years)\s+required\b",  # Added a new pattern
    ]

    # Combine patterns into one regex
    combined_pattern = '|'.join(f"({pattern})" for pattern in patterns)

    # Compile the regex
    regex = re.compile(combined_pattern, re.IGNORECASE)

    # Find all matches
    matches = regex.findall(text)

    # Extract the years from matches
    experience_years = -1
    for match in matches:
        for item in match:
            if item.isdigit() and (experience_years == -1 or int(item) < experience_years):
                experience_years = int(item)
            else:
                num = word_to_num(item)
                if num is not None and (experience_years == -1 or num < experience_years):
                    experience_years = num

    return experience_years

# Load data and preprocess
df = pd.read_excel("SpaceIndividuals_GNSS.xlsx")
columns = ["Título", "OfferDescription", "Requirements", "Responsibilities"]
descriptions = df[columns].copy().apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).tolist()
input = [desc for desc in descriptions]

# Extract experience and annotate DataFrame
requiredYears = []
classified = 0
for text in input:
    textExperience = extract_experience(text)
    if textExperience != -1:
        classified += 1
    requiredYears.append(textExperience)
df["RequiredExperience"] = requiredYears
df.to_excel("SpaceIndividuals_GNSS.xlsx")

# Calculate and print classification statistics
percentage = (classified / len(requiredYears)) * 100
print(f"Classified: {percentage:.2f}%")
print(f"Undefined: {(100 - percentage):.2f}%")

print("\n ----------------------------- \n")
print("Document Annotated")

Classified: 7.41%
Undefined: 92.59%

 ----------------------------- 

Document Annotated
