# The core task seems to be creating a rule-based system for language processing, with a particular emphasis on:

1) Negation Detection


Identifying negative statements
Finding negated terms in text
Working specifically with Spanish or Catalan language contexts


2) Key Objectives


Develop a systematic way to parse and analyze text
Create explicit rules for understanding language patterns
Handle language-specific linguistic nuances
Identify specific types of words or phrases (like negations, uncertainties)


3) Specific Focus Areas


Medical text analysis (suggested by the NegEx reference)
Handling special language characteristics:

Accented characters
Grammatical agreements
Verb conjugations
Contractions




# The Main Challenge
Develop a rule-based system that can:


Tokenize text

Apply linguistic rules

Detect specific language patterns

Work accurately with Spanish or Catalan language structures


In [3]:
import re

def separate_clitics(text):
    # Regex pattern to identify and separate Catalan clitics (regex made by not me, so we might need fixin)
    clitic_pattern = r'(\w+)-(m\'|t\'|s\'|li|ho|ne)'

    # Separate verb and clitics
    separated_text = re.sub(clitic_pattern, r'\1 \2', text)

    return {
        'original': text,
        'separated': separated_text,
        'clitics_detected': re.findall(clitic_pattern, text)
    }

# Example usage
text = "Mira-m'ho, el llibre és interessant."
result = separate_clitics(text)
print(result)

{'original': "Mira-m'ho, el llibre és interessant.", 'separated': "Mira m'ho, el llibre és interessant.", 'clitics_detected': [('Mira', "m'")]}


In [4]:
class UncertaintyDetector:
    def __init__(self, language='catalan'):
        self.uncertainty_words = {
            'catalan': ['possiblement', 'potser', 'probablement'],
            'spanish': ['posiblemente', 'quizás', 'probablemente']
        }.get(language, [])

    def detect_uncertainty(self, text):
        # Tokenize text
        tokens = text.split()
        uncertain_phrases = []

        for i, token in enumerate(tokens):
            if token.lower() in self.uncertainty_words:
                # Capture context around uncertainty
                start = max(0, i - 2)
                end = min(len(tokens), i + 3)
                uncertain_phrases.append({
                    'uncertainty_marker': token,
                    'context': ' '.join(tokens[start:end])
                })

        return uncertain_phrases

# Example usage
detector = UncertaintyDetector(language='catalan')
text = "Possiblement la teoria necessita més investigació. Potser hi ha altres factors a considerar."
results = detector.detect_uncertainty(text)
print(results)

[{'uncertainty_marker': 'Possiblement', 'context': 'Possiblement la teoria'}, {'uncertainty_marker': 'Potser', 'context': 'més investigació. Potser hi ha'}]


In [5]:
class AgreementChecker:
    def __init__(self, language='spanish'):
        self.articles = {
            'masculine': ['el', 'un', 'este', 'ese'],
            'feminine': ['la', 'una', 'esta', 'esa']
        }

    def check_gender_agreement(self, sentence):
        tokens = sentence.split()
        agreements = []

        for i in range(len(tokens) - 2):
            # Check article and noun
            if tokens[i] in self.articles['masculine']:
                expected_gender = 'masculine'
            elif tokens[i] in self.articles['feminine']:
                expected_gender = 'feminine'
            else:
                continue

            # Simple gender detection (not comprehensive)
            if expected_gender == 'masculine' and tokens[i+1].endswith('o'):
                agreements.append({
                    'status': 'correct',
                    'article': tokens[i],
                    'noun': tokens[i+1]
                })
            elif expected_gender == 'feminine' and tokens[i+1].endswith('a'):
                agreements.append({
                    'status': 'correct',
                    'article': tokens[i],
                    'noun': tokens[i+1]
                })
            else:
                agreements.append({
                    'status': 'incorrect',
                    'article': tokens[i],
                    'noun': tokens[i+1]
                })

        return agreements

# Example usage
checker = AgreementChecker()
sentences = [
    "El coche rojo está estacionado.",
    "La casa grande está limpia."
]

for sentence in sentences:
    results = checker.check_gender_agreement(sentence)
    print(f"Sentence: {sentence}")
    print("Agreements:", results)

Sentence: El coche rojo está estacionado.
Agreements: []
Sentence: La casa grande está limpia.
Agreements: []


In [6]:
def check_accent_errors(text, language='spanish'):
    # Define correct accent patterns for the language (Idk nothing about this language part, accualy just asked chatgpt to make the rules, so we need to work on this)
    accent_rules = {
        'spanish': {
            'á': ['a'], 'é': ['e'], 'í': ['i'], 'ó': ['o'], 'ú': ['u']
        },
        'catalan': {
            'à': ['a'], 'è': ['e'], 'é': ['e'], 'í': ['i'],
            'ò': ['o'], 'ó': ['o'], 'ú': ['u']
        }
    }

    # Tokenize and check each word
    errors = []
    words = text.split()

    for word in words:
        for accented_char, base_chars in accent_rules.get(language, {}).items():
            if accented_char in word:
                # Check if the base character is correct
                base_correct = any(base_char in word.lower().replace(accented_char, '')
                                   for base_char in base_chars)

                if not base_correct:
                    errors.append({
                        'word': word,
                        'incorrect_accent': accented_char
                    })

    return errors

# Example usage
text = "Estó es una pruéba de acentos incorrectos."
results = check_accent_errors(text, language='spanish')
print(results)

[{'word': 'Estó', 'incorrect_accent': 'ó'}, {'word': 'pruéba', 'incorrect_accent': 'é'}]
