In [None]:
import nltk
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

nltk.download('brown')
nltk.download('wordnet')

In [2]:
lemmatizer = WordNetLemmatizer()
brown_to_custom = {
    'CS': 'CST',   # Potentially CST or CJT, to be determined by context
    'DT': 'DT',    # Determiner
    'WDT': 'WPR',  # Relative pronoun
    'QL': 'RB',    # Adverb
    'CS-HL': 'CST',
    'CS-NC': 'CST',
    'DT-HL': 'DT',
    'DT-NC': 'DT',
    'DT-TL': 'DT',
    'WPS': 'WPR',
    'WPS-HL': 'WPR',
    'WPS-NC': 'WPR',
    'WPS-TL': 'WPR'
}

def is_verb_tag(tag):
    return tag.startswith('V') or tag in ['BE', 'HV', 'DO']

def is_noun_tag(tag):
    return tag.startswith('N') or tag in ['PPS', 'PPO']

def prepare_brown_corpus_for_that(output_file):
    training_data = []
    category_counts = defaultdict(int)
    total_that_count = 0

    for sent in brown.tagged_sents():
        sentence_tokens = []
        contains_relevant_that = False
        for i, (word, tag) in enumerate(sent):
            lemma = lemmatizer.lemmatize(word.lower())
            custom_tag = brown_to_custom.get(tag, tag)
            
            if custom_tag == 'CST':  # This includes 'CS', 'CS-HL', 'CS-NC'
                prev_tag = sent[i-1][1] if i > 0 else ''
                if is_verb_tag(prev_tag):
                    custom_tag = 'CJT'
                # If it's not a verb, it remains 'CST'
            
            if word.lower() == 'that' and custom_tag in ['WPR', 'CST', 'CJT', 'DT', 'RB']:
                category_counts[custom_tag] += 1
                total_that_count += 1
                contains_relevant_that = True
            
            sentence_tokens.append(f"{word}\t{custom_tag}\t{lemma}")
        
        if contains_relevant_that:
            training_data.extend(sentence_tokens)
            training_data.append("")

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write("\n".join(training_data))

    print(f"Corpus d'entraînement sauvegardé dans : {output_file}")
    print(f"Nombre total d'occurrences de 'that': {total_that_count}")
    print("Nombre d'occurrences de 'that' pour chaque catégorie :")
    for category, count in category_counts.items():
        print(f"{category}: {count}")

output_file = "corpus_brown_that_custom.txt"
prepare_brown_corpus_for_that(output_file)

Corpus d'entraînement sauvegardé dans : corpus_brown_that_custom.txt
Nombre total d'occurrences de 'that': 10457
Nombre d'occurrences de 'that' pour chaque catégorie :
CST: 3831
CJT: 2636
WPR: 1662
DT: 2272
RB: 56
