In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tabulate import tabulate

# Function to read word pairs from a text file
def read_word_pairs(file_path, encoding='utf-8'):
    """Read singular/plural word pairs from a comma-separated text file.

    Every non-blank line must hold exactly two comma-separated fields:
    the singular form followed by the plural form. Blank lines are
    skipped and whitespace around each field is removed.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so that accented words decode the same way on every platform.

    Returns:
        A list of ``(singular, plural)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            comma-separated fields.
    """
    word_pairs = []
    with open(file_path, 'r', encoding=encoding) as file:
        for line_number, line in enumerate(file, start=1):
            entry = line.strip()
            if not entry:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            fields = [field.strip() for field in entry.split(',')]
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected "
                    f"'singular,plural' but got {entry!r}"
                )
            singular, plural = fields
            word_pairs.append((singular, plural))
    return word_pairs

# Define function to prepare and train the model
def train_pluralisation_model(word_pairs):
    """Fit a classifier that predicts the plural suffix of a singular noun.

    Args:
        word_pairs: Iterable of ``(singular, plural)`` string pairs.

    Returns:
        A fitted scikit-learn ``Pipeline`` (``DictVectorizer`` followed by a
        ``RandomForestClassifier``) whose ``predict`` takes a list of feature
        dicts and returns the predicted suffix strings.

    Raises:
        IndexError: If any singular word is the empty string.
    """
    df = pd.DataFrame(word_pairs, columns=['singular', 'plural'])

    # Same extractor as predict_plural, so training and inference features
    # cannot drift apart.
    df['features'] = df['singular'].apply(_extract_features)

    # The label is whatever follows the first len(singular) characters of the
    # plural.
    # NOTE(review): this assumes the plural starts with the singular; a pair
    # whose stem changes would get a misleading label - confirm the data.
    df['suffix'] = df.apply(lambda row: row['plural'][len(row['singular']):], axis=1)

    X = df['features'].tolist()
    y = df['suffix']

    pipeline = Pipeline([
        ('vectoriser', DictVectorizer(sparse=False)),
        # A bit of brute force with 1000 estimators.
        ('classifier', RandomForestClassifier(n_estimators=1000, random_state=23)),
    ])

    # NOTE(review): 20% of the pairs are held out but never evaluated. The
    # split is kept as-is so the fitted model stays identical; only the
    # unused test halves are discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=23)
    pipeline.fit(X_train, y_train)

    return pipeline


def _extract_features(word):
    """Return the word-ending features shared by training and prediction.

    Args:
        word: Non-empty singular noun.

    Returns:
        A dict with the last one, two and three characters of ``word`` and
        its length. Words shorter than the slice simply yield the whole word.

    Raises:
        IndexError: If ``word`` is empty.
    """
    return {
        'last_letter': word[-1],
        'last_two_letters': word[-2:],
        'last_three_letters': word[-3:],
        'length': len(word)
    }


# Function to predict plural forms
def predict_plural(model, singular_word):
    """Predict the plural form of ``singular_word``.

    Args:
        model: Fitted object whose ``predict`` accepts a list of feature
            dicts and returns a sequence of suffix strings, such as the
            pipeline returned by ``train_pluralisation_model``.
        singular_word: Non-empty singular noun.

    Returns:
        ``singular_word`` with the predicted suffix appended.

    Raises:
        IndexError: If ``singular_word`` is empty.
    """
    features = _extract_features(singular_word)
    # predict() works on batches; wrap the single sample and unwrap the result.
    predicted_suffix = model.predict([features])[0]
    return singular_word + predicted_suffix

# Read the word pairs from the text file
word_pairs = read_word_pairs('aragonese_word_pairs.txt')

# Fit the suffix classifier on the loaded pairs
model = train_pluralisation_model(word_pairs)

# Singular nouns used to try the model out
test_words = [
    'concordau',
    'chicolat',
    'chunta',
    'eclix',
    'ferfet',
    'bal',
    'banquet',
    'clot',
    'banvanau',
    'lau',
    'crau',
    'glet',
    'felix',
]
# Run every prediction before printing anything
predicted_plurals = [predict_plural(model, noun) for noun in test_words]

# Show each "singular - plural" pair followed by an empty line
for singular, plural in zip(test_words, predicted_plurals):
    print(f"{singular} - {plural}", end='\n\n')

concordau - concordaus

chicolat - chicolatz

chunta - chuntas

eclix - eclixes

ferfet - ferfetz

bal - bals

banquet - banquetz

clot - clotz

banvanau - banvanaus

lau - laus

crau - craus

glet - gletz

felix - felixes

