Task: try to classify lines of Latin poetry by their metrical scheme.

In [None]:
import os
import pandas as pd
from lxml import etree
import re

def parse_xml_to_dataframe(xml_folder="/content/drive/MyDrive/ScannedPoetry"):  # default colab path
    """Build a text/label DataFrame from a folder of scanned-poetry XML files.

    Every file in ``xml_folder`` whose name ends in ``.xml`` is parsed, and
    each ``<line>`` element found under ``<body>`` becomes one candidate row.
    The row text is the line's ``<word>`` texts joined by single spaces,
    lower-cased, with every character other than ``a``-``z`` and whitespace
    removed. A line is kept only when it still has text, its ``pattern``
    attribute is not ``"not scanned"``, and it has a non-empty ``name``
    attribute.

    Args:
        xml_folder: Directory that directly contains the XML files.

    Returns:
        A DataFrame with columns ``"Text"`` (cleaned line text) and
        ``"Meter"`` (integer id of the line's ``meter`` attribute; ids are
        assigned in order of first appearance).
    """
    non_letters = re.compile(r'[^a-z\s]')  # compiled once, reused for every line
    data = []

    # os.listdir returns names in arbitrary order; sorting keeps the row order,
    # and therefore the first-appearance label ids below, reproducible.
    for filename in sorted(os.listdir(xml_folder)):
        if not filename.endswith(".xml"):
            continue
        root = etree.parse(os.path.join(xml_folder, filename)).getroot()

        for line in root.xpath(".//body/line"):
            words = " ".join(word.text for word in line.xpath(".//word") if word.text)
            words = non_letters.sub('', words.lower())
            if not words.strip():
                # Nothing but whitespace survived cleaning: no text to classify.
                continue

            meter = line.get("meter", "")
            # NOTE(review): the label comes from "meter" but presence is checked
            # on "name" -- confirm that "name" is the intended attribute.
            if line.get("pattern") != "not scanned" and line.get("name"):
                data.append([words, meter])

    df = pd.DataFrame(data, columns=["Text", "Meter"])

    # Encode each distinct meter string as an integer, numbered by first appearance.
    meters = df["Meter"].tolist()
    meter_to_id = {meter: idx for idx, meter in enumerate(dict.fromkeys(meters))}
    df["Meter"] = [meter_to_id[meter] for meter in meters]
    return df

# Folder containing the XML files
xml_folder = r"\Users\abhin\OneDrive\Documents\LIN371\ScannedPoetry"

# Generate DataFrame
df = parse_xml_to_dataframe(xml_folder)
# head must be called: printing the bare attribute shows the bound method,
# not the first rows.
print(df.head())
# Class distribution of the integer-encoded meters.
print(df['Meter'].value_counts())


Rebalance the data: resample every class (with replacement) to the same size, chosen so that the total number of rows stays about the same as in the original.

In [None]:
from sklearn.utils import resample
#####SMALL DF#####



# Give every class an equal share of the original row count. Floor division
# means the balanced frame can be slightly smaller than the original.
total_rows = df.shape[0]
unique_classes = df['Meter'].nunique()
target_rows_per_class = total_rows // unique_classes

# Draw each class's share with replacement, so small classes are duplicated
# and large classes are thinned to the same size.
frames = [
    resample(
        df.loc[df['Meter'] == label],
        replace=True,
        n_samples=target_rows_per_class,
        random_state=42,
    )
    for label in df['Meter'].unique()
]

# Stack the per-class samples, then shuffle with a fixed seed so classes are
# interleaved and the index is renumbered from zero.
df_balanced = pd.concat(frames)
df_small = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Sanity check: every class should report the same count.
print(df_small['Meter'].value_counts())
print(df_small.head())
print(f"Total rows in the new balanced DataFrame: {len(df_small)}")

Upsample the data so that every class has as many rows as the largest class.

In [None]:
from sklearn.utils import resample
###UPSCALED DF###



# Per-class row counts, computed once and reused below.
class_counts = df['Meter'].value_counts()
majority_class = class_counts.idxmax()
max_count = class_counts[majority_class]
minority_classes = class_counts[class_counts < max_count]

# Keep every original row and add, per minority class, only the shortfall to
# the largest class. Drawing a full max_count on top of the original rows
# would leave each minority class with count + max_count rows, not max_count.
# NOTE(review): duplicates are created before any train/test split, so copies
# of one line can end up on both sides of a later split.
frames = [df]
for cls, count in minority_classes.items():
    minority = df[df['Meter'] == cls]
    minority_upsampled = resample(
        minority,
        replace=True,  # Sample with replacement
        n_samples=int(max_count - count),  # Top up to the majority class size
        random_state=42
    )
    frames.append(minority_upsampled)

# Combine back into a balanced dataset
df_upscaled = pd.concat(frames)

print(df_upscaled['Meter'].value_counts())
print(df_upscaled.head())

Define a scorer for cross-validation that prints a classification report for each fold and returns the fold's accuracy.

In [None]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print a classification report and return the accuracy.

    Intended to be wrapped with ``make_scorer`` so that each
    cross-validation fold prints its own report while the fold's accuracy
    is what gets collected as the score.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        The value of ``accuracy_score(y_true, y_pred)``.
    """
    report = classification_report(y_true, y_pred)
    print(report)
    return accuracy_score(y_true, y_pred)

Implement the train-test split with a fixed random state for reproducibility, and vectorize the text as bag-of-words counts.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.datasets import load_files
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm


def split_data(df, test_size=0.2, random_state=42):
    """Split a text/label frame and turn the text into word-count features.

    Args:
        df: DataFrame with a ``"Text"`` column (documents) and a ``"Meter"``
            column (labels).
        test_size: Share of the rows held out for testing; passed to
            ``train_test_split``.
        random_state: Seed for the shuffle, so repeated calls give the same
            split.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` values are the
        ``CountVectorizer`` document-term matrices and the ``y`` values are
        the matching slices of ``df["Meter"]``.
    """
    # split the data into train and test
    X = df["Text"]
    y = df["Meter"]
    docs_train, docs_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The vocabulary is learned from the training documents only; the test
    # documents are merely projected onto it.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(docs_train)
    X_test = vectorizer.transform(docs_test)

    return X_train, X_test, y_train, y_test



Naive Bayes Implementation:

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report


# Hold out part of the upsampled data, then fit multinomial Naive Bayes on
# the word-count features of the remainder.
X_train, X_test, y_train, y_test = split_data(df_upscaled)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Accuracy on the held-out split (stored, not printed).
y_hat_nb = nb_model.predict(X_test)
accuracy_score_nb = metrics.accuracy_score(y_test, y_hat_nb)

# 10-fold cross-validation on the training split; the scorer prints a
# classification report per fold and yields that fold's accuracy.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(nb_model, X_train, y_train, cv=10, scoring=report_scorer)
print(scores.mean(), scores.std())

Random Forest Implementation:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report



# Same held-out split as the other classical models (split_data is seeded).
X_train, X_test, y_train, y_test = split_data(df_upscaled)

# Seed the forest as well: every other random step in this notebook is pinned
# to 42, and an unseeded forest makes the scores below change between runs.
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42)
clf_rf.fit(X_train, y_train)
preds_rf = clf_rf.predict(X_test)  # held-out predictions (stored, not reported)


# 5-fold cross-validation on the training split; the scorer prints a
# classification report per fold and yields that fold's accuracy.
scores = cross_val_score(clf_rf, X_train, y_train, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores.mean(), scores.std())


Light GBM Implementation:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report




# Part 1: train a LightGBM booster through the native lgb.train API on the
# held-out split produced by split_data.
X_train, X_test, y_train, y_test = split_data(df_upscaled)

# Cast the count matrices to float32 before handing them to LightGBM.
X_train_new = X_train.astype('float32')
X_test_new = X_test.astype('float32')


train_data = lgb.Dataset(X_train_new, label=y_train)
# NOTE(review): test_data is built here but never passed to lgb.train in this
# cell -- confirm whether it was meant to be used as a validation set.
test_data = lgb.Dataset(X_test_new, label=y_test, reference=train_data)

params = {
    'objective': 'multiclass',  # Multiclass classification
    'num_class': len(df['Meter'].unique()),  # Number of classes
    'metric': 'multi_logloss',  # Evaluation metric for multiclass
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'verbosity': -1
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

# Predict on test set
y_pred = lgb_model.predict(X_test_new)  # Probabilities for each class
y_pred_labels = y_pred.argmax(axis=1)  # Convert probabilities to class labels

# Evaluate the model
# The accuracy is stored but not shown: the prints below are commented out.
accuracy = accuracy_score(y_test, y_pred_labels)
#print(classification_report(y_test, y_pred_labels))
#print()
#print(accuracy)



# Part 2: cross-validate the scikit-learn wrapper (LGBMClassifier).
# X_train and y_train are rebound below to ALL rows of df_upscaled, so the
# 10-fold cross-validation runs on the full upsampled frame, not on the
# training split created at the top of this cell.
model = LGBMClassifier(objective='multiclass', num_class=len(df['Meter'].unique()))
docs_train = df_upscaled["Text"]
y_train = df_upscaled["Meter"]

# NOTE(review): the vectorizer is fitted on every document before the folds
# are formed, so each fold's vocabulary includes its own validation rows.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs_train).astype('float32')

# The scorer prints a classification report per fold and yields its accuracy.
scores = cross_val_score(model, X_train, y_train, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores.mean(), scores.std())



Multilingual BERT implementation:

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch



# Load the multilingual model; the classification head gets one output per
# distinct meter label in the original frame.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(df['Meter'].unique()))


# Seeded like the other splits in this notebook so that the reported test
# accuracy refers to the same held-out rows on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_small["Text"].tolist(),
    df_small["Meter"].tolist(),
    test_size=0.2,
    random_state=42,
)


# Tokenize: pad each set to its longest sequence, truncating at 128 tokens,
# and return PyTorch tensors.
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset class
class SentimentDataset(Dataset):
    """Pair tokenizer encodings with class labels for use in a DataLoader.

    Despite the name, nothing here is sentiment-specific: the labels are
    whatever per-example values the caller supplies.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to a
            per-example indexable container, either tensors or nested lists.
        labels: Sequence of per-example labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as a fresh tensor that shares no storage with it."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the supported way to copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with a ``"labels"`` entry."""
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self._as_tensor_copy(self.labels[idx])
        return item

# Wrap the tokenized splits so a DataLoader can batch them.
train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler

# Data loaders (only the training batches are shuffled)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler
# One epoch count drives both the schedule length and the training loop; with
# two separate literals, changing one would leave the linear decay finishing
# too early or too late.
epochs = 3
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

from tqdm import tqdm

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training loop
progress_bar = tqdm(range(num_training_steps))

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # One optimizer step and one scheduler step per batch, then clear the
        # gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

from sklearn.metrics import accuracy_score

# Evaluate on the held-out batches with the model in eval mode and gradient
# tracking disabled.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # The predicted class is the index of the largest logit.
        all_preds.extend(logits.argmax(dim=-1).cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

# Per-class report followed by overall accuracy.
print(classification_report(all_labels, all_preds))
print()
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy}")

# Save the fine-tuned model and its tokenizer side by side.
model.save_pretrained("./bert_classifier")
tokenizer.save_pretrained("./bert_classifier")
