In [25]:
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example
import random
import time
import glob

# -------------------------------
# 1️⃣ Load multiple txt files
# -------------------------------
# Collect the tab-separated question files. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them: the row order of the
# combined frame decides which rows survive the [:5000] cut further down.
file_paths = sorted(glob.glob("/content/*.txt"))  # folder containing your txt files
if not file_paths:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError("No .txt files found matching /content/*.txt")

df_list = []

for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, sep="\t", encoding="utf-8")
    except UnicodeDecodeError:
        # Retry with latin1 for files that are not valid UTF-8.
        df = pd.read_csv(file_path, sep="\t", encoding="latin1")
    # Keep only the two columns used for training and drop rows where either
    # one is missing.
    df = df[["Question", "DifficultyFromAnswerer"]].dropna()
    df_list.append(df)

# Combine all files into one DataFrame with a fresh 0..n-1 index.
df_all = pd.concat(df_list, ignore_index=True)
print(f"Loaded {len(df_all)} questions from {len(file_paths)} files.")

# -------------------------------
# 2️⃣ Prepare training data
# -------------------------------
# Cap the training set at the first 5000 rows of the combined frame (this is
# the head of the data, not a random sample) to keep fine-tuning fast.
train_texts = df_all["Question"].iloc[:5000].tolist()
train_labels = df_all["DifficultyFromAnswerer"].iloc[:5000].tolist()

# Label inventory comes from the whole frame, in order of first appearance.
labels = df_all["DifficultyFromAnswerer"].drop_duplicates().tolist()
print("Difficulty labels:", labels)

# -------------------------------
# 3️⃣ Load pre-trained English model and add textcat
# -------------------------------
# Start from the small pre-trained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Reuse the pipeline's text classifier when it already has one; otherwise
# append a new one at the end of the pipeline.
if nlp.has_pipe("textcat"):
    textcat = nlp.get_pipe("textcat")
else:
    textcat = nlp.add_pipe("textcat", last=True)

# Register every difficulty class with the classifier.
for difficulty in labels:
    textcat.add_label(difficulty)

# -------------------------------
# 4️⃣ Prepare training examples
# -------------------------------
def _one_hot(gold):
    """Map every known label to True only when it equals *gold*."""
    return {name: name == gold for name in labels}


# One Example per (question, difficulty) pair: a tokenized-only Doc plus the
# "cats" annotation for the classifier.
examples = [
    Example.from_dict(nlp.make_doc(question), {"cats": _one_hot(gold)})
    for question, gold in zip(train_texts, train_labels)
]

# -------------------------------
# 5️⃣ Initialize the textcat component
# -------------------------------
def _get_examples():
    """Callback handed to initialize(); returns the prepared examples."""
    return examples


textcat.initialize(_get_examples)

# -------------------------------
# 6️⃣ Fine-tune model
# -------------------------------
n_iter = 40
optimizer = nlp.resume_training()  # resume training for pre-trained model
start_time = time.time()

# The training examples carry only "cats" annotations, so the other
# components of the pre-trained pipeline have nothing to learn from them.
# Enable just the text classifier while updating: this keeps the optimizer
# away from the pre-trained components' weights and skips their forward and
# backward passes. select_pipes restores the full pipeline on exit, so the
# predictions below still run every component.
with nlp.select_pipes(enable="textcat"):
    for i in range(n_iter):
        # Reshuffle each epoch so the batches differ between iterations.
        random.shuffle(examples)
        losses = {}  # nlp.update fills this in, keyed by component name
        # Batch size follows the compounding(4.0, 32.0, 1.5) schedule; the
        # schedule is recreated here, so it restarts every epoch.
        batches = minibatch(examples, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
        print(f"Iteration {i+1}/{n_iter} - Loss: {losses['textcat']:.4f}")

print(f"\nFine-tuning completed in {time.time() - start_time:.2f} seconds.")

# -------------------------------
# 7️⃣ Hardcoded questions for prediction
# -------------------------------
# Fixed set of questions used as a quick smoke test of the trained classifier.
questions_to_predict = [
    "how many parts does chronicles of narnia have",
    "where does funding for the military come from",
    "who is the founder of alex and ani",
    "who wrote between the devil and the deep blue sea",
    "how many miles out is considered international waters",
    "what season does burke come back in grey's anatomy",
    "who sings this is the end of the innocence",
    "what is gecko's name on pj masks",
    "where does the young and the restless film",
    "who played lady capulet in romeo and juliet 1996",
    "who sang on don't leave me this way",
    "star trek next generation episode picard falls in love",
    "what kind of currency is used in australia",
    "who plays william conway in house of cards",
    "who is the donor in power of attorney",
]

# -------------------------------
# 8️⃣ Predict difficulty
# -------------------------------
print("\nPredictions on hardcoded questions:")
separator = "-" * 50
for text in questions_to_predict:
    doc = nlp(text)
    scores = doc.cats
    # The predicted class is the label with the highest score.
    pred_label = max(scores, key=lambda name: scores[name])
    print(
        f"Question: {text}",
        f"Predicted Difficulty: {pred_label}",
        separator,
        sep="\n",
    )

# -------------------------------
# 9️⃣ Save fine-tuned model
# -------------------------------
# Persist the whole pipeline (pre-trained components plus the classifier).
# NOTE(review): the directory name hard-codes "40iter"; keep it in sync by
# hand if the iteration count used for training changes.
output_dir = "/content/spacy_difficulty_model_fine_tuned_40iter"
nlp.to_disk(output_dir)
print("Fine-tuned model saved to {}".format(output_dir))


Loaded 3418 questions from 3 files.
Difficulty labels: ['easy', 'medium', 'hard', 'too hard', 'too easy']
Iteration 1/40 - Loss: 12.9715
Iteration 2/40 - Loss: 9.7298
Iteration 3/40 - Loss: 7.8758
Iteration 4/40 - Loss: 6.7364
Iteration 5/40 - Loss: 6.1772
Iteration 6/40 - Loss: 5.7118
Iteration 7/40 - Loss: 5.3931
Iteration 8/40 - Loss: 5.1552
Iteration 9/40 - Loss: 4.8748
Iteration 10/40 - Loss: 4.7087
Iteration 11/40 - Loss: 4.5545
Iteration 12/40 - Loss: 4.4594
Iteration 13/40 - Loss: 4.4223
Iteration 14/40 - Loss: 4.2232
Iteration 15/40 - Loss: 4.1119
Iteration 16/40 - Loss: 3.8861
Iteration 17/40 - Loss: 3.9131
Iteration 18/40 - Loss: 3.6788
Iteration 19/40 - Loss: 3.5935
Iteration 20/40 - Loss: 3.4944
Iteration 21/40 - Loss: 3.4707
Iteration 22/40 - Loss: 3.4872
Iteration 23/40 - Loss: 3.3866
Iteration 24/40 - Loss: 3.2977
Iteration 25/40 - Loss: 3.2392
Iteration 26/40 - Loss: 3.1808
Iteration 27/40 - Loss: 3.2870
Iteration 28/40 - Loss: 3.2397
Iteration 29/40 - Loss: 3.1571
Ite