In [None]:
import spacy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from collections import Counter
import numpy as np
import pickle
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# spaCy pipeline used for tagging; the parser and NER components are switched
# off (the visible cells only read token.tag_).
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load this pickle if it comes from a trusted source.
with open("data/train_processed.pkl", "rb") as pkl_file:
    qa_pairs = pickle.load(pkl_file)


In [None]:
# Question words/phrases -> integer class labels.
# Labels are numbered in order of first appearance in qa_pairs. The previous
# version enumerated a set(); iteration order of a set of strings changes
# between interpreter runs (string hash randomization), so every kernel
# restart produced a different word -> id mapping.
# NOTE(review): assumes qa[2] is the question word/phrase of each pair — confirm
# against the preprocessing that produced the pickle.
question_words = dict.fromkeys(qa[2] for qa in qa_pairs)
qw_to_idx = {qw: idx for idx, qw in enumerate(question_words)}
print(qw_to_idx)

# Split train/test: 15% held out, fixed seed so the split is reproducible.
qa_train, qa_test = train_test_split(
    qa_pairs, test_size=0.15, random_state=42
)


In [None]:

# POS vocabulary: every token.tag_ value seen in the training answers (qa[1]).
all_pos_tags = []

for qa in qa_train:
    doc_a = nlp(qa[1])
    all_pos_tags.extend(token.tag_ for token in doc_a)

# Sort the unique tags so each tag always lands in the same feature column.
# list(set(...)) ordered the columns by string hash, which changes between
# interpreter runs and made the seeded model non-reproducible.
pos_vocab = sorted(set(all_pos_tags))
pos_to_idx = {pos: idx for idx, pos in enumerate(pos_vocab)}


In [None]:

def extract_pos_features(answer):
    """
    Build a POS-tag count vector for a single answer.

    The answer is run through the module-level ``nlp`` pipeline and the
    ``tag_`` of every token is counted. Tags that are not keys of
    ``pos_to_idx`` are ignored.

    Args:
        answer: str answer input

    Returns:
        1-D numpy float array of length ``len(pos_vocab)``; the entry at
        ``pos_to_idx[tag]`` holds how often ``tag`` occurs in the answer.
    """
    tag_counts = Counter(token.tag_ for token in nlp(answer))
    vector = np.zeros(len(pos_vocab))
    for tag, n_occurrences in tag_counts.items():
        column = pos_to_idx.get(tag)
        if column is None:
            # tag is not part of the vocabulary -> no column for it
            continue
        vector[column] = n_occurrences
    return vector


In [None]:
#create features
def _features_and_labels(pairs):
    """Return (X, y) for the given pairs: POS-count rows from qa[1], label ids from qa[2]."""
    feature_rows = np.array([extract_pos_features(pair[1]) for pair in pairs])
    label_ids = np.array([qw_to_idx[pair[2]] for pair in pairs])
    return feature_rows, label_ids


X_train, y_train = _features_and_labels(qa_train)
X_test, y_test = _features_and_labels(qa_test)


In [None]:
#Train
# Hyper-parameters gathered in one place; the fixed random_state seeds the forest.
forest_params = {"n_estimators": 1000, "max_depth": 30, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)


In [None]:
#test
# Predict on the held-out split. y_pred was never assigned before, so this
# cell raised NameError after Restart Kernel -> Run All.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

# Baseline: always guess 'what'. Look its label id up instead of hard-coding
# an index, which is only valid for one particular qw_to_idx ordering.
baseline_label = qw_to_idx.get("what")
if baseline_label is None:
    # NOTE(review): 'what' is not a key of qw_to_idx — falling back to the
    # most frequent training label; confirm the intended baseline class.
    baseline_label = Counter(y_train.tolist()).most_common(1)[0][0]
y_baseline = np.full(len(y_test), baseline_label)

accuracy2 = accuracy_score(y_test, y_baseline)
f12 = f1_score(y_test, y_baseline, average="weighted")

# Confusion matrix, drawn inline: the plot_confusion_matrix helper and ALL_QW
# are not defined anywhere in this notebook. Class names are listed in label-id
# order (ids are 0..n-1 by construction of qw_to_idx), and `labels` keeps the
# matrix full-size even if a class is absent from the test split.
class_names = sorted(qw_to_idx, key=qw_to_idx.get)
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(
    xlabel="Predicted question word",
    ylabel="True question word",
    title="Confusion matrix",
)
fig.tight_layout()
fig.savefig('confusion_matrix.png')
plt.show()

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation F1 Score: {f1:.4f}")

print(f"Validation Accuracy baseline: {accuracy2:.4f}") #baseline guess 'what' all the time
print(f"Validation F1 Score baseline: {f12:.4f}")
