In [None]:
import os

import asyncpg
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
con = await asyncpg.connect(database='quillpapers', user='postgres', password='postgres')

In [None]:
async def fetch_as_dataframe(con: asyncpg.Connection, query: str, *args):
    """Run ``query`` on ``con`` and return the full result set as a DataFrame.

    The query is prepared first so that the result column names can be read
    from the statement itself; ``args`` are passed through as the positional
    query parameters.

    Args:
        con: Open connection on which the statement is prepared and executed.
        query: SQL text to prepare.
        *args: Positional parameters bound to the prepared statement.

    Returns:
        A ``pandas.DataFrame`` holding every fetched row, with columns named
        after the statement's result attributes.
    """
    prepared = await con.prepare(query)
    column_names = [attribute.name for attribute in prepared.get_attributes()]
    records = await prepared.fetch(*args)
    frame = pd.DataFrame(records, columns=column_names)
    return frame

In [None]:
train_data = await fetch_as_dataframe(con, "SELECT * FROM questions WHERE subject_code = '9702' AND NOT topic = 0")
test_data = await fetch_as_dataframe(con, "SELECT * FROM questions WHERE subject_code = '9702' AND topic = 0")


In [None]:
# Display the rows fetched with `topic = 0`, i.e. the questions to be classified.
test_data

In [None]:
# Load the custom stop-word list, one entry per line with trailing whitespace
# (including the newline) removed. The `with` block guarantees the file handle
# is closed once the list has been built.
with open("physics_stopwords.txt", 'r') as stopwords_file:
    sw = [line.rstrip() for line in stopwords_file]

In [None]:
def cleanup(t):
    """Lower-case ``t`` and replace every non-alphabetic character with a space.

    Each character is replaced one-for-one, so runs of punctuation or digits
    become runs of spaces and the character count after lower-casing is kept.

    Args:
        t: Raw text to normalise.

    Returns:
        The normalised string, containing only lower-cased letters and spaces.
    """
    # A single join avoids rebuilding the string once per character.
    return ''.join(ch if ch.isalpha() else ' ' for ch in t.lower())

def tokenize(t):
    """Split ``t`` on runs of whitespace and return the resulting tokens.

    Leading and trailing whitespace produce no empty tokens; an empty or
    all-whitespace string yields an empty list.
    """
    tokens = t.split()
    return tokens

from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Bag-of-words vectoriser wired to this notebook's own text pipeline:
# `cleanup` normalises the raw text, `tokenize` splits it into words,
# and `sw` supplies the stop words to discard.
cv = CountVectorizer(
    preprocessor=cleanup,
    tokenizer=tokenize,
    stop_words=sw,
)

In [None]:
# The vocabulary is learned from the training questions only; the test questions
# are then encoded with that same fitted vocabulary.
train_text, test_text = train_data["question_text"], test_data["question_text"]
X_train = cv.fit_transform(train_text)
X_test = cv.transform(test_text)

# Targets are the `topic` column of each frame.
y_train, y_test = train_data["topic"], test_data["topic"]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so that retraining on the same data builds the same forest, and
# therefore produces the same predictions, on every run of the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict a topic for every encoded test question using the fitted model.
pred = model.predict(X_test)

In [None]:
# Store the predictions in a "pred" column; `pred` was computed from
# test_data["question_text"] in row order, so the values line up with the rows.
test_data["pred"] = pred

In [None]:
querystring = """
UPDATE questions SET topic = $1 
WHERE subject_code = $2 
    AND exam_year = $3 
    AND series = $4 
    AND paper_variant = $5 
    AND question_number = $6
"""

for row in test_data.iterrows():
    row_data = row[-1]
    await con.execute(querystring, row_data["pred"], row_data["subject_code"], row_data["exam_year"], row_data["series"], row_data["paper_variant"], row_data["question_number"])
