In [None]:
import csv
from pathlib import Path
from urllib.parse import quote

import requests
import spacy
from ml_datasets import imdb
from spacy.tokens import DocBin
from tqdm.auto import tqdm
# Load the two splits returned by ml_datasets.imdb(). Note that `train_data`
# is reassigned from get_train_data() in a later cell; in the cells shown
# here only `valid_data` from this call is consumed afterwards.
train_data, valid_data = imdb()
# Module-level pipeline; make_docs() reads whatever `nlp` is bound to at call
# time (it is rebound to a trained model further down the notebook).
nlp = spacy.load("en_core_web_sm")

In [None]:
def get_sample_wikipedia():
    """Download the sample Wikipedia dump and return the decoded JSON.

    Returns:
        The parsed JSON payload. The rest of this notebook treats it as a
        list of dicts carrying "title" and "sentences" keys.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: connection failure or timeout.
    """
    url = "https://olivers-things.s3.amazonaws.com/sample-wikipedia.json"
    # A timeout keeps the cell from hanging forever on a stalled connection;
    # raise_for_status() stops us from trying to JSON-decode an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    articles = response.json()
    print(f"fetched {len(articles)} articles")
    return articles

def get_sample_wikipedia_untrained():
    """Return the sample articles that are not part of the labelled training set.

    Calls get_train_data() first (which reads the module-level ``articles``),
    then downloads the sample again and keeps every article whose "title" is
    not among the training titles.

    Returns:
        list of article dicts, in the order returned by get_sample_wikipedia().
    """
    # Training tuples are (content, label, title); a set makes the membership
    # test O(1) per article instead of scanning a list each time.
    trained_titles = {example[2] for example in get_train_data()}
    return [a for a in get_sample_wikipedia() if a["title"] not in trained_titles]

def get_train_data():
    """Build labelled training examples from the tagging spreadsheet.

    Downloads the spreadsheet as CSV, skips the header row, and for every
    remaining row looks up the article whose "title" equals the first column
    in the module-level ``articles`` list (it must be assigned before this
    function is called). The second column selects the label: "0" -> "neg",
    "1" -> "pos"; rows with any other value are ignored.

    Rows that cannot be resolved (unknown title, missing label column,
    article without usable "sentences") are reported with
    ``Error with <title>`` and skipped.

    Returns:
        list of (content, label, title) tuples, where content is the
        article's sentences joined by single spaces.

    Raises:
        requests.HTTPError: the spreadsheet export answered with 4xx/5xx.
        requests.RequestException: connection failure or timeout.
        NameError: the module-level ``articles`` has not been assigned yet.
    """
    url = "https://docs.google.com/spreadsheets/u/1/d/1ysgN8UoVY942gPVToxIEML_I-q27d9zTO6hAxoW5Yx4/export?format=csv&gid=0"
    tags = requests.get(url, timeout=30)
    # Without this an HTML error page would be parsed as if it were CSV rows.
    tags.raise_for_status()
    decoded = tags.content.decode('utf-8')

    # Index the articles once instead of rescanning the whole list per row.
    # setdefault keeps the first article for a duplicated title, matching the
    # previous "first match wins" lookup. Doing this outside any try block
    # means a missing `articles` global fails loudly rather than being
    # reported as one "Error with ..." line per row.
    by_title = {}
    for article in articles:
        article_title = article.get("title")
        if article_title is not None:
            by_title.setdefault(article_title, article)

    label_names = {"0": "neg", "1": "pos"}
    results = []

    rows = list(csv.reader(decoded.splitlines(), delimiter=','))
    for row in rows[1:]:
        if not row:
            # Blank CSV line: there is no title to look up or report.
            continue

        title = row[0]
        match = by_title.get(title)
        if match is None or len(row) < 2:
            print(f"Error with {title}")
            continue

        try:
            content = " ".join(match["sentences"])
        except (KeyError, TypeError):
            # Article has no "sentences" or they are not joinable strings.
            print(f"Error with {title}")
            continue

        label = label_names.get(row[1])
        if label is not None:
            results.append((content, label, title))

    return results

def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label, ...)) - only the first two fields are
          used, so the (content, label, title) tuples produced by
          get_train_data() are accepted as well as plain (text, label) pairs
    returns: List(spacy.Doc.doc)

    Every doc gets doc.cats["positive"] / doc.cats["negative"] set to 0/1;
    the label 'neg' marks a negative example, any other label a positive one.
    """
    # nlp.pipe(as_tuples=True) works on (text, context) pairs. Feeding it
    # 3-tuples either fails on unpacking or silently relies on indexing,
    # depending on the spaCy version, so normalise to pairs up front.
    pairs = [(item[0], item[1]) for item in data]
    print(f"training on {len(pairs)} docs")

    docs = []
    for doc, label in tqdm(nlp.pipe(pairs, as_tuples=True), total=len(pairs)):
        is_negative = label == 'neg'
        doc.cats["positive"] = 0 if is_negative else 1
        doc.cats["negative"] = 1 if is_negative else 0
        docs.append(doc)
    return docs

***
***CREATE MODEL***
***

In [None]:
# get_train_data() reads the module-level `articles`, so assign it first.
articles = get_sample_wikipedia()

# Rebinds `train_data` (previously the IMDB split from the first cell) to the
# labelled Wikipedia examples.
train_data = get_train_data()

In [None]:
num_texts = 100
# DocBin.to_disk opens the target file directly, so the directory has to
# exist already; create it so this cell also works on a fresh checkout.
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)

# first we need to transform all the training data
train_docs = make_docs(train_data[:num_texts])
print(f"training on {len(train_docs)} documents")
# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk(data_dir / "train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk(data_dir / "valid.spacy")

***
***TEST MODEL***
***

In [None]:
# Rebinds the module-level `nlp` (en_core_web_sm until now) to the pipeline
# stored at output/model-best; make_docs() called after this cell would use it.
# NOTE(review): no visible cell writes output/model-best - presumably it comes
# from a training run outside this notebook; confirm.
nlp = spacy.load("output/model-best")

In [None]:
# Sample articles whose titles are not in the labelled training set. This
# calls get_train_data(), which needs the module-level `articles` to exist.
untrained = get_sample_wikipedia_untrained()

In [None]:
def wikipedia_link(title):
    """Return the English Wikipedia URL for an article title.

    Spaces become underscores, and characters that are not valid inside a
    URL path (e.g. '?', '#', '%', non-ASCII letters) are percent-encoded so
    the printed link stays clickable. Characters that are legal in a path
    segment (such as parentheses, commas and apostrophes) are left as-is.

    title: str, the article title as it appears in the data
    returns: str, the full https URL
    """
    slug = title.replace(' ', '_')
    # `safe` lists '/' plus the RFC 3986 path characters, so ordinary titles
    # produce exactly the same URL as before.
    return f"https://en.wikipedia.org/wiki/{quote(slug, safe='/:@!$&()*+,;=' + chr(39))}"

# Run the loaded model over the first 50 held-out articles and print each
# article's Wikipedia link next to its 0/1 prediction.
for a in untrained[:50]:
    text = " ".join(a["sentences"])
    doc = nlp(text)
    # 1 when the 'positive' score is above 0.5, otherwise 0
    result = int(doc.cats['positive'] > 0.5)
    print(wikipedia_link(a["title"]), result)

In [None]:
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
from flask import Flask, request
import spacy

# Load the pipeline stored at output/model-best; classify_wikipedia_article()
# below reads this module-level `nlp`.
nlp = spacy.load("output/model-best")

def classify_wikipedia_article(article):
    """
    score one article with the module-level `nlp` pipeline
    article: dict with a "sentences" entry (joined with spaces before scoring)
    returns: the same dict, mutated in place, with article["result"] set to
             1 when the 'positive' category scores above 0.5, else 0
    """
    text = " ".join(article["sentences"])
    positive_score = nlp(text).cats['positive']
    article["result"] = int(positive_score > 0.5)
    return article

# Flask application object; the /classify route below is registered on it.
app = Flask(__name__)

@app.route("/classify", methods=["POST"])
def classify():
    """Classify a JSON array of articles posted to /classify.

    The request body must be a JSON array of objects, each with a "title"
    and a "sentences" list of strings. Every article is scored with
    classify_wikipedia_article() and a "skipping"/"parsing" line is printed
    per article depending on the result.

    Returns:
        "hi" on success, or a short message with HTTP status 400 when the
        body is missing, is not valid JSON, or does not have the shape above.
    """
    # The body comes from the network: validate it before indexing into it so
    # a malformed request yields a 400 instead of an unhandled exception.
    payload = request.get_json(silent=True)
    if not isinstance(payload, list):
        return "expected a JSON array of articles", 400

    for article in payload:
        if not isinstance(article, dict) or "title" not in article:
            return "each article must be an object with a title", 400
        sentences = article.get("sentences")
        if not isinstance(sentences, list) or not all(isinstance(s, str) for s in sentences):
            return "each article must have a list of sentence strings", 400

    classified = [classify_wikipedia_article(article) for article in payload]
    for article in classified:
        if article["result"] == 0:
            print(f"skipping {article['title']}")
        else:
            print(f"parsing {article['title']}")
    return "hi"

# Serve the Flask app on localhost:9000 through werkzeug's run_simple.
# NOTE(review): inside a notebook kernel __name__ is normally '__main__', so
# this presumably blocks the cell while the server runs - confirm.
if __name__ == '__main__':
    run_simple('localhost', 9000, app)