In [1]:
import spacy

In [2]:
print(spacy.__version__)

2.3.5


In [2]:
import random

In [3]:
from spacy.util import minibatch, compounding

In [5]:
nlp.Defaults.stop_words
# stop words are words that have no significance to machines

NameError: name 'nlp' is not defined

In [2]:
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""

In [4]:
nlp = spacy.load("en_core_web_sm")

In [15]:
nlp = spacy.load("en_core_web_sm")
document = nlp(text)
tokens = []
filtered_tokens = []
for i in document:
    tokens.append(i)
print(tokens)
for i in document:
    if not i.is_stop:
        filtered_tokens.append(i)
print(filtered_tokens)

[
, Dave, watched, as, the, forest, burned, up, on, the, hill, ,, 
, only, a, few, miles, from, his, house, ., The, car, had, 
, been, hastily, packed, and, Marta, was, inside, trying, to, round, 
, up, the, last, of, the, pets, ., ", Where, could, she, be, ?, ", he, wondered, 
, as, he, continued, to, wait, for, Marta, to, appear, with, the, pets, ., 
]
[
, Dave, watched, forest, burned, hill, ,, 
, miles, house, ., car, 
, hastily, packed, Marta, inside, trying, round, 
, pets, ., ", ?, ", wondered, 
, continued, wait, Marta, appear, pets, ., 
]


In [4]:
def load_training_data(directory = "aclImdb/train", split = 0.8, limit=0):
    reviews =[]
    for label in ["pos", "neg"]:
        data_directory = directory+"/"+label.replace(" ","")
        for review in os.listdir(data_directory):
            if review.endswith(".txt"):
                thefile= data_directory+"/"+review.replace(" ","")
                f = open(thefile, encoding='utf-8')
                text = f.read()
                text = text.replace("<br />", "\n\n")
                if text.strip():
                    spacy_label= {
                        "cats": {
                            "pos": "pos" == label,
                            "neg": "neg" == label
                        }
                    }
                    reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews)*split)
    return reviews[:split], reviews[split:]

In [5]:
def train_model(training_data, test_data, iter = 20):
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat =nlp.get_pipe("textcat")
    textcat.add_label("pos")
    textcat.add_label("neg")
    excluded_pipes = []
    for pipe in nlp.pipe_names:
        if pipe != "textcat":
            excluded_pipes.append(pipe)   
    with nlp.disable_pipes(excluded_pipes):
        optimizer = nlp.begin_training()
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF Score")        
        batch_sizes = compounding(4.0, 32.0, 1.001) 

        for i in range(iter):
            loss={}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, label = zip(*batch)
                nlp.update(text, label, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                eval_results = evaluate(token=nlp.tokenizer, textcat=textcat, test_data=test_data)
                print(
                    f"{loss['textcat']}\t{eval_results['precision']}"
                    f"\t{eval_results['recall']}"
                    f"\t{eval_results['f-score']}"
                )
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")



In [6]:
def evaluate(token, textcat, test_data):
    reviews, labels = zip(*test_data)
    reviews = (token(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]["cats"]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    if (precision + recall == 0):
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}            

In [7]:
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""

In [14]:
TEST_REVIEW = """
Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in.
"""

In [11]:
def test_model(input_data):
    loaded_model = spacy.load("model_artifacts")
    parsed_text = loaded_model(input_data)
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )

In [17]:
train, test = load_training_data(limit=25)
train_model(train,test)
print("Testing model")
test_model(TEST_REVIEW)

Beginning training
Loss	Precision	Recall	F Score
0.20788690075278282	0.9999999900000002	0.9999999900000002	0.9999999900000002
0.14933563582599163	0.9999999900000002	0.9999999900000002	0.9999999900000002
0.1055685542523861	0.3333333322222222	0.9999999900000002	0.4999999975
0.07138955779373646	0.1999999996	0.9999999900000002	0.3333333322222222
0.039759696228429675	0.0	0.0	0
0.025101906736381352	0.0	0.0	0
0.013253693177830428	0.0	0.0	0
0.006646072382864077	0.249999999375	0.9999999900000002	0.3999999984
0.007640088972038939	0.249999999375	0.9999999900000002	0.3999999984
0.0006134375653346069	0.0	0.0	0
0.0016132568098328193	0.0	0.0	0
0.0005215432281602261	0.0	0.0	0
0.0005037493647250813	0.249999999375	0.9999999900000002	0.3999999984
0.0005321603803167818	0.249999999375	0.9999999900000002	0.3999999984
0.00011954715273532202	0.249999999375	0.9999999900000002	0.3999999984
0.00015557168262603227	0.249999999375	0.9999999900000002	0.3999999984
8.997715627856451e-05	0.0	0.0	0
0.0002338384566655804