In [84]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DATA PREPROCESSING

In [85]:
#TOKENIZING
import spacy
text = """
 Dave watched as the forest burned up on the hill,
 only a few miles from his house. The car had
 been hastily packed and Marta was inside trying to round
 up the last of the pets. "Where could she be?" he wondered
 as he continued to wait for Marta to appear with the pets.
 """

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
token_list = [token for token in doc]
token_list

[
  , Dave, watched, as, the, forest, burned, up, on, the, hill, ,, 
  , only, a, few, miles, from, his, house, ., The, car, had, 
  , been, hastily, packed, and, Marta, was, inside, trying, to, round, 
  , up, the, last, of, the, pets, ., ", Where, could, she, be, ?, ", he, wondered, 
  , as, he, continued, to, wait, for, Marta, to, appear, with, the, pets, ., 
  ]

In [86]:
#REMOVING STOP WORDS
filtered_tokens = [token for token in doc if not token.is_stop]
filtered_tokens

[
  , Dave, watched, forest, burned, hill, ,, 
  , miles, house, ., car, 
  , hastily, packed, Marta, inside, trying, round, 
  , pets, ., ", ?, ", wondered, 
  , continued, wait, Marta, appear, pets, ., 
  ]

In [87]:
#NORMALIZING WORDS
lemmas = [
     f"Token: {token}, lemma: {token.lemma_}"
     for token in filtered_tokens
]
lemmas

['Token: \n , lemma: \n ',
 'Token: Dave, lemma: Dave',
 'Token: watched, lemma: watch',
 'Token: forest, lemma: forest',
 'Token: burned, lemma: burn',
 'Token: hill, lemma: hill',
 'Token: ,, lemma: ,',
 'Token: \n , lemma: \n ',
 'Token: miles, lemma: mile',
 'Token: house, lemma: house',
 'Token: ., lemma: .',
 'Token: car, lemma: car',
 'Token: \n , lemma: \n ',
 'Token: hastily, lemma: hastily',
 'Token: packed, lemma: pack',
 'Token: Marta, lemma: Marta',
 'Token: inside, lemma: inside',
 'Token: trying, lemma: try',
 'Token: round, lemma: round',
 'Token: \n , lemma: \n ',
 'Token: pets, lemma: pet',
 'Token: ., lemma: .',
 'Token: ", lemma: "',
 'Token: ?, lemma: ?',
 'Token: ", lemma: "',
 'Token: wondered, lemma: wonder',
 'Token: \n , lemma: \n ',
 'Token: continued, lemma: continue',
 'Token: wait, lemma: wait',
 'Token: Marta, lemma: Marta',
 'Token: appear, lemma: appear',
 'Token: pets, lemma: pet',
 'Token: ., lemma: .',
 'Token: \n , lemma: \n ']

In [88]:
#VECTORIZING THE TEXT
filtered_tokens[1].vector


array([-0.22178409,  2.0363135 ,  1.1723998 ,  0.18578127,  0.04702622,
        0.58668554, -1.2009034 ,  0.3269077 ,  1.4433299 , -1.4248612 ,
       -0.21548331,  0.42028397, -1.3350626 , -0.73203874, -0.31279454,
       -0.13874963,  0.3950568 ,  0.12919296, -0.43140125, -0.34982377,
       -0.05765158, -1.1015071 ,  0.27901345,  0.84183687,  1.3245786 ,
       -0.8529462 , -0.8744372 , -1.0205163 , -0.9564352 ,  1.0853741 ,
       -0.07206851,  0.40644735, -0.498676  , -0.92382216,  0.3699388 ,
        0.29741818,  0.48972166, -0.12200204, -0.08074802,  0.76989526,
       -0.9041327 , -0.75449955, -0.04722948, -2.0991108 , -0.77466094,
        0.02502659,  0.7373609 ,  0.62764734,  0.81472605, -0.5014193 ,
        0.79458493, -0.60294414, -0.3764605 , -1.7542655 ,  0.04777254,
        1.7780645 , -0.24561684, -0.7471707 , -0.8752783 , -0.9624736 ,
       -1.2766806 , -0.65242827,  0.26527596, -0.22189498,  0.69737744,
        0.17414027, -0.11381237,  0.1534035 ,  0.773933  , -0.83

In [89]:
#LOADING DATA AND PREPROCESSING
import os

def load_training_data(
    data_directory: str = "aclImdb/train",
    split: float = 0.8,
    limit: int = 0
) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label
                            }
                        }
                        reviews.append((text, spacy_label))

In [90]:
#SHUFFLE YOUR DATA
import os
import random

def load_training_data(
    data_directory: str = "aclImdb/train",
    split: float = 0.8,
    limit: int = 0
) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label}
                        }
                        reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]
    

TRAIN YOUR CLASSIFIER

In [91]:
#ADD TEXTCAT
import os
import random
import spacy

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)

In [92]:
#ADD TEXTCAT AND ADD LABELS
import os
import random
import spacy

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

In [93]:
#IMPLEMENT TRAINING LOOP
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]

TRAINING

In [94]:
#ADD CODE TO BEGIN TRAINING
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers

In [95]:
#TRAIN BATCHES OF DATA
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(
                    text,
                    labels,
                    drop=0.2,
                    sgd=optimizer,
                    losses=loss
                )

EVALUATING THE MODEL

In [96]:
#PASS THE REQUIRED COMPONENTS
def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [97]:
#DETERMINE PERFORMANCE METRICS LIKE PRECISION RECALL AND F SCORE
def train_model(training_data: list, test_data: list, iterations: int = 20):
    # Previously seen code omitted for brevity.
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(
                    text,
                    labels,
                    drop=0.2,
                    sgd=optimizer,
                    losses=loss
                )
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

In [98]:
#COMPLETE TRAINING AND SAVE MODEL
def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("SENTIMENT_ANALYSIS_MODEL_BATCH 7")

CLASSIFYING REVIEWS

In [99]:
#LOAD THE SAVED MODEL
def test_model(input_data: str="TEST_REVIEW"):
    #  Load saved trained model
    loaded_model = spacy.load("SENTIMENT_ANALYSIS_MODEL_BATCH 7")

In [100]:
#INSERT EXAMPLE INPUT
import os
import random
import spacy
from spacy.util import minibatch, compounding

TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""

In [101]:
#PASS REVIEW INTO THE MODEL
def test_model(input_data: str = TEST_REVIEW):
    #  Load saved trained model
    loaded_model = spacy.load("SENTIMENT_ANALYSIS_MODEL_BATCH 7")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )

In [104]:
#CALL THE MAIN FUNCTION
if __name__ == "__main__":
    train, test = load_training_data(limit=2500)
    train_model(train, test)
    print("Testing model")
    test_model()

Beginning training
Loss	Precision	Recall	F-score
11.293997120810673	0.7816593886121546	0.7584745762390477	0.7698924730851658
1.979159922178951	0.8083333332996527	0.8220338982702527	0.8151260503859189
[...]
0.000415042785704145	0.7926829267970453	0.8262711864056664	0.8091286306718204

Testing model
Review text:
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)

Predicted sentiment: Positive   Score: 0.8773064017295837
