In [None]:
import pandas as pd
import numpy as np
import nltk
# Fetch the NLTK stopwords corpus. The call reports its progress on stdout and
# its return value (True in the recorded run) is shown as the cell output.
# NOTE(review): no cell visible here uses the stopwords - confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def load_data(csv_file, split=0.9, nrows=100):
    """Load labelled reviews from a CSV file and split them into train/validation sets.

    Arguments
    ---------
    csv_file: Path (or anything else ``pandas.read_csv`` accepts) of a CSV
        file with a ``text`` column and a ``sentiment`` column.
    split: Fraction of the loaded rows that goes into the training set; the
        remaining rows form the validation set.
    nrows: Maximum number of rows to read from the file. Defaults to 100;
        pass ``None`` to read the whole file.

    Returns
    -------
    A 4-tuple ``(train_texts, train_labels, val_texts, val_labels)``.
    The texts are slices of the ``text`` column's ``.values`` array; the labels
    are lists of ``{"cats": {"POSITIVE": bool, "NEGATIVE": bool}}`` dictionaries
    built from the truthiness of the ``sentiment`` column.
    """
    data = pd.read_csv(csv_file, nrows=nrows)

    # Shuffle all rows with a fixed seed so the split is reproducible
    shuffled = data.sample(frac=1, random_state=7)

    texts = shuffled["text"].values
    labels = [{"cats": {"POSITIVE": bool(y), "NEGATIVE": not bool(y)}}
              for y in shuffled["sentiment"].values]

    # Index of the first validation row (kept separate from the `split` fraction)
    n_train = int(len(shuffled) * split)

    return texts[:n_train], labels[:n_train], texts[n_train:], labels[n_train:]

# Read the Yelp reviews and unpack the train/validation split produced by load_data
# (called with its default split, so 90% of the loaded rows are used for training).
# NOTE(review): the absolute /content path looks Colab-specific - adjust when running elsewhere.
train_texts, train_labels, val_texts, val_labels = load_data('/content/yelp_ratings.csv')

In [None]:
# Peek at the first two training examples: the texts are printed, while the
# labels are left as the bare last expression so they appear as the cell output.
print('Texts from training data\n------')
print(train_texts[:2])
print('\nLabels from training data\n------')
train_labels[:2]

Texts from training data
------
['I called the number provided and same day they showed up to my residence for an inspection. Was able to advise me of multiple broken and loose tiles on my roof and came out just a few days later and completed the repairs. Quoted me a good price for the repairs as well. No complaints!'
 "ended up here because Raku was closed and it received great ratings on Yelp.  I'm so glad I came here.  One of the better meals I've had.  Started off with the mushroom dish and the lettuce wrap.  both were amazing. the lettuce wrap is like having a flavor party in your mouth.  also had the panang duck which was terrific. highly recommend all three dishes. one dish that wasn't so good was the seabass with drunken noodles. overall it was an excellent meal, intimate setting, and great service. definitely will be back."]

Labels from training data
------


[{'cats': {'NEGATIVE': False, 'POSITIVE': True}},
 {'cats': {'NEGATIVE': False, 'POSITIVE': True}}]

In [None]:
# spaCy is pinned to 2.3.6 because the cells below use the v2 training API
# (nlp.create_pipe with a config dict, nlp.begin_training, nlp.update(texts, labels)).
# The explicit %pip magic installs into the running kernel's environment and does
# not depend on IPython's automagic the way a bare `pip ...` line does.
%pip install -U spacy==2.3.6

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import spacy

# Start from an empty English model
nlp = spacy.blank("en")

# Build a TextCategorizer with mutually exclusive classes and the "bow"
# architecture, then attach it to the empty model
textcat = nlp.create_pipe(
    "textcat",
    config=dict(exclusive_classes=True, architecture="bow"),
)
nlp.add_pipe(textcat)

# Register the labels. The order is significant: evaluate() maps POSITIVE -> 1
# and NEGATIVE -> 0, and the prediction demo indexes textcat.labels by class id.
textcat.add_label("NEGATIVE")
textcat.add_label("POSITIVE")


1

In [None]:
from spacy.util import minibatch
import random

def train(model, train_data, optimizer, batch_size=8, seed=1):
    """Run one pass over the training data and return the accumulated losses.

    Arguments
    ---------
    model: Object exposing ``update(texts, labels, sgd=..., losses=...)``
        (the notebook passes the spaCy ``nlp`` pipeline).
    train_data: List of tuples ``[(text0, label0), (text1, label1), ...]``.
        It is shuffled IN PLACE, so the caller's list is reordered.
    optimizer: Forwarded to ``model.update`` as ``sgd``.
    batch_size: Number of examples per minibatch (default 8).
    seed: Value passed to ``random.seed`` before shuffling (default 1).
        Pass ``None`` to leave the global random state untouched, so that
        the caller controls seeding.

    Returns
    -------
    The ``losses`` dictionary filled in by ``model.update``; it stays empty
    when ``train_data`` is empty.
    """
    losses = {}

    # Reseeding is optional: with seed=None the shuffle draws from whatever
    # state the global generator is already in.
    if seed is not None:
        random.seed(seed)
    random.shuffle(train_data)

    for batch in minibatch(train_data, size=batch_size):
        # Split the batch of (text, label) tuples into parallel sequences
        texts, labels = zip(*batch)

        # Update model with texts and labels
        model.update(texts, labels, sgd=optimizer, losses=losses)

    return losses
    

In [None]:
# Fix seed for reproducibility
spacy.util.fix_random_seed(1)
random.seed(1)

# The object returned by begin_training() is what train() forwards to
# model.update as `sgd`.
optimizer = nlp.begin_training()
# train() expects a list of (text, label) tuples, so pair the two sequences up.
train_data = list(zip(train_texts, train_labels))
# One pass over the training data; report the accumulated 'textcat' loss.
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

0.03855263814330101


In [None]:
# Try the model on a hand-written review; doc.cats maps each label
# (NEGATIVE / POSITIVE) to the score the text classifier assigned to it.
text = "This tea cup was full of holes. Do not recommend."
doc = nlp(text)
print(doc.cats)

{'NEGATIVE': 0.4575085937976837, 'POSITIVE': 0.5424913763999939}


In [None]:
# Step 4: Making predictions

In [None]:
def predict(model, texts):
    """Return the index of the highest-scoring class for each text.

    Arguments
    ---------
    model: Object providing a ``tokenizer`` callable and ``get_pipe('textcat')``
    texts: Iterable of raw text strings to classify

    Returns
    -------
    The result of ``argmax(axis=1)`` on the score array returned by the
    text categorizer, i.e. one class index per input text.
    """
    # Tokenize only - the texts are not run through the full pipeline
    docs = []
    for raw_text in texts:
        docs.append(model.tokenizer(raw_text))

    # Score the tokenized docs with the text categorizer; the second element
    # of the returned pair is not needed here
    class_scores, _ = model.get_pipe('textcat').predict(docs)

    # The winning class is the column with the largest score in each row
    return class_scores.argmax(axis=1)

In [None]:
# Classify a couple of validation reviews and print each predicted label next
# to its text. The slice has to lie inside val_texts: load_data reads 100 rows
# by default and keeps 10% of them for validation, so an out-of-range slice
# such as [34:36] is silently empty and the loop prints nothing.
texts = val_texts[:2]
predictions = predict(nlp, texts)

for p, t in zip(predictions, texts):
    print(f"{textcat.labels[p]}: {t} \n")

In [None]:
# Evaluate the model

In [None]:

def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model.

        Arguments
        ---------
        model: spaCy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function

        Raises
        ------
        ValueError: if the number of predictions differs from the number
            of labels.

    """
    # Get predictions from textcat model
    predicted_class = predict(model, texts)

    # From labels, get the true class as an integer array (POSITIVE -> 1, NEGATIVE -> 0).
    # This matches the predicted indices only while NEGATIVE is registered on the
    # text categorizer before POSITIVE.
    true_class = np.asarray([int(each['cats']['POSITIVE']) for each in labels])

    # Fail loudly instead of relying on how an array/list comparison of
    # different lengths happens to behave.
    if len(predicted_class) != len(true_class):
        raise ValueError(
            f"got {len(predicted_class)} predictions for {len(true_class)} labels"
        )

    # A boolean array indicating correct predictions
    correct_predictions = predicted_class == true_class

    # The accuracy, number of correct predictions divided by all predictions
    accuracy = correct_predictions.mean()

    return accuracy

In [None]:
# Accuracy on the held-out validation split after the training done so far.
accuracy = evaluate(nlp, val_texts, val_labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9000


In [None]:
# Train for additional passes over the data, reporting the loss of each pass
# and the validation accuracy right after it. The same nlp and optimizer
# objects are reused, so this continues from the earlier training cell
# rather than starting over.
n_iters = 5
for i in range(n_iters):
    losses = train(nlp, train_data, optimizer)
    accuracy = evaluate(nlp, val_texts, val_labels)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f}")

Loss: 0.008 	 Accuracy: 0.900
Loss: 0.004 	 Accuracy: 0.900
Loss: 0.006 	 Accuracy: 1.000
Loss: 0.002 	 Accuracy: 1.000
Loss: 0.001 	 Accuracy: 1.000
