# Binary Classification with Spacy Categorizer

While there are many different models available for binary classification tasks, spaCy, a natural language processing module also offers it's own trainable model, specifically tailored to text / language data.

This spaCy function is designed for multi-lablel text classification tasks. It's dependent on training the spaCy model on provided data and produce classification labels on provided text.

In [1]:
import pandas as pd
import time
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import dropbox
import io
import os 

In [2]:
# load the english model
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_trf')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [20]:
dropbox_access_token = 'sl.B0dlxXsEpFckNW5wQQm_18DoQZEJ0nh17I-TGUvUuoU5vaeLaMY5a8fKIwyH1cyYQfGn3GX06YMuJN1Zzl1Ja7b-UXxQH-I8OLzJylOIX86bQSmXtH4PC7jqICOdKKt5FTTIzSq9Ndor'

# Initialize Dropbox client
dbx = dropbox.Dropbox(dropbox_access_token)

# Dropbox file path
dropbox_file_path = '/fake_real_ML_project_dataset.csv'

# Download the file
metadata, response = dbx.files_download(dropbox_file_path)

# Read the CSV file from the response content using io.BytesIO
df = pd.read_csv(io.BytesIO(response.content))

# Show the results
print("Dataframe loaded from Dropbox:")
print(df.head())

Dataframe loaded from Dropbox:
   Unnamed: 0                                            Content  \
0           0  new york reuters us environmental group sierra...   
1           1  washington reuters us air force asked industry...   
2           2  saturday paul ryan posted photo instagram phot...   
3           3  america keeps waiting word hillary indicted ob...   
4           4                   religion peace ht weasel zippers   

          Title  Type  
0  politicsNews  true  
1  politicsNews  true  
2          News  fake  
3      politics  fake  
4     left-news  fake  


In [21]:
df = df.dropna()

In [22]:
# remove extra column
df = df.drop('Unnamed: 0', axis = 1)

In [23]:
# Separate the DataFrame into two subsets based on categories
true_df = df[df['Type'] == 'true']
fake_df = df[df['Type'] == 'fake']

# Randomly sample an equal number of rows from each subset
sampled_true = true_df.sample(n=20, replace=False, random_state=42)
sampled_fake = fake_df.sample(n=20, replace=False, random_state=42)

# Combine the sampled rows into a single DataFrame
sampled_df = pd.concat([sampled_true, sampled_fake])

# Reset the index of the sampled DataFrame
sampled_df.reset_index(drop=True, inplace=True)

In [24]:
sampled_df.head()

Unnamed: 0,Content,Title,Type
0,washington reuters united states cannot trust ...,politicsNews,True
1,new york reuters email donald trump’s campaign...,politicsNews,True
2,stockholm reuters swedish court monday sentenc...,worldnews,True
3,washington reuters us president barack obama s...,politicsNews,True
4,bogotacartagena colombia reuters pope francis ...,worldnews,True


## Creating the Text Categorizer

In [10]:
from spacy.pipeline import TextCategorizer
textcat = TextCategorizer(nlp)

TypeError: TextCategorizer.__init__() missing 1 required positional argument: 'model'

Add labels

In [None]:
textcat.add_label("fake")
textcat.add_label("true")

In [None]:
textcat.train()

In [None]:
doc = nlp("This is a positive review.")
textcat(doc)

In [18]:
import spacy
from spacy.pipeline.textcat import Config
import pandas as pd
nlp = spacy.load("en_core_web_sm")

In [19]:
# Prepare training data
train_texts = df['Content'].tolist()
train_labels = [{'cats': {'Real News': label == 'Real News', 'Fake News': label == 'Fake News'}} for label in df['Type']]

# Initialize the text categorizer with default config
textcat = nlp.create_pipe("textcat", config=Config())

# Add labels to the text categorizer (binary labels: Real News and Fake News)
textcat.add_label("Real News")
textcat.add_label("Fake News")

# Add the text categorizer to the pipeline using the string name 'textcat'
nlp.add_pipe('textcat')

# Disable other pipelines such as tagger, parser, and ner
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):
    # Training the model
    optimizer = nlp.begin_training()
    for i in range(10):  # Adjust the number of epochs as needed
        losses = {}
        for texts, annotations in zip(train_texts, train_labels):
            nlp.update([texts], [annotations], sgd=optimizer, losses=losses)
        print(losses)

# Test the trained model
doc = nlp("The economy is growing at a record pace.")
print(doc.cats)  # This will give you the probability scores for each label








ValueError: [E867] The 'textcat' component requires at least two labels because it uses mutually exclusive classes where exactly one label is True for each doc. For binary classification tasks, you can use two labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you can use the 'textcat_multilabel' component with one label.

In [31]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch
import random

# Load the English language model
nlp = spacy.load("en_core_web_sm")




In [None]:
# Prepare your training data (example)
train_data = [
    ("This is a positive example", {"cats": {"POSITIVE": 1, "NEGATIVE": 0}}),
    ("This is a negative example", {"cats": {"POSITIVE": 0, "NEGATIVE": 1}})
]

# Initialize the text categorizer component
textcat = nlp.add_pipe("textcat")
textcat.cfg["exclusive_classes"] = True
textcat.cfg["architecture"] = "simple_cnn"

# Add the binary labels to the text categorizer
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Train the text categorizer
random.seed(1)
optimizer = nlp.begin_training()
for epoch in range(10):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=8)
    for batch in batches:
        texts, annotations = zip(*batch)
        example = []
        for i in range(len(texts)):
            doc = nlp.make_doc(texts[i])
            example.append(Example.from_dict(doc, annotations[i]))
        nlp.update(example, losses=losses, drop=0.2, sgd=optimizer)
    print(losses)

# Evaluate the model (optional)
# Evaluate your model on a separate validation set to assess its performance