In [4]:
from flashtext import KeywordProcessor

In [5]:
from snorkel.labeling import labeling_function

In [41]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

import numpy as np

from tqdm import tqdm
tqdm.pandas()

In [6]:
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
import pandas as pd

Science, Technology, Computers
Philosophy and Religion
Politics
Law Enforcement
Military
Sports
Lifestyle, Food and Home
Finance and Economics
Medical

In [9]:
df = pd.read_parquet("sample_50k_docs.parquet")

In [10]:
df.columns

Index(['docid', 'link_hash', 'source_url', 'url', 'title', 'text', 'keywords',
       'meta_keywords', 'tags', 'authors', 'publish_date', 'summary',
       'article_html', 'meta_description', 'meta_lang', 'canonical_link',
       'source_domain', 'source_brand', 'source_description', 'source_json'],
      dtype='object')

In [None]:
title
text


In [86]:

keyword_processor = KeywordProcessor()
keyword_dict = {
    "Science, Technology, Computers": ["data science", "artificial intelligence", "virtual reality", "graphics card", "GPU", "CPU", "python", "java", "robotics"],
    "Philosophy and Religion": ["Vatican", "Pope", "Cardinal", "muslim", "catholic", "jewish", "philosopher"],
    "Politics": ["republican", "democrat", "president", "prime minister", "parliament"],
    "Legality, Law Enforcement, rule of law": ["lawyer", "lawsuit", "police", "shooting", "body cam"],
    "Military": ["war", "battle", "bomber"],
    "Sports": ["tennis", "hockey", "football", "soccer"],
    "Lifestyle, Food, and Home": ["recipe", "therapy", "lgbtq", "lgbt", "lgbtqia", "garden"],
    "Finance and Economics": ["economics", "inflation", "macroeconomics", "economist", "stocks"],
    "Medical": ["coronavirus", "covid", "doctor", "medical", "hospital", "drug", "cancer", "genetic", "DNA"]
}

In [39]:
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)

In [40]:

# Or add keywords from a list:
# keyword_processor.add_keywords_from_list(["java", "python"])
list(set(keyword_processor.extract_keywords('SFPD releases body cam footage of deadly June 22 police shooting')))
# output ['product management', 'java']

['Legality, Law Enforcement, rule of law']

In [34]:
df['title'].sample(1).reset_index().iloc[0]['title']

'SFPD releases body cam footage of deadly June 22 police shooting'

In [42]:
ABSTAIN = -1

In [47]:
len([])

0

In [56]:
@labeling_function()
def flashtext_keywords(x):
    """Many spam comments talk about 'my channel', 'my video', etc."""

    x = list(set(keyword_processor.extract_keywords(x)))
    
    return x[0] if len(x)>0 else ABSTAIN

In [57]:
# Define the set of labeling functions (LFs)
lfs = [flashtext_keywords]

In [58]:
applier = PandasLFApplier(lfs)

In [59]:
L_train = applier.apply(df)

  0%|                                                                              | 1/50000 [00:00<00:26, 1907.37it/s]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [60]:
df['flashtext'] = df['title'].apply(flashtext_keywords)

In [62]:
df['flashtext'].value_counts()

flashtext
-1                                        45608
Legality, Law Enforcement, rule of law     1540
Military                                    799
Politics                                    788
Finance and Economics                       410
Sports                                      351
Lifestyle, Food, and Home                   287
Philosophy and Religion                     170
Science, Technology, Computers               47
Name: count, dtype: int64

In [65]:
list(keyword_dict.keys())

['Science, Technology, Computers',
 'Philosophy and Religion',
 'Politics',
 'Legality, Law Enforcement, rule of law',
 'Military',
 'Sports',
 'Lifestyle, Food, and Home',
 'Finance and Economics']

In [76]:
class_list = list(keyword_dict.keys())

In [67]:
label_emb = embeddings = model.encode(class_list, convert_to_tensor=False)

In [81]:
@labeling_function()
def zero_shot(input_text):
    query_emb = model.encode(input_text, convert_to_tensor=False)

    index = np.argmax([float(x[0]) for x in util.cos_sim(label_emb, query_emb)])
    
    return class_list[index]

In [84]:
df['zero_shot'] = df['title'].progress_apply(zero_shot)

100%|████████████████████████████████████████████████████████████████████████████| 50000/50000 [08:25<00:00, 98.87it/s]


In [85]:
df['zero_shot'].value_counts()

zero_shot
Legality, Law Enforcement, rule of law    9040
Politics                                  8929
Sports                                    8409
Lifestyle, Food, and Home                 7858
Finance and Economics                     6242
Military                                  5124
Science, Technology, Computers            2391
Philosophy and Religion                   2007
Name: count, dtype: int64