In [19]:
import torch

import pandas as pd

from tqdm import tqdm

tqdm.pandas()

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

In [132]:
import yaml

In [209]:
with open('queries.yaml', 'r') as file:
    queries = yaml.safe_load(file)

In [4]:
df = pd.read_csv(r"D:\Work\Data\medium_articles.csv")

In [9]:
df = df.head(50000)

In [11]:
df['title'] = df['title'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].astype(str)


In [12]:
df = df[df['title'].notnull()]

df['title_len'] = df['title'].apply(lambda x: len(x.split(" ")))

df = df[df['title_len']>2]

In [211]:
df = df.reset_index()

In [13]:
df['emb'] = df['title'].progress_apply(model.encode)

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [08:08<00:00, 102.31it/s]


In [212]:
def query_df(query, k=10):
    top_k = min(k, len(df))
    
    query_embedding = model.encode(query)
    cos_scores = util.cos_sim(query_embedding, df['emb'])[0]
    top_results = torch.topk(cos_scores, k=top_k)
    
#     if print==True:
#     for score, idx in zip(top_results[0], top_results[1]):
#         print(f"{score:.4} -", df['title'][int(idx)])
        
    return [int(x) for x in top_results[1]]

def add_tags(query, tag, k=50):
    rel_indexes = query_df(query=query, k=k)
    df.iloc[rel_indexes]['tags'].apply(lambda x: x.append(tag))
    
    return df

In [None]:
df['tags'] = [[] for x in range(0, len(df))]

for tag in queries:
    print(tag)
    
    for q in queries[tag]:
        print("\t", q)
        df = add_tags(q, tag, k=100)
        
df['tags'] = df['tags'].apply(lambda x: list(set(x)))

In [214]:
df['tags'].str.len().value_counts()

0    39570
1     7027
2      490
3       40
4        4
5        1
Name: tags, dtype: int64

In [219]:
df[df['tags'].str.len()>2][['title_len', 'title', 'url', 'tags']].sample(10)

Unnamed: 0,title_len,title,url,tags
35722,3,Remote Proceeding Pioneers,https://medium.com/g21c/remote-proceeding-pion...,"[TECH, MIL, SCI]"
21522,3,Learning in Nature,https://medium.com/wwfhk-e/learning-in-nature-...,"[SCI, ECO, SOC]"
44303,4,Love and The Universe,https://medium.com/mindpload/love-and-the-univ...,"[TECH, REL, SCI]"
28320,4,Fighting Fire with Fire,https://medium.com/sprinter-hq/fighting-fire-w...,"[MIL, HLTH, SOC]"
44343,4,Global warming. Global warming,https://medium.com/resistance-poetry/-532a644a...,"[ECO, HLTH, POL, SCI]"
3844,3,Women and AI.,https://medium.com/carre4/women-and-ai-a8389ec...,"[TECH, MIL, SOC]"
30210,3,Knowing the Universe,https://medium.com/change-your-mind/knowing-th...,"[TECH, REL, SCI]"
10460,5,"Physics, Life, and Everything Nice",https://medium.com/snipette/physics-life-and-e...,"[TECH, REL, SCI]"
18189,4,I Want to Become,https://medium.com/poets-unlimited/i-want-to-b...,"[SCI, MIL, SOC]"
42052,9,"Science, Sustainability, Business, Politics, a...",https://medium.com/bioeconomy-xyz/science-sust...,"[SCI, ECON, POL, SOC]"


In [226]:
for tag in queries:
    print(f"{tag}: {len(df[df['tags'].astype(str).str.contains(tag)]):,} docs")

SOC: 1,447 docs
TECH: 2,623 docs
ECON: 1,085 docs
POL: 614 docs
ECO: 1,388 docs
MIL: 528 docs
SCI: 542 docs
REL: 503 docs
HLTH: 500 docs


In [227]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

In [228]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
X, y = make_classification(n_samples=1000, n_features=4,
    n_informative=2, n_redundant=0,
    random_state=0, shuffle=False)

In [229]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [223]:
len(df[df['tags'].astype(str).str.contains('TECH')])

2623

In [208]:
df[df['tags'].str.len()==0][['title', 'tags']].sample(10)

Unnamed: 0,title,tags
15235,Designing a Parsing Library in Scala,[]
28226,The Pittsburgh Shooting Was an Attack on a Min...,[]
10785,What I learnt from not drinking,[]
30941,Beginners Guide to Kucoin Exchange,[]
29979,"Back to School with Planet, Week 3 | 3, 2, 1, ...",[]
31252,Dragon Quest 1 Nintendo Switch Review,[]
30080,Why You Struggle To Get Job Offers as a Genera...,[]
1728,10 Efficient Ways to Use Python Lists,[]
643,What Famous Startup Taglines Can Teach You Abo...,[]
613,Creating a Daily Writing Habit Means Being the...,[]
