In [27]:
import numpy as np
import importlib
import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel

from Project.utils.storage import youtube_db as db

importlib.reload(db)

<module 'Project.utils.storage.youtube_db' from 'C:\\Users\\Filippo Corti\\Documents\\GitHub\\SocialMediaMining\\Project\\utils\\storage\\youtube_db.py'>

In [4]:
"""
THE YOUTUBE MACHINE LEARNING PIPELINE:

1. Build a DataFrame with [content, podcast_guest, bias, leaning, is_political, sentiment, emotion, llm_label].

2. Extract a sub-DataFrame with only rows that have llm_label.

3. Define Snorkel Labeling Functions

4. Run Snorkel on sub-DataFrame to create 50k labeled instances. We can check which labeling functions were more important.

5. Manually label 150 instances in the entire DataFrame.

6. Create a DataFrame having [content, podcast_guest, bias, leaning, sentiment, emotion, SNORKEL_LABEL].

7. Build a Model using sklearn (Cross Validation and other stuff), training it on the DataFrame on point 6. Check Performance

8. Run the Model on the entire Dataset. Check Performance based on my 150 instances.

At the end, every comment has a label produced by my Classification Model

"""


'\nTHE YOUTUBE MACHINE LEARNING PIPELINE:\n\n1. Build a DataFrame with [content, podcast_guest, bias, leaning, is_political, sentiment, emotion, llm_label].\n\n2. Extract a sub-DataFrame with only rows that have llm_label.\n\n3. Define Snorkel Labeling Functions\n\n4. Run Snorkel on sub-DataFrame to create 50k labeled instances. We can check which labels were more important.\n\n5. Manually label 150 instances in the entire DataFrame.\n\n6. Create a DataFrame having [content, podcast_guest, bias, leaning, sentiment, emotion, SNORKEL_LABEL].\n\n7. Build a Model using sklearn (Cross Validation and other stuff), training it on the DataFrame on point 6. Check Performance\n\n8. Run the Model on the entire Dataset. Check Performance based on my 150 instances.\n\nAt the end, every comment has a label produced by my Classification Model\n\n'

In [5]:
# YouTube video ids of the podcast episodes whose guest is Trump.
# Used below to derive the `podcast_guest` column from each comment's video_id.
trump_podcasts = [
    "xrFdHO7FH8w",
    "blqIZGXWUpU",
    "s11uIW7wi-E",
    "vC5cHjcgt5g",
    "G80iLTctFuY",
    "qCbfTN-caFI",
    "Ry1IjOft95c",
    "S7BTdUaNQM8",
    "1SsmPh8gCxU",
    "-dmwG54QsKc",
    "nwQil7tcImI",
    "G9lXnwuZ2qs",
    "hBMoPUAeLnY"
]

# YouTube video ids of the podcast episodes whose guest is Harris.
# NOTE(review): this list is not read anywhere in the visible cells; the
# guest column treats every id that is not in `trump_podcasts` as Harris.
harris_podcasts = [
    "_KCRsjPCiCI",
    "bzThwqnQJDY",
    "7L4sts7I3xI",
    "pNbwMrBMGgE",
    "Vu5yD3fu6A8",
]

# Handle on the SQLite database; the path is relative to the notebook's
# working directory. Its `cursor` attribute is used for the queries below.
youtube_db = db.SQLiteYoutubeSaver(db_name='../db/youtube.db')

In [6]:
# Load every analysed comment together with the id of the video it belongs to.
# The query has no interpolated values, so it is a plain string, not an f-string.
youtube_db.cursor.execute("""
SELECT content, video_id, bias, leaning, is_political, sentiment, emotion, llm_label
FROM CommentAnalysis JOIN Comments on Comments.id = CommentAnalysis.id
""")

data = youtube_db.cursor.fetchall()

# Naming the columns in the constructor (instead of assigning `df.columns`
# afterwards) also works when the query returns no rows.
df = pd.DataFrame(
    data,
    columns=['content', 'video_id', 'bias', 'leaning', 'is_political', 'sentiment', 'emotion', 'llm_label'],
)

# Derive the podcast guest from the video id with a vectorised membership test.
# NOTE(review): every id that is not in `trump_podcasts` is labelled 'Harris',
# including ids that appear in neither list — confirm the database only
# contains comments from the two lists.
df['podcast_guest'] = np.where(df['video_id'].isin(trump_podcasts), 'Trump', 'Harris')

# The raw id is no longer needed once the guest has been derived.
df = df.drop(columns='video_id')

In [7]:
# Preview the assembled frame: analysis columns plus the derived podcast_guest.
df

Unnamed: 0,content,bias,leaning,is_political,sentiment,emotion,llm_label,podcast_guest
0,Best Podcast of All-time? 🤔🔥,RIGHT,RIGHT,0,Positive,joy,Republican,Trump
1,Definitely my favorite!,RIGHT,LEFT,0,Positive,joy,Republican,Trump
2,"No, best guest",RIGHT,RIGHT,0,Positive,joy,Republican,Trump
3,Probably,RIGHT,LEFT,0,Neutral,anticipation,Republican,Trump
4,Under 30 seconds,RIGHT,RIGHT,0,Neutral,anticipation,Neutral,Trump
...,...,...,...,...,...,...,...,...
114695,How do you check the numbers from your phone ...,RIGHT,RIGHT,0,Negative,disgust,,Harris
114696,@@BruceDragon-sf1tr just google it fam. I prom...,LEFT,RIGHT,0,Neutral,optimism,,Harris
114697,its now 34k to 88k now lolol,LEFT,LEFT,0,Positive,joy,,Harris
114698,@@BruceDragon-sf1trjust add it to your browser...,LEFT,RIGHT,0,Neutral,anger,,Harris


In [8]:
# Keep only the comments that already carry an LLM label; the copy makes the
# subset independent of `df` so columns can be added to it later.
has_llm_label = df["llm_label"].notna()
df_with_llm_labels = df.loc[has_llm_label].copy()

In [9]:
# Sanity check: row count, dtypes and non-null counts of the LLM-labelled subset.
df_with_llm_labels.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52451 entries, 0 to 93638
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   content        52451 non-null  object
 1   bias           52451 non-null  object
 2   leaning        52451 non-null  object
 3   is_political   52451 non-null  int64 
 4   sentiment      52451 non-null  object
 5   emotion        52451 non-null  object
 6   llm_label      52451 non-null  object
 7   podcast_guest  52451 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.6+ MB


In [10]:
# Preview the LLM-labelled subset that the labeling functions will run on.
df_with_llm_labels

Unnamed: 0,content,bias,leaning,is_political,sentiment,emotion,llm_label,podcast_guest
0,Best Podcast of All-time? 🤔🔥,RIGHT,RIGHT,0,Positive,joy,Republican,Trump
1,Definitely my favorite!,RIGHT,LEFT,0,Positive,joy,Republican,Trump
2,"No, best guest",RIGHT,RIGHT,0,Positive,joy,Republican,Trump
3,Probably,RIGHT,LEFT,0,Neutral,anticipation,Republican,Trump
4,Under 30 seconds,RIGHT,RIGHT,0,Neutral,anticipation,Neutral,Trump
...,...,...,...,...,...,...,...,...
93634,Curious why she didn’t get called out for not ...,LEFT,RIGHT,1,Negative,anger,Democratic,Harris
93635,"She already thinks she has won. ""When I was Vi...",RIGHT,RIGHT,1,Negative,anticipation,Republican,Harris
93636,Technically she can be the president if she us...,RIGHT,RIGHT,1,Neutral,anticipation,Republican,Harris
93637,That's what people need to realize. If she was...,LEFT,RIGHT,1,Negative,anger,Democratic,Harris


In [11]:
# Label space: the three classes are numbered 0..2 (matching the label
# model's cardinality of 3); ABSTAIN is the "no vote" value a labeling
# function returns when it has no opinion.
REPUBLICAN, DEMOCRATIC, NEUTRAL = range(3)
ABSTAIN = -1

In [19]:
@labeling_function()
def lf_llm(x):
    """Vote with the class named by the row's `llm_label`.

    Returns REPUBLICAN, DEMOCRATIC or NEUTRAL for the matching label string,
    and ABSTAIN for any other value.
    """
    votes_by_llm_label = {
        "Republican": REPUBLICAN,
        "Democratic": DEMOCRATIC,
        "Neutral": NEUTRAL,
    }
    return votes_by_llm_label.get(x.llm_label, ABSTAIN)


@labeling_function()
def lf_bias_left(x):
    """Vote from the `bias` column, trusting LEFT more than RIGHT.

    A LEFT bias always votes DEMOCRATIC. A RIGHT bias votes REPUBLICAN only
    when the LLM label is also "Republican". Every other case abstains.
    """
    bias = x.bias
    if bias == "LEFT":
        return DEMOCRATIC
    if bias == "RIGHT" and x.llm_label == "Republican":
        return REPUBLICAN
    return ABSTAIN


@labeling_function()
def lf_leaning(x):
    """Vote from the `leaning` column, unless the LLM label is "Neutral".

    An LLM label of "Neutral" takes precedence and votes NEUTRAL. Otherwise
    RIGHT votes REPUBLICAN, LEFT votes DEMOCRATIC, and anything else abstains.
    """
    if x.llm_label == "Neutral":
        return NEUTRAL

    votes_by_leaning = {"RIGHT": REPUBLICAN, "LEFT": DEMOCRATIC}
    return votes_by_leaning.get(x.leaning, ABSTAIN)


@labeling_function()
def lf_not_political(x):
    """Vote NEUTRAL when `is_political` is 0; abstain otherwise."""
    if x.is_political == 0:
        return NEUTRAL
    return ABSTAIN


@labeling_function()
def lf_negative_sentiment(x):
    """Vote from the comment's sentiment relative to the podcast guest.

    Positive sentiment votes for the guest's party and Negative sentiment
    votes for the other party, where a "Trump" guest maps to REPUBLICAN and
    any other guest maps to DEMOCRATIC. Any other sentiment abstains.
    """
    sentiment = x.sentiment
    if sentiment not in ("Positive", "Negative"):
        return ABSTAIN

    if x.podcast_guest == "Trump":
        guest_party, other_party = REPUBLICAN, DEMOCRATIC
    else:
        guest_party, other_party = DEMOCRATIC, REPUBLICAN

    if sentiment == "Positive":
        return guest_party
    return other_party


@labeling_function()
def lf_emotion(x):
    """Vote from the comment's emotion relative to the podcast guest.

    "joy" and "surprise" vote for the guest's party; "anger" and "disgust"
    vote for the other party. A "Trump" guest maps to REPUBLICAN and any
    other guest maps to DEMOCRATIC. Any other emotion abstains.
    """
    emotion = x.emotion
    favourable = emotion in ("joy", "surprise")
    hostile = emotion in ("anger", "disgust")
    if not (favourable or hostile):
        return ABSTAIN

    if x.podcast_guest == "Trump":
        guest_party, other_party = REPUBLICAN, DEMOCRATIC
    else:
        guest_party, other_party = DEMOCRATIC, REPUBLICAN

    if favourable:
        return guest_party
    return other_party


@labeling_function()
def lf_bias_sentiment(x):
    """Vote from the combination of `bias` and `sentiment`.

    Positive sentiment votes for the side named by the bias (RIGHT ->
    REPUBLICAN, LEFT -> DEMOCRATIC); Negative sentiment votes for the
    opposite side. Any other combination abstains.
    """
    votes_by_bias_and_sentiment = {
        ("RIGHT", "Positive"): REPUBLICAN,
        ("LEFT", "Positive"): DEMOCRATIC,
        ("RIGHT", "Negative"): DEMOCRATIC,
        ("LEFT", "Negative"): REPUBLICAN,
    }
    return votes_by_bias_and_sentiment.get((x.bias, x.sentiment), ABSTAIN)



In [21]:
# Labeling functions to apply; the list order determines each function's
# column in the label matrix (and its index `j` in the LFAnalysis summary).
lfs = [
    lf_llm,
    lf_bias_left,
    lf_leaning,
    lf_not_political,
    lf_negative_sentiment,
    lf_emotion,
    lf_bias_sentiment,
]

# Run every labeling function over every row of the LLM-labelled subset.
# L is the resulting label matrix: one row per comment, one column per
# labeling function, holding the class votes (or ABSTAIN).
applier = PandasLFApplier(lfs)
L = applier.apply(df_with_llm_labels)

100%|██████████| 52451/52451 [00:04<00:00, 10993.17it/s]


In [22]:
# Per-labeling-function summary over the label matrix: which classes each
# function emits (polarity), and its coverage, overlap and conflict rates.
LFAnalysis(L, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_llm,0,"[0, 1, 2]",1.0,0.999847,0.890717
lf_bias_left,1,"[0, 1]",0.538674,0.538674,0.512345
lf_leaning,2,"[0, 1, 2]",0.976473,0.976473,0.870889
lf_not_political,3,[2],0.44022,0.44022,0.418581
lf_negative_sentiment,4,"[0, 1]",0.9078,0.9078,0.834093
lf_emotion,5,"[0, 1]",0.730815,0.730815,0.671636
lf_bias_sentiment,6,"[0, 1]",0.815027,0.815027,0.756535


In [23]:
# Fixed seed so that training the label model gives the same result on every
# Restart & Run All; without it the fit draws a fresh random seed each run.
LABEL_MODEL_SEED = 123

# cardinality=3 matches the three classes REPUBLICAN / DEMOCRATIC / NEUTRAL.
label_model = LabelModel(cardinality=3, verbose=True)
# Fit on the label matrix only (no ground truth); log the loss every 100 epochs.
label_model.fit(L_train=L, n_epochs=500, log_freq=100, seed=LABEL_MODEL_SEED)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=4.694]
 15%|█▌        | 75/500 [00:00<00:00, 746.38epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.077]
 31%|███       | 156/500 [00:00<00:00, 775.47epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.074]
 49%|████▊     | 243/500 [00:00<00:00, 814.03epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.074]
 67%|██████▋   | 334/500 [00:00<00:00, 848.14epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.074]
100%|██████████| 500/500 [00:00<00:00, 848.94epoch/s]
INFO:root:Finished Training


In [24]:
# Hard class prediction per row from the fitted label model.
# NOTE(review): Snorkel's predict is documented to abstain (-1) on ties by
# default — confirm whether any -1 values need handling downstream.
snorkel_labels = label_model.predict(L)

In [25]:
# Attach the predictions to the subset; rows of L follow the row order of
# df_with_llm_labels because L was produced by applying the LFs to it.
df_with_llm_labels["snorkel_labels"] = snorkel_labels

In [42]:
# Confidence threshold used to report how many labels the model is sure about.
HIGH_CONFIDENCE_THRESHOLD = 0.9

# Confidence of a row = probability the label model gives to its most likely class.
confidence_levels = np.max(label_model.predict_proba(L), axis=1)
df_with_llm_labels["confidence"] = confidence_levels

# Total number of rows. len() states the intent directly: counting non-zero
# confidences only equals the row count because a max probability is never 0.
print(len(confidence_levels))
# Number of rows labelled with at least the threshold confidence.
print(np.count_nonzero(confidence_levels >= HIGH_CONFIDENCE_THRESHOLD))

52451
44788


In [43]:
# Preview the subset with the added snorkel_labels and confidence columns.
df_with_llm_labels

Unnamed: 0,content,bias,leaning,is_political,sentiment,emotion,llm_label,podcast_guest,snorkel_labels,confidence
0,Best Podcast of All-time? 🤔🔥,RIGHT,RIGHT,0,Positive,joy,Republican,Trump,0,0.999090
1,Definitely my favorite!,RIGHT,LEFT,0,Positive,joy,Republican,Trump,0,0.998202
2,"No, best guest",RIGHT,RIGHT,0,Positive,joy,Republican,Trump,0,0.999090
3,Probably,RIGHT,LEFT,0,Neutral,anticipation,Republican,Trump,0,0.893571
4,Under 30 seconds,RIGHT,RIGHT,0,Neutral,anticipation,Neutral,Trump,2,0.999915
...,...,...,...,...,...,...,...,...,...,...
93634,Curious why she didn’t get called out for not ...,LEFT,RIGHT,1,Negative,anger,Democratic,Harris,0,0.963336
93635,"She already thinks she has won. ""When I was Vi...",RIGHT,RIGHT,1,Negative,anticipation,Republican,Harris,0,0.999291
93636,Technically she can be the president if she us...,RIGHT,RIGHT,1,Neutral,anticipation,Republican,Harris,0,0.862123
93637,That's what people need to realize. If she was...,LEFT,RIGHT,1,Negative,anger,Democratic,Harris,0,0.963336
