In [1]:
# Fundamentals
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords # Import the stop word list
import matplotlib.pyplot as plt
import time

In [36]:
# snorkel specific imports
from snorkel.labeling import labeling_function
from textblob import TextBlob
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

In [62]:
# Define the label mappings for convenience
'''
0.01–0.20 as none to slight bias
0.21–0.40 as fair bias
0.41– 0.60 as moderate bias
0.61–0.80 as substantial bias
0.81–1.00 as perfectly bias 

'''
abstain = -1 # cohen kappa is not lower than 0 
none_to_slight = 0.20
fair = 0.40
moderate = 0.60
substantial = 0.80
perfect = 1.0 

In [4]:
misleading_bias_terms = ['trump', 'u', 'america', 'american', 'new', 'people', 'states', 'president', 'many', 'states', 'united', 'americans', 'one']

In [61]:
@labeling_function()
def lf_keyword_my(x):
    """Label based on the number of hits in the misleading_bias_terms list. 
    These words were throwing the most errors in the classification models (false positives/negatives)."""
    count_hits = sum(term in x.lower() for term in misleading_bias_terms)

    if count_hits == 0:
        return abstain
    elif count_hits <= 1:
        return none_to_slight
    elif count_hits <= 2:
        return fair
    elif count_hits <= 3:
        return moderate
    elif count_hits <= 4:
        return substantial
    else:
        return perfect

In [44]:
lf_keyword_my('Trump is the new dictator of the United States of America')

1.0

In [6]:
bias_words = ['fake', 'news', 'media','biased', 'unreliable', 'propaganda', 'misleading', 'partisan', 'manipulative']


In [60]:
@labeling_function()
def lf_regex_fake_news(x):
    """Label based on the number of hits in the bias_words list. More hits means more biased. 
    Paradoxically, abstracts and headlines that include these terms tend to be more biased. """
    count_hits = sum(re.search(fr"\b{word}\b", x, flags=re.I) is not None for word in bias_words)

    if count_hits == 0:
        return abstain
    elif count_hits <= 1:
        return none_to_slight
    elif count_hits <= 2:
        return fair
    elif count_hits <= 3:
        return moderate
    elif count_hits <= 4:
        return substantial
    else:
        return perfect

In [42]:
lf_regex_fake_news('Fake news media is leading america down a dark path')

0.6

In [59]:
@labeling_function()
def lf_long_combined_text(text_list):
    """Categorize based on combined length of words in abstracts and headlines. Longer abstracts and titles that tend to ramble tend to be more biased."""
    length = len(" ".join(text_list).split())

    return (
        abstain if length < 19 else
        none_to_slight if length < 56 else
        fair if length < 133 else
        moderate if length < 376 else
        substantial if length < 615 else
        perfect
    )

In [39]:
lf_long_combined_text('America is headed down an incredibly dark path if it continues to support genocide')

0.4

In [55]:
@labeling_function()
def lf_textblob_polarity(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We map the polarity to the specified bias categories with heuristic that negative abstracts/headlines tend to contain more bias. 
    """
    polarity = TextBlob(x).sentiment.polarity

    return (
        ABSTAIN if polarity > 0.3 else
        none_to_slight if polarity > 0.0 else
        fair if polarity > -0.2 else
        moderate if polarity > -0.4 else
        substantial if polarity > -0.6 else
        perfect
    )

In [56]:
lf_textblob_polarity("Textblob is terrible to use. What horror.")

1.0

In [53]:
testimonial = TextBlob("Textblob is terrible to use. What horror.")
testimonial.sentiment
testimonial.sentiment.polarity

-1.0

#### Testing the Model

In [63]:
df = pd.read_csv('../part_01/Data/united_states_headlines_last_decade.csv')


FileNotFoundError: [Errno 2] No such file or directory: '../part_01/Data/united_states_headlines_last_decade.csv'