In [1]:
# Fundamentals
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords # Import the stop word list
import matplotlib.pyplot as plt
import seaborn as sns
import time

# snorkel specific imports
from snorkel.labeling import labeling_function
from textblob import TextBlob
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

import random

import nltk
from nltk.corpus import wordnet as wn

from snorkel.augmentation import transformation_function

nltk.download("wordnet", quiet=True)

from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier


In [12]:
misleading_bias_terms = ['trump', 'u', 'america', 'american', 'new', 'people', 'states', 'president', 'many', 'states', 'united', 'americans', 'one']

In [13]:
@labeling_function()
def lf_keyword_my_binary(x):
    """Return 1 if any of the misleading_bias_terms is present, else return 0."""
    presence = any(term in str(x).lower() for term in misleading_bias_terms)
    return 1 if presence else 0

In [14]:
bias_words = ['fake', 'news', 'media','biased', 'unreliable', 'propaganda', 'misleading', 'partisan', 'manipulative']


In [15]:
@labeling_function()
def lf_regex_fake_news_binary(x):
    """Return 1 if any of the bias_words is present, else return 0."""
    presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in bias_words)
    return 1 if presence else 0

In [16]:
subj_words = ['feel', 'feels', 'thinks','thought', 'thoughts','opinion', 'bias', 'think', 'felt', 'believe','believed','believes','believer']


In [17]:
@labeling_function()
def lf_regex_subjective_binary(x):
    """Return 1 if any of the subj_words is present, else return 0."""
    presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in subj_words)
    return 1 if presence else 0

In [18]:
@labeling_function()
def lf_long_combined_text_binary(text_list):
    """Return 1 if the combined length is greater than 376, else return 0."""
    length = len(" ".join(str(text_list)).split())
    return 1 if length > 376 else 0

In [19]:
@labeling_function()
def lf_textblob_polarity_binary(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We map the polarity to binary classification: 1 if negative, 0 otherwise.
    """
    polarity = TextBlob(str(x)).sentiment.polarity
    return 1 if polarity < 0 else 0

In [20]:
@labeling_function()
def lf_textblob_subjectivity_binary(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We map the subjectivity to binary classification: 1 if high subjectivity, 0 otherwise.
    """
    subjectivity = TextBlob(str(x)).sentiment.subjectivity
    return 1 if subjectivity > 0.5 else 0

In [21]:
df_train = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/unlabeled_train.csv')


In [22]:
# Define weights for each binary labeling function
weight_lf_keyword_my_binary = 0.2
weight_lf_regex_fake_news_binary = 0.1
weight_lf_regex_subjective_binary = 0.1
weight_lf_long_combined_text_binary = 0.2
weight_lf_textblob_polarity_binary = 0.1
weight_lf_textblob_subjectivity_binary = 0.3

@labeling_function()
def combined_binary_bias_score(x):
    """Combine binary labeling functions into a linear equation."""
    lf1_score = lf_keyword_my_binary(x) * weight_lf_keyword_my_binary
    lf2_score = lf_regex_fake_news_binary(x) * weight_lf_regex_fake_news_binary
    lf3_score = lf_regex_subjective_binary(x) * weight_lf_regex_subjective_binary
    lf4_score = lf_long_combined_text_binary(x) * weight_lf_long_combined_text_binary
    lf5_score = lf_textblob_polarity_binary(x) * weight_lf_textblob_polarity_binary
    lf6_score = lf_textblob_subjectivity_binary(x) * weight_lf_textblob_subjectivity_binary

    # Combine scores with weights
    combined_score = lf1_score + lf2_score + lf3_score + lf4_score + lf5_score + lf6_score

    # Normalize to the range [0, 1]
    normalized_score = max(0, min(combined_score, 1))

    return normalized_score


In [23]:
df_train["cohen_kappa_label"] = df_train["abstracts_headlines"].apply(combined_binary_bias_score)


In [24]:
df_train["cohen_kappa_label"].value_counts(normalize=True)

cohen_kappa_label
0.2    0.482409
0.5    0.250343
0.3    0.170178
0.6    0.065367
0.4    0.010780
0.0    0.010241
0.3    0.003920
0.7    0.003724
0.1    0.002940
0.6    0.000049
0.8    0.000049
Name: proportion, dtype: float64

In [25]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ['none_to_slight', 'fair', 'moderate', 'substantial', 'almost_perfect']

# Fill NaN values with a specific value (e.g., -1)
df_train['cohen_kappa_label'] = df_train['cohen_kappa_label'].fillna(-1)

# Use get_dummies with dummy_na parameter
df_dummies = pd.get_dummies(pd.cut(df_train['cohen_kappa_label'], bins=bins, labels=labels), dummy_na=True,
                             prefix='cohen_kappa_range', dtype=int)

# Concatenate the dummy columns to the original DataFrame
df_train = pd.concat([df_train, df_dummies], axis=1)


In [26]:
df_train


Unnamed: 0.1,Unnamed: 0,abstracts_headlines,section_name,cohen_kappa_label,cohen_kappa_range_none_to_slight,cohen_kappa_range_fair,cohen_kappa_range_moderate,cohen_kappa_range_substantial,cohen_kappa_range_almost_perfect,cohen_kappa_range_nan
0,0,new start treaty nearly wrapped president obam...,1,0.2,1,0,0,0,0,0
1,1,response plea israel release jonathan pollard ...,1,0.0,0,0,0,0,0,1
2,2,letters editor regarding tensions new york med...,1,0.2,1,0,0,0,0,0
3,3,canada beat united states semifinals world jun...,0,0.5,0,0,1,0,0,0
4,4,kristine lilly holds world record internationa...,0,0.2,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
20403,20403,job market remained strong last month suggesti...,0,0.2,1,0,0,0,0,0
20404,20404,recent poll data suggests rising prices domina...,0,0.2,1,0,0,0,0,0
20405,20405,scientists keeping eye ba one three geneticall...,0,0.2,1,0,0,0,0,0
20406,20406,twenty one republican state attorneys general ...,0,0.2,1,0,0,0,0,0


In [27]:
df_train.to_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/kappa_labeled_train.csv')