In [2]:
# Fundamentals
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords # Import the stop word list
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Modeling Fundamentals
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from sklearn.metrics import make_scorer

# Models
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Evaluation metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

# Ignore Future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning);



In [1]:
# snorkel specific imports
from snorkel.labeling import labeling_function
from textblob import TextBlob
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.preprocess import preprocessor
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.utils import probs_to_preds
from snorkel.labeling import LabelingFunction
from snorkel.analysis import get_label_buckets


import random

import nltk
from nltk.corpus import wordnet as wn

from snorkel.augmentation import transformation_function

nltk.download("wordnet", quiet=True)

from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier


In [1786]:
pd.set_option('display.max_colwidth', None)

### Original Labeling Functions

In [1787]:
ABSTAIN = -1
BIASED = 1
UNBIASED = 0

### These labeling functions were a result of false positives/negatives from Classification model in part I (list titled 'misleading bias terms'), and a priori assumptions regarding bias by the author ('bias words' and 'subj words').

In [1788]:
# with only these three lists highest score of 76 % achieved

misleading_bias_terms = ['trump', 'u', 'america', 'american', 'new', 'people', 'states', 'president', 'many', 'states', 'united', 'americans', 'one']
bias_words = ['fake', 'news', 'media','biased', 'unreliable', 'propaganda', 'misleading', 'partisan', 'manipulative']
subj_words = ['feel', 'feels', 'thinks','thought', 'thoughts','opinion', 'bias', 'think', 'felt', 'believe','believed','believes','believer']

# negative commonly used words (resulted in worse score 54 %)
# past_tense_keywords = ["hurt", "blamed", "harmed", "accused"]
# present_tense_keywords = ["hurts", "blames", "harms", "accuses"]
# active_voice_keywords = ["hurting", "blaming", "harming", "accusing"]

# Commonly used positive words in past tense
past_tense_keywords = ["Succeeded", "Achieved", "Excelled", "Innovated"]

# Commonly used positive words in present tense
present_tense_keywords = ["Succeeds", "Achieves", "Excels", "Innovates"]

# Commonly used positive words in active voice
active_voice_keywords = ["Succeeding", "Achieving", "Excelling", "Innovating"]

OLD

In [1789]:
@labeling_function()
def lf_keyword_my_binary(x):
    """Return 1 if any of the misleading_bias_terms is present, else return 0."""
    presence = any(term in str(x).lower() for term in misleading_bias_terms)
    return 1 if presence else 0

NEW

In [1790]:
@labeling_function()
def keyword_my_mislead(x):
    """Return 1 if any of the misleading_bias_terms is present, else return 0."""
    presence = any(term in str(x).lower() for term in misleading_bias_terms)
    return BIASED if presence else ABSTAIN

OLD

In [1791]:
@labeling_function()
def lf_regex_fake_news_binary(x):
    """Return 1 if any of the bias_words is present, else return 0."""
    presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in bias_words)
    return 1 if presence else 0

NEW

In [1792]:
@labeling_function()
def regex_fake_news(x):
    """Return 1 if any of the bias_words is present, else return 0."""
    presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in bias_words)
    return BIASED if presence else ABSTAIN

OLD

In [1793]:
@labeling_function()
def lf_regex_subjective_binary(x):
    """Return 1 if any of the subj_words is present, else return 0."""
    presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in subj_words)
    return 1 if presence else 0

NEW

In [1794]:
@labeling_function()
def regex_subjective(x):
    """Return BIAS if any of the subj_words is present, else ABSTAIN."""
    presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in subj_words)
    return BIASED if presence else ABSTAIN

In [1795]:
# @labeling_function()
# def past_tense_keywords_f(x):
#     """Return BIAS if any of the subj_words is present, else ABSTAIN."""
#     presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in past_tense_keywords)
#     return BIASED if presence else ABSTAIN

In [1796]:
# @labeling_function()
# def present_tense_keywords_f(x):
#     """Return BIAS if any of the subj_words is present, else ABSTAIN."""
#     presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in present_tense_keywords)
#     return BIASED if presence else ABSTAIN

In [1797]:
# @labeling_function()
# def active_voice_keywords_f(x):
#     """Return BIAS if any of the subj_words is present, else ABSTAIN."""
#     presence = any(re.search(fr"\b{word}\b", str(x), flags=re.I) is not None for word in active_voice_keywords)
#     return BIASED if presence else ABSTAIN

### More labeling functions after clustering with CBOW and Skip Grams - See documentation for more details

In [1863]:
import re

keywords = [
    "maps", "county", "election", "coronavirus", "case",
    "risk", "cases", "covid", "latest", "trump",
    "ukraine", "russia", "war", "reminiscent",
    "removes", "proceed", "ponder"
]

keywords_pattern = "|".join(fr"\b{re.escape(keyword)}\b" for keyword in keywords)

@labeling_function()
def lf_keyword_maps_binary(x):
    return 1 if re.search(fr"\b{re.escape('maps')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_county_binary(x):
    return 1 if re.search(fr"\b{re.escape('county')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_election_binary(x):
    return 1 if re.search(fr"\b{re.escape('election')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_coronavirus_binary(x):
    return 1 if re.search(fr"\b{re.escape('coronavirus')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_case_binary(x):
    return 1 if re.search(fr"\b{re.escape('case')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_risk_binary(x):
    return 1 if re.search(fr"\b{re.escape('risk')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_cases_binary(x):
    return 1 if re.search(fr"\b{re.escape('cases')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_covid_binary(x):
    return 1 if re.search(fr"\b{re.escape('covid')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_latest_binary(x):
    return 1 if re.search(fr"\b{re.escape('latest')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_trump_binary(x):
    return 1 if re.search(fr"\b{re.escape('trump')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_ukraine_binary(x):
    return 1 if re.search(fr"\b{re.escape('ukraine')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_russia_binary(x):
    return 1 if re.search(fr"\b{re.escape('russia')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_war_binary(x):
    return 1 if re.search(fr"\b{re.escape('war')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_reminiscent_binary(x):
    return 1 if re.search(fr"\b{re.escape('reminiscent')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_removes_binary(x):
    return 1 if re.search(fr"\b{re.escape('removes')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_proceed_binary(x):
    return 1 if re.search(fr"\b{re.escape('proceed')}\b", str(x), flags=re.I) else 0

@labeling_function()
def lf_keyword_ponder_binary(x):
    return 1 if re.search(fr"\b{re.escape('ponder')}\b", str(x), flags=re.I) else 0


### Length Based Functions

In [1798]:
@labeling_function()
def lf_long_combined_text_binary(text_list):
    """Return 1 if the combined length is greater than 376, else return 0."""
    length = len(" ".join(str(text_list)).split())
    return 1 if length > 376 else 0

In [1799]:
@labeling_function()
def long_combined_text(x):
    """Return BIAS if the combined length is greater than 376, else ABSTAIN."""
    return BIASED if len(x.text.split()) > 376 else ABSTAIN

### Sentiment : Polarity and Subjectivity 
More biased abstracts/headlines will be more subjective, and on average positive polarity (indicating good feelings / thoughts) will be more biased. 

In [1800]:
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x

In [1801]:
@labeling_function()
def lf_textblob_polarity_binary(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We map the polarity to binary classification: 1 if negative, 0 otherwise.
    """
    polarity = TextBlob(str(x)).sentiment.polarity
    return 1 if polarity > 0 else 0

In [1802]:
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return BIASED if x.polarity > 0.9 else ABSTAIN

In [1803]:
@labeling_function()
def lf_textblob_subjectivity_binary(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We map the subjectivity to binary classification: 1 if high subjectivity, 0 otherwise.
    """
    subjectivity = TextBlob(str(x)).sentiment.subjectivity
    return 1 if subjectivity > 0.5 else 0

In [1804]:
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return BIASED if x.subjectivity >= 0.5 else ABSTAIN

### More Keywords (Adapted from Snorkel Tutorial)

In [1805]:
def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=BIASED):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )


"""Biased articles have accusatory language"""
keyword_hurt = make_keyword_lf(keywords=["hurt, hurts, hurting"])

"""Biased articles have accusatory language"""
keyword_blame = make_keyword_lf(keywords=["blamed, blames, blaming"])

"""Biased articles have accusatory language."""
keyword_harm = make_keyword_lf(keywords=["harmed, harms, harming"])

"""Biased articles have accusatory language"""
keyword_accuse = make_keyword_lf(keywords=["accused", "accuses, accusing"])

"""Biased articles have flowery language"""
keyword_succeeded = make_keyword_lf(keywords=["succeeds, succeeded, succeeding"])

"""Biased articles have flowery language"""
keyword_achieves = make_keyword_lf(keywords=["achieves, achieved, achieving"])

"""Biased articles have flowery language"""
keyword_excels = make_keyword_lf(keywords=["excels, excelled, excelling"])

"""Words similar to president found with continuous bag of word"""
keyword_hypocrisy = make_keyword_lf(keywords=["weakened, two, emily, code"])


"""Unbiased articles have see latest charts."""
keyword_maps = make_keyword_lf(keywords=["maps"], label=UNBIASED)
keyword_county = make_keyword_lf(keywords=["county"], label=UNBIASED)
keyword_election = make_keyword_lf(keywords=["election"], label=UNBIASED)
keyword_coronavirus = make_keyword_lf(keywords=["coronavirus"], label=UNBIASED)
keyword_case = make_keyword_lf(keywords=["case"], label=UNBIASED)
keyword_risk = make_keyword_lf(keywords=["risk"], label=UNBIASED)
keyword_cases = make_keyword_lf(keywords=["cases"], label=UNBIASED)
keyword_covid = make_keyword_lf(keywords=["covid"], label=UNBIASED)
keyword_latest = make_keyword_lf(keywords=["latest"], label=UNBIASED)

# New biased keywords
keyword_trump = make_keyword_lf(keywords=["trump"], label=BIASED)
keyword_ukraine = make_keyword_lf(keywords=["ukraine"], label=BIASED)
keyword_russia = make_keyword_lf(keywords=["russia"], label=BIASED)
keyword_war = make_keyword_lf(keywords=["war"], label=BIASED)

keyword_reminiscent = make_keyword_lf(keywords=["reminiscent"], label=BIASED)
keyword_removes = make_keyword_lf(keywords=["removes"], label=BIASED)
keyword_proceed = make_keyword_lf(keywords=["proceed"], label=BIASED)
keyword_ponder = make_keyword_lf(keywords=["ponder"], label=BIASED)


"""Unbiased articles contain maps, county, results, election, coronavirus, case, risk, cases, covid, latest."""


'Unbiased articles contain maps, county, results, election, coronavirus, case, risk, cases, covid, latest.'

In [1806]:
'''
# Commonly used positive words in past tense
past_tense_keywords = ["Succeeded", "Achieved", "Excelled", "Innovated"]

# Commonly used positive words in present tense
present_tense_keywords = ["Succeeds", "Achieves", "Excels", "Innovates"]

# Commonly used positive words in active voice
active_voice_keywords = ["Succeeding", "Achieving", "Excelling", "Innovating"]

keyword_succeeded,
keyword_achieves,
keyword_excels,

keyword_hurt
keyword_hurts
keyword_hurting
keyword_blamed
keyword_blames
keyword_blaming
keyword_harmed
keyword_harms
keyword_harming
keyword_accused
keyword_accuses
keyword_accusing
'''

# Common nonbiased words: 
'''
keyword_maps,
keyword_county,
keyword_election,
keyword_coronavirus,
keyword_case,
keyword_risk,
keyword_cases,
keyword_covid,
keyword_latest,
'''

'\nkeyword_maps,\nkeyword_county,\nkeyword_election,\nkeyword_coronavirus,\nkeyword_case,\nkeyword_risk,\nkeyword_cases,\nkeyword_covid,\nkeyword_latest,\n'

### Load in Data


In [1807]:
df_train = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/unlabeled_train.csv')


In [1808]:
df_test = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/kappa_labeled_train.csv')

In [1809]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,abstracts_headlines,section_name
0,0,new start treaty nearly wrapped president obama begin discussing far reaching deals russia next treaties,1
1,1,response plea israel release jonathan pollard plea pollard release,1
2,2,letters editor regarding tensions new york medical schools caribbean doctors learn best,1
3,3,canada beat united states semifinals world junior tournament advancing final th consecutive year canada beats united states world junior semifinal,0
4,4,kristine lilly holds world record international appearances u announced retirement wednesday lilly iron lady u soccer retires,0


In [1810]:
df_test.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,abstracts_headlines,section_name,cohen_kappa_label,cohen_kappa_range_none_to_slight,cohen_kappa_range_fair,cohen_kappa_range_moderate,cohen_kappa_range_substantial,cohen_kappa_range_almost_perfect,cohen_kappa_range_nan
0,0,0,new start treaty nearly wrapped president obama begin discussing far reaching deals russia next treaties,1,0.2,1,0,0,0,0,0
1,1,1,response plea israel release jonathan pollard plea pollard release,1,0.0,0,0,0,0,0,1
2,2,2,letters editor regarding tensions new york medical schools caribbean doctors learn best,1,0.2,1,0,0,0,0,0
3,3,3,canada beat united states semifinals world junior tournament advancing final th consecutive year canada beats united states world junior semifinal,0,0.5,0,0,1,0,0,0
4,4,4,kristine lilly holds world record international appearances u announced retirement wednesday lilly iron lady u soccer retires,0,0.2,1,0,0,0,0,0


In [1811]:
df_test = df_test.drop(columns=['Unnamed: 0.1','Unnamed: 0','cohen_kappa_label','cohen_kappa_range_none_to_slight',	'cohen_kappa_range_fair','cohen_kappa_range_moderate','cohen_kappa_range_substantial','cohen_kappa_range_almost_perfect','cohen_kappa_range_nan'])

In [1812]:
df_test

Unnamed: 0,abstracts_headlines,section_name
0,new start treaty nearly wrapped president obama begin discussing far reaching deals russia next treaties,1
1,response plea israel release jonathan pollard plea pollard release,1
2,letters editor regarding tensions new york medical schools caribbean doctors learn best,1
3,canada beat united states semifinals world junior tournament advancing final th consecutive year canada beats united states world junior semifinal,0
4,kristine lilly holds world record international appearances u announced retirement wednesday lilly iron lady u soccer retires,0
...,...,...
20403,job market remained strong last month suggesting workers remain high demand still quitting often labor department report showed workers quit february job openings stayed high,0
20404,recent poll data suggests rising prices dominate americans economic concerns inflation fears highest since americans especially republicans getting worried inflation,0
20405,scientists keeping eye ba one three genetically distinct varieties omicron variant coronavirus ba subvariant omicron accounts half new u coronavirus cases c c estimates,0
20406,twenty one republican state attorneys general filed lawsuit biden administration tuesday block federal mask mandate public transportation twenty one states file lawsuit block mask mandate public transportation,0


In [1813]:
df_test.rename(columns={'abstracts_headlines': 'text'}, inplace=True)


In [1814]:
# Assuming 'abstracts_headlines' is the current column name
df_train.rename(columns={'abstracts_headlines': 'text'}, inplace=True)



In [1815]:
df_train

Unnamed: 0.1,Unnamed: 0,text,section_name
0,0,new start treaty nearly wrapped president obama begin discussing far reaching deals russia next treaties,1
1,1,response plea israel release jonathan pollard plea pollard release,1
2,2,letters editor regarding tensions new york medical schools caribbean doctors learn best,1
3,3,canada beat united states semifinals world junior tournament advancing final th consecutive year canada beats united states world junior semifinal,0
4,4,kristine lilly holds world record international appearances u announced retirement wednesday lilly iron lady u soccer retires,0
...,...,...,...
20403,20403,job market remained strong last month suggesting workers remain high demand still quitting often labor department report showed workers quit february job openings stayed high,0
20404,20404,recent poll data suggests rising prices dominate americans economic concerns inflation fears highest since americans especially republicans getting worried inflation,0
20405,20405,scientists keeping eye ba one three genetically distinct varieties omicron variant coronavirus ba subvariant omicron accounts half new u coronavirus cases c c estimates,0
20406,20406,twenty one republican state attorneys general filed lawsuit biden administration tuesday block federal mask mandate public transportation twenty one states file lawsuit block mask mandate public transportation,0


In [1816]:
df_train = df_train.iloc[:len(df_train)//2, :]

In [1817]:
df_train

Unnamed: 0.1,Unnamed: 0,text,section_name
0,0,new start treaty nearly wrapped president obama begin discussing far reaching deals russia next treaties,1
1,1,response plea israel release jonathan pollard plea pollard release,1
2,2,letters editor regarding tensions new york medical schools caribbean doctors learn best,1
3,3,canada beat united states semifinals world junior tournament advancing final th consecutive year canada beats united states world junior semifinal,0
4,4,kristine lilly holds world record international appearances u announced retirement wednesday lilly iron lady u soccer retires,0
...,...,...,...
10199,10199,syrian refugee family faces life president trump stay go,1
10200,10200,many republicans especially senate increasingly see need tread carefully even alter course push repeal affordable care act really sick really scared voters temper action health law,0
10201,10201,accusation president offered evidence sets another spasm surrounding young administration gleaned trump allegations wiretapping,0
10202,10202,aarp allies objecting house committees plan vote republican bill could leave people large premium increases repeal health law faces new hurdle older americans,0


In [1818]:

df_test = df_test.iloc[len(df_test)//2:, :]


In [1819]:
df_test

Unnamed: 0,text,section_name
10204,thinking drink carefully eat lead much better bottles table want pick better bottles repeat wine food,0
10205,trump administration faces problem china high trade barriers allowed world trade organization china entered group developing country insists still one building trade walls,0
10206,attitude toward education democracy business quality life helped european nation top list united states slipped best country world survey says switzerland,0
10207,falling france shebelieves cup americans left ponder showing scored one goal three matches u women soccer team first back back losses three years,0
10208,study journal archives sexual behavior found americans sex average nine fewer times years ago americans less sex,0
...,...,...
20403,job market remained strong last month suggesting workers remain high demand still quitting often labor department report showed workers quit february job openings stayed high,0
20404,recent poll data suggests rising prices dominate americans economic concerns inflation fears highest since americans especially republicans getting worried inflation,0
20405,scientists keeping eye ba one three genetically distinct varieties omicron variant coronavirus ba subvariant omicron accounts half new u coronavirus cases c c estimates,0
20406,twenty one republican state attorneys general filed lawsuit biden administration tuesday block federal mask mandate public transportation twenty one states file lawsuit block mask mandate public transportation,0


In [1820]:
sampled_data = df_train.sample(n=1000, random_state=42)

In [1821]:
sampled_data.to_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/sampled_train.csv')

In [1822]:
sampled_data.dtypes

Unnamed: 0       int64
text            object
section_name     int64
dtype: object

In [1823]:
sampled_test = df_test.sample(n=100, random_state=42)

In [1824]:
sampled_test.reset_index(drop=True, inplace=True)


In [1825]:
sampled_test.rename(columns={'section_name': 'label'}, inplace=True)


In [1826]:
sampled_test

Unnamed: 0,text,label
0,militants engaged former officials hamid karzai abdullah abdullah well moscow seek help building inclusive government cementing rule chaos persists kabul airport taliban discuss new government,0
1,american spirit gets lebanese makeover beirut potomac,1
2,pleasant protest tyranny workplace productivity letter recommendation drinking lunch,0
3,see latest charts maps coronavirus cases deaths hospitalizations dixon area lee county illinois covid case risk tracker,0
4,biden administration said measures would degrade ability wage war ukraine would prevent belarus channeling forbidden goods russia u extends technology restrictions belarus russian oil industry,0
...,...,...
95,see full results maps texas election texas election results th congressional district,0
96,see latest charts maps coronavirus cases deaths hospitalizations woodson county kansas woodson county kansas covid case risk tracker,0
97,see latest charts maps coronavirus cases deaths hospitalizations jesup area wayne county georgia covid case risk tracker,0
98,see full results maps kansas elections kansas election results,0


##### Sampled Test Exported in order to hand label based on author's perception of whether the abstract/headline coupling had a tendency towards bias (labeled with a 1) or did not have a tendency towards bias (labeled with a 0). 

In [1827]:
sampled_test.to_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/sampled_test_to_hand_label.csv')

In [1828]:
sampled_test = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/capstone/Capstone_Project_backup/Data/sampled_test_hand_labeled.csv')

In [1829]:
sampled_test

Unnamed: 0.1,Unnamed: 0,text,label
0,0,militants engaged former officials hamid karzai abdullah abdullah well moscow seek help building inclusive government cementing rule chaos persists kabul airport taliban discuss new government,1
1,1,american spirit gets lebanese makeover beirut potomac,1
2,2,pleasant protest tyranny workplace productivity letter recommendation drinking lunch,1
3,3,see latest charts maps coronavirus cases deaths hospitalizations dixon area lee county illinois covid case risk tracker,0
4,4,biden administration said measures would degrade ability wage war ukraine would prevent belarus channeling forbidden goods russia u extends technology restrictions belarus russian oil industry,1
...,...,...,...
95,95,see full results maps texas election texas election results th congressional district,0
96,96,see latest charts maps coronavirus cases deaths hospitalizations woodson county kansas woodson county kansas covid case risk tracker,0
97,97,see latest charts maps coronavirus cases deaths hospitalizations jesup area wayne county georgia covid case risk tracker,0
98,98,see full results maps kansas elections kansas election results,0


In [1830]:
sampled_test = sampled_test.drop(columns=['Unnamed: 0'])

In [1831]:
sampled_test

Unnamed: 0,text,label
0,militants engaged former officials hamid karzai abdullah abdullah well moscow seek help building inclusive government cementing rule chaos persists kabul airport taliban discuss new government,1
1,american spirit gets lebanese makeover beirut potomac,1
2,pleasant protest tyranny workplace productivity letter recommendation drinking lunch,1
3,see latest charts maps coronavirus cases deaths hospitalizations dixon area lee county illinois covid case risk tracker,0
4,biden administration said measures would degrade ability wage war ukraine would prevent belarus channeling forbidden goods russia u extends technology restrictions belarus russian oil industry,1
...,...,...
95,see full results maps texas election texas election results th congressional district,0
96,see latest charts maps coronavirus cases deaths hospitalizations woodson county kansas woodson county kansas covid case risk tracker,0
97,see latest charts maps coronavirus cases deaths hospitalizations jesup area wayne county georgia covid case risk tracker,0
98,see full results maps kansas elections kansas election results,0


In [1832]:
Y_test = sampled_test.label.values

In [1833]:
Y_test

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

### Labeling Functions 

In [1834]:
lfs = [
    keyword_my_mislead,
    # keyword_hurt,
    # keyword_blame,
    # keyword_harm,
    # keyword_accuse,
    # keyword_succeeded,
    # keyword_achieves,
    # keyword_excels,
    # keyword_charts,
    # keyword_results,
    # keyword_hypocrisy,
    keyword_maps,
    keyword_county,
    keyword_election,
    keyword_coronavirus,
    keyword_case,
    keyword_risk,
    keyword_cases,
    keyword_covid,
    keyword_latest,
    keyword_trump, 
    keyword_ukraine, 
    keyword_russia, 
    keyword_war,
    keyword_reminiscent, 
    keyword_removes, 
    keyword_proceed, 
    keyword_ponder,
    regex_fake_news,
    regex_subjective,
    long_combined_text,
    textblob_polarity,
    textblob_subjectivity
]

In [1835]:
applier = PandasLFApplier(lfs)

In [1836]:
L_train = applier.apply(sampled_data)


 28%|██▊       | 283/1000 [00:02<00:04, 147.67it/s]

100%|██████████| 1000/1000 [00:08<00:00, 111.43it/s]


In [1837]:
L_test = applier.apply(df=sampled_test)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:01<00:00, 98.05it/s]


### Now it is important to access Polarity, Coverage, Overlaps, and Conflicts of Following Table and Use these to Guide Usage of Keyword Functions and whether or not it is necessary to create more

The following is from Snorkel documentation and goes into more depth as to what these summary statistics terms mean: 

- Polarity: The set of unique labels this LF outputs (excluding abstains)
- Coverage: The fraction of the dataset the LF labels
-  Overlaps: The fraction of the dataset where this LF and at least one other LF label
- Conflicts: The fraction of the dataset where this LF and at least one other LF label and disagree
- Correct: The number of data points this LF labels correctly (if gold labels are provided)
- Incorrect: The number of data points this LF labels incorrectly (if gold labels are provided)
- Empirical Accuracy: The empirical accuracy of this LF (if gold labels are provided)

In [1838]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_my_mislead,0,[1],1.0,0.433,0.076
keyword_maps,1,[0],0.001,0.001,0.001
keyword_county,2,[0],0.002,0.002,0.002
keyword_election,3,[0],0.035,0.035,0.035
keyword_coronavirus,4,[],0.0,0.0,0.0
keyword_case,5,[0],0.02,0.02,0.02
keyword_risk,6,[0],0.009,0.009,0.009
keyword_cases,7,[0],0.005,0.005,0.005
keyword_covid,8,[],0.0,0.0,0.0
keyword_latest,9,[0],0.011,0.011,0.011


In [None]:
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1])
df_train.iloc[buckets[(ABSTAIN, BIASED)]].sample(10, random_state=1)

##### Further labeling 

In [1839]:
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [1840]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.505]
  7%|▋         | 34/500 [00:00<00:01, 330.01epoch/s]

 17%|█▋        | 84/500 [00:00<00:00, 428.28epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.001]
 32%|███▏      | 158/500 [00:00<00:00, 567.89epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.000]
 47%|████▋     | 235/500 [00:00<00:00, 646.40epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 64%|██████▎   | 318/500 [00:00<00:00, 710.49epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 708.90epoch/s]
INFO:root:Finished Training


In [1841]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   72.0%
Label Model Accuracy:     68.0%


### Label Model Accuracy could be improved in future versions