In [27]:
import pandas as pd
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
import nltk
import nltk.tokenize
import string
from nltk.corpus import stopwords
# Punctuation characters and English stop words kept at module level.
punc = string.punctuation
# Fetch the NLTK stop-word corpus before reading it.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


#Tensorflow and Keras ML model libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Load the raw headlines dataset and preview the first rows.
df = pd.read_csv("india-news-headlines.csv")
df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [17]:
# Inspect the distinct category values present in the dataset.
df["headline_category"].unique()

array(['unknown', 'entertainment.hindi.bollywood', 'india', ...,
       'sports.football.euro-2021', 'business.markets.ipo',
       'sports.tokyo-olympics.india-in-tokyo'], dtype=object)

In [18]:
# Keep only the headline text; the date and category are not used for labeling.
# `df` is reassigned here, so without errors="ignore" a second run of this cell
# would raise KeyError because the columns have already been removed.
df = df.drop(columns=["publish_date", "headline_category"], errors="ignore")
df.head()


Unnamed: 0,headline_text
0,Status quo will not be disturbed at Ayodhya; s...
1,Fissures in Hurriyat over Pak visit
2,America's unwanted heading for India?
3,For bigwigs; it is destination Goa
4,Extra buses to clear tourist traffic


Use Snorkel to label each headline as either positive or negative.

In [32]:
# Label values shared by every labeling function in this notebook.
pos = 1
neg = 0
# Neutral: the labeling function has no opinion about the headline.
# -1 is also the value Snorkel labeling functions use to abstain.
neu = -1


def keyword_lookup(x, keywords, label):
    """Return ``label`` if any keyword occurs in the headline, else ``neu``.

    Args:
        x: A row-like object exposing a ``headline_text`` attribute.
        keywords: Iterable of keyword strings to search for.
        label: Value to return when at least one keyword is found.

    Returns:
        ``label`` on a match, otherwise ``neu``. ``neu`` is also returned when
        ``headline_text`` is not a string (for example a missing value).

    Matching is case-insensitive: both the headline and each keyword are
    lower-cased before comparison, so capitalised keywords such as "Tragic"
    match "tragic" in a headline.

    NOTE(review): matching is a plain substring test, so a short keyword such
    as "sad" also matches inside "ambassador". Consider word-boundary matching
    if that produces too many false positives.
    """
    text = x.headline_text
    # Missing headlines come through as non-string values; treat as no vote.
    if not isinstance(text, str):
        return neu
    text = text.lower()
    if any(word.lower() in text for word in keywords):
        return label
    return neu

#function for correctly assigning a label
def make_keyword(keywords, label):
    """Build a Snorkel ``LabelingFunction`` that votes ``label`` on a keyword hit.

    Args:
        keywords: Sequence of keyword strings; the first entry is used to
            build the labeling function's name.
        label: Value passed through to ``keyword_lookup`` as its ``label``.

    Returns:
        A ``LabelingFunction`` wrapping ``keyword_lookup`` with ``keywords``
        and ``label`` bound as resources.
    """
    lf_name = f"keyword_{keywords[0]}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)

In [21]:
# Keyword lists generated with ChatGPT: words with a positive or negative
# connotation that may appear in a news article or headline.

# Positive words. Entries are capitalised as generated, and a few appear more
# than once (e.g. "Radiant", "Hopeful").
positive_words = ["Inspiring","Motivating","Encouraging","Empowering","Enriching","Fulfilling","Joyful","Uplifting","Heartwarming",
    "Exciting","Optimistic","Radiant","Hopeful","Refreshing","Transformative","Grateful","Compassionate","Respectful","Honest",
    "Supportive","Friendly","Amicable","Cheerful","Energetic","Passionate","Confident","Creative","Enthusiastic","Dynamic","Courageous",
    "Adventurous","Fearless","Bold","Brave","Determined","Ambitious","Successful","Accomplished","Triumphant","Victorious","Celebrated",
    "Esteemed","Honored","Proud","Satisfied","Content","Joyous","Blissful","Ecstatic","Radiant","Gracious","Generous","Kind","Compassionate",
    "Understanding","Accepting","Appreciative","Respectful","Patient","Forgiving","Calm","Serene","Tranquil","Relaxed","Peaceful",
    "Harmonious","Balanced","Healthy","Vibrant","Invigorated","Rejuvenated","Energetic","Alive","Connected","Engaged","Inspirational",
    "Uplifting","Motivational","Encouraging","Supportive","Nurturing","Affirming","Reassuring","Hopeful","Optimistic","Confident",
    "Secure","Empowered","Enlightened","Aware","Mindful","Grateful","Thankful","Appreciative"]
# Negative words. Entries are capitalised as generated.
negative_words = ["Disappointing","Frustrating","Discouraging","Upsetting","Unfortunate","Sad","Heartbreaking",
    "Tragic","Painful","Hurtful","Offensive","Insulting","Insensitive","Harmful","Damaging","Destructive","Terrible","Horrible",
    "Awful","Miserable","Depressing","Desperate","Hopeless","Dismal","Gloomy","Melancholy", "Sadness","Sorrow","Anguish","Agony",
    "Misery","Pessimistic","Cynical","Skeptical","Suspicious","Disbelieving","Doubtful","Unbelievable","Displeased","Unsatisfied",
    "Unhappy","Dissatisfied","Angry","Furious","Enraged","Infuriated","Annoyed","Irritated","Frustrated","Exasperated","Stressed",
    "Anxious","Nervous","Tense","Worried","Fearful","Afraid","Scared","Terrified","Panicked","Despair","Despondent","Defeated",
    "Surrendered","Powerless","Inferior","Inadequate","Unimportant","Worthless","Insignificant","Unworthy","Ashamed","Embarrassed",
    "Guilty","Remorseful","Regretful","Mortified","Humiliated","Disgusted","Repulsed","Revolted","Nauseated","Appalled","Horrified",
    "Offended","Resentful","Bitter","Hostile","Vengeful","Hateful","Malicious"]

In [33]:
# Build one keyword-based labeling function per sentiment class.
positive_keyword = make_keyword(positive_words, pos)
negative_keyword = make_keyword(negative_words, neg)


Next, use the TextBlob library to compute a polarity score and a subjectivity score for each headline.

In [34]:
#
@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Attach TextBlob sentiment scores to the row.

    Runs TextBlob on ``x.headline_text`` and stores the resulting polarity and
    subjectivity on ``x`` as ``x.polarity`` and ``x.subjectivity``.

    Returns:
        The same ``x`` with the two attributes set.
    """
    # presumably memoize=True avoids re-scoring a row for each labeling
    # function that uses this preprocessor; confirm against the Snorkel docs
    sentiment = TextBlob(x.headline_text).sentiment
    x.polarity = sentiment.polarity
    x.subjectivity = sentiment.subjectivity
    return x
#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    """Vote ``pos`` when the headline's polarity is above 0.6, else ``neu``.

    NOTE(review): this function never votes ``neg``, even for strongly
    negative polarity; confirm that is intended.
    """
    if x.polarity > 0.6:
        return pos
    return neu
#find subjectivity 
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    """Vote ``pos`` when the headline's subjectivity is at least 0.5, else ``neu``.

    NOTE(review): a high subjectivity score is mapped to ``pos`` here;
    subjectivity presumably measures how opinionated the text is rather than
    its direction, so confirm this mapping is intended.
    """
    if x.subjectivity >= 0.5:
        return pos
    return neu

In [None]:
# Combine all the labeling functions.
lfs = [positive_keyword, negative_keyword, textblob_polarity, textblob_subjectivity]
# Apply every labeling function to every row of the dataframe.
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)
# Two-class label model (positive / negative).
label_model = LabelModel(cardinality=2, verbose=True)
# Training is stochastic; a fixed seed makes the labels reproducible when the
# notebook is re-run from a fresh kernel.
label_model.fit(L_snorkel, seed=42)
# Predict one label per headline and store it alongside the text.
df["label"] = label_model.predict(L=L_snorkel)

In [None]:
# Distribution of the labels assigned by the label model.
df["label"].value_counts()