Pre installs

In [1]:
# install needed packages
# !pip install snorkel
# !pip install textblob

##Imports


In [49]:
from bs4 import BeautifulSoup
import requests 
import json
import pandas as pd
import plotly.graph_objects as go


from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

##Web scraping

In [3]:

def get_headlines(headline_count=10):
    content = requests.get('https://www.aljazeera.com/where/mozambique/')
    soup = BeautifulSoup(content.content, 'html.parser')
    # to keep the data in json format
    top_headlines = {}
    all_a = soup.find_all('article')
    for i in range(headline_count):
        headline = all_a[i].find('h3')
        text = headline.get_text().strip()
        text = ascii(text).replace('\\xad','')
        # to get back the ascii values 
        text = text.replace('\\n','')
        text = text.replace('\\u2019',"'")
        top_headlines.update({i: {'Headline': text}})
    return top_headlines

Converting the output of web scraping in JSON format

In [None]:
# run this cell to get the JSON file ONLY.
headlines = get_headlines()
with open('headlines.json', 'w') as fp:
    json.dump(headlines, fp)

#Training a custom model

##Data Extraction and Preprocessing

In [4]:
# extracting data from my cloud.
# to run this, please put the dataset file location.
#cloud
dataset_link = '/content/drive/MyDrive/Colab Notebooks/Data/data.csv'
#downloaded from github
# dataset_link = "dataset/data.csv"

df = pd.read_csv(dataset_link)

# data cleaning
df = df.drop(['publish_date'], axis=1)
df = df.rename(columns = {'headline_text': 'text'})
df['text'] = df['text'].astype(str)

# using only first 100000 data
df = df.head(100000)

Positive words and Negative words.

In [5]:
positive_words = ['boosts',
                 'great',
                 'develops',
                 'promising',
                 'ambitious',
                 'delighted',
                 'record',
                 'win',
                 'breakthrough',
                 'recover',
                 'achievement',
                 'peace',
                 'party',
                 'hope',
                 'flourish',
                 'respect',
                 'partnership',
                 'champion',
                 'positive',
                 'happy',
                 'bright',
                 'confident',
                 'encouraged',
                 'perfect',
                 'complete',
                 'assured' 
                 
]
negative_words = ['war',
                  'solidiers', 
                  'turmoil', 
                  'injur',
                  'trouble', 
                  'aggressive', 
                  'killed', 
                  'coup', 
                  'evasion', 
                  'strike', 
                  'troops', 
                  'dismisses', 
                  'attacks', 
                  'defeat', 
                  'damage', 
                  'dishonest', 
                  'dead', 
                  'fear', 
                  'foul', 
                  'fails', 
                  'hostile', 
                  'cuts', 
                  'accusations', 
                  'victims',  
                  'death', 
                  'unrest', 
                  'fraud', 
                  'dispute', 
                  'destruction', 
                  'battle', 
                  'unhappy', 
                  'bad', 
                  'alarming', 
                  'angry', 
                  'anxious', 
                  'dirty', 
                  'pain', 
                  'poison', 
                  'unfair', 
                  'unhealthy',
                  'trump',
                  'mueller',
                  'criminal',
                  'judge',
                  'mccabe',
                  'court',
                  'contempt',             
]
  

In [6]:
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1
# to look up words and provide proper labels
def keyword_lookup(x, keywords, label):  
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN
# assigning correct labels
def labeling_keywords(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

positive_keywords = labeling_keywords(keywords=positive_words)
negative_keywords = labeling_keywords(keywords=negative_words, label=NEGATIVE)

In [10]:
# determining polarity and subjectivity using textblob pretrained classifier 
@preprocessor(memoize=True)
def sentiment_by_textblob(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x
# polarity
@labeling_function(pre=[sentiment_by_textblob])
def _polarity(x):
    return POSITIVE if x.polarity > 0.6 else ABSTAIN
# subjectivity 
@labeling_function(pre=[sentiment_by_textblob])
def _subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

In [13]:

#combining label functions
lfs  = [positive_keywords, negative_keywords, _polarity, _subjectivity]
applier = PandasLFApplier(lfs=lfs)

L_snorkel = applier.apply(df=df)
label_model = LabelModel(cardinality=2, verbose=True)

label_model.fit(L_snorkel)
df["label"] = label_model.predict(L=L_snorkel)

100%|██████████| 100000/100000 [02:28<00:00, 671.94it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.016]
  1%|          | 1/100 [00:00<00:11,  8.70epoch/s]INFO:root:[10 epochs]: TRAIN:[loss=0.006]
INFO:root:[20 epochs]: TRAIN:[loss=0.000]
INFO:root:[30 epochs]: TRAIN:[loss=0.001]
INFO:root:[40 epochs]: TRAIN:[loss=0.001]
INFO:root:[50 epochs]: TRAIN:[loss=0.000]
INFO:root:[60 epochs]: TRAIN:[loss=0.000]
INFO:root:[70 epochs]: TRAIN:[loss=0.000]
 73%|███████▎  | 73/100 [00:00<00:00, 401.83epoch/s]INFO:root:[80 epochs]: TRAIN:[loss=0.000]
INFO:root:[90 epochs]: TRAIN:[loss=0.000]
100%|██████████| 100/100 [00:00<00:00, 397.20epoch/s]
INFO:root:Finished Training


In [14]:
# Filtering unlabeled data
df= df.loc[df.label.isin([0,1]), :]

In [15]:
data = df.copy()

text = list(data['text'])
labels = list(data['label'])
# training and test data of text
training_text = text[0:20000]
testing_text = text[20000:]
# training and test data of labels
training_labels = labels[0:20000]
testing_labels = labels[20000:]

In [16]:
# preprocessing
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(training_text)

word_index = tokenizer.word_index
#sequencing and padding
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=100, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=100, padding='post', truncating='post')
# TensorFlow input is in np array 
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [17]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=100),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
 
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [18]:
num_epochs = 15
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), 
                    verbose=2)

Epoch 1/15
625/625 - 4s - loss: 0.6356 - accuracy: 0.6332 - val_loss: 0.5364 - val_accuracy: 0.6886 - 4s/epoch - 6ms/step
Epoch 2/15
625/625 - 3s - loss: 0.3306 - accuracy: 0.8854 - val_loss: 0.2338 - val_accuracy: 0.9167 - 3s/epoch - 4ms/step
Epoch 3/15
625/625 - 2s - loss: 0.1522 - accuracy: 0.9553 - val_loss: 0.1500 - val_accuracy: 0.9509 - 2s/epoch - 4ms/step
Epoch 4/15
625/625 - 2s - loss: 0.0958 - accuracy: 0.9722 - val_loss: 0.1275 - val_accuracy: 0.9544 - 2s/epoch - 4ms/step
Epoch 5/15
625/625 - 3s - loss: 0.0678 - accuracy: 0.9804 - val_loss: 0.1067 - val_accuracy: 0.9611 - 3s/epoch - 4ms/step
Epoch 6/15
625/625 - 3s - loss: 0.0497 - accuracy: 0.9855 - val_loss: 0.0947 - val_accuracy: 0.9653 - 3s/epoch - 4ms/step
Epoch 7/15
625/625 - 3s - loss: 0.0383 - accuracy: 0.9898 - val_loss: 0.0936 - val_accuracy: 0.9665 - 3s/epoch - 4ms/step
Epoch 8/15
625/625 - 3s - loss: 0.0297 - accuracy: 0.9922 - val_loss: 0.0873 - val_accuracy: 0.9675 - 3s/epoch - 4ms/step
Epoch 9/15
625/625 - 3s 

In [54]:
print('Headlines:\n')
Analysis_results = []
for headline in get_headlines().values():
    new_headline = headline['Headline']
    print(new_headline)
    sequences = tokenizer.texts_to_sequences([new_headline])
    padded_seqs = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
    Analysis_results.append(model.predict(padded_seqs)[0][0])

Headlines:

'Floods hit South Africa's KwaZulu-Natal province again'
'Mozambique: Cyclone Gombe death toll rises to 53'
'Mozambique announces new prime minister after cabinet reshuffle'
'Analysis: Can African gas replace Russian supplies to Europe?'
'Dozens dead from Tropical Storm Ana in southern Africa'
'Southern Africa bloc SADC extends Mozambique mission'
'Climate change and famine | Start Here'
'In Mozambique, Kagame says Rwandan troops' work not over'
'Rwanda, Mozambique forces recapture port city from rebels'
'Rwanda deploys 1,000 soldiers to Mozambique's Cabo Delgado'


In [51]:
print('Towards 1 is positive sentiment and Towards 0 is negative sentiment\n')
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1,len(Analysis_results)+1)),
    y=Analysis_results
))

fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    paper_bgcolor='lightgrey'
)

# showing the plot
fig.show()

Towards 1 is positive sentiment and Towards 0 is negative sentiment

