In [1]:
import pandas as pd
import re
import numpy as np
import string
import nltk
from textblob import TextBlob
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import load_model
import nest_asyncio
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs



Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# Load the ABC news headlines, expose the headline column as `text`,
# and make sure every entry in that column is a Python string.
df = (
    pd.read_csv('abcnews-date-text.csv')
    .rename(columns={'headline_text': 'text'})
    .assign(text=lambda frame: frame['text'].astype(str))
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 2 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   publish_date  1244184 non-null  int64 
 1   text          1244184 non-null  object
dtypes: int64(1), object(1)
memory usage: 19.0+ MB


In [3]:
# English stop words used by clean_text. Stored as a set because the only use is
# `word not in stopword`, which is a constant-time lookup on a set but a linear
# scan on the list that NLTK returns.
stopword = set(nltk.corpus.stopwords.words('english'))

def clean_text(text, stopwords=None):
    """Strip punctuation from ``text``, split it into tokens and drop stop words.

    Parameters
    ----------
    text : str
        Raw text. Case is left untouched, so callers that want
        case-insensitive stop-word matching must lower-case it first.
    stopwords : collection of str, optional
        Words to discard. When omitted, the module-level ``stopword``
        collection is used.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Empty strings, which
        ``re.split`` yields for leading/trailing separators, are never returned.
    """
    if stopwords is None:
        stopwords = stopword

    # Delete every ASCII punctuation character in a single pass.
    without_punct = text.translate(str.maketrans("", "", string.punctuation))

    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    tokens = re.split(r"\W+", without_punct)

    return [token for token in tokens if token and token not in stopwords]


In [4]:
# Lower-case each headline and tokenize it with clean_text; the `text` column
# then holds one list of tokens per row instead of the raw string.
cleaned = [clean_text(headline.lower()) for headline in df['text']]
df['text'] = cleaned


In [5]:
# One class label per headline, appended in row order by the labelling loop.
polarity = []

def getAnalysis(polarity):
    """Convert a sentiment polarity score into a class label.

    Parameters
    ----------
    polarity : float
        Signed polarity score of one text.

    Returns
    -------
    int
        0 for a negative score, 1 for a positive score, and -1 when the
        score is exactly zero (neutral).
    """
    # Use the argument itself; the previous body read a global named `score`
    # and therefore only worked when the caller happened to set that global.
    if polarity < 0:
        return 0
    elif polarity > 0:
        return 1
    else:
        return -1
            
# Score each tokenized headline with TextBlob and turn the score into a label.
for tokens in df['text']:
    text = " ".join(tokens)
    score = TextBlob(text).sentiment.polarity
    polarity.append(getAnalysis(score))

df['label'] = polarity

# Discard the rows labelled -1 so only the 0/1 classes remain.
df = df[df['label'] != -1]

# Persist the labelled frame; the `text` column still holds token lists here.
df.to_csv("training.csv")


In [6]:
data = pd.read_csv('training.csv')

# The `text` column was written from lists of tokens, so each cell reads back as
# the string form of a list, e.g. "['police', 'arrest']". The Keras Tokenizer's
# default filters do not remove the apostrophe, so without this step the
# vocabulary would be made of quoted tokens such as "'police'". Strip brackets,
# quotes and commas to leave plain space-separated words. The replacement is a
# no-op on text that is already plain.
text = (
    data['text']
    .astype(str)
    .str.replace(r"[\[\]'\",]", "", regex=True)
    .str.strip()
    .tolist()
)
labels = list(data['label'])

# Number of leading rows used for training; every remaining row is held out.
TRAIN_SIZE = 15000

training_text = text[:TRAIN_SIZE]
testing_text = text[TRAIN_SIZE:]

training_labels = labels[:TRAIN_SIZE]
testing_labels = labels[TRAIN_SIZE:]

In [7]:
# Build the vocabulary from the training split only: the 10000 most frequent
# words are kept and anything else is mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index


def _to_padded(sequences):
    """Pad or truncate every sequence at its end to 120 ids and return an ndarray."""
    return np.array(pad_sequences(sequences, maxlen=120, padding='post', truncating='post'))


training_sequences = tokenizer.texts_to_sequences(training_text)
testing_sequences = tokenizer.texts_to_sequences(testing_text)

training_padded = _to_padded(training_sequences)
testing_padded = _to_padded(testing_sequences)

# Labels as arrays as well, alongside the padded inputs.
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [8]:
# Small bag-of-embeddings classifier: embed the 120 word ids, average the
# embeddings over the sequence, then a hidden layer and one sigmoid output unit.
model = Sequential([
    Embedding(10000, 16, input_length=120),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Metal device set to: Apple M1
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


2024-06-18 21:15:14.885239: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-18 21:15:14.885901: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
# Train for a fixed number of epochs, evaluating on the held-out split after each one.
num_epochs = 10
history = model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=num_epochs,
    verbose=2,
)

2024-06-18 21:15:14.998013: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2024-06-18 21:15:15.000635: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10


2024-06-18 21:15:15.173416: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2024-06-18 21:15:19.030573: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


469/469 - 34s - loss: 0.6767 - accuracy: 0.5818 - val_loss: 0.6731 - val_accuracy: 0.5655
Epoch 2/10
469/469 - 34s - loss: 0.5754 - accuracy: 0.7036 - val_loss: 0.4685 - val_accuracy: 0.7439
Epoch 3/10
469/469 - 33s - loss: 0.2688 - accuracy: 0.9357 - val_loss: 0.2440 - val_accuracy: 0.9319
Epoch 4/10
469/469 - 33s - loss: 0.1363 - accuracy: 0.9668 - val_loss: 0.1862 - val_accuracy: 0.9381
Epoch 5/10
469/469 - 34s - loss: 0.0890 - accuracy: 0.9776 - val_loss: 0.1517 - val_accuracy: 0.9492
Epoch 6/10
469/469 - 33s - loss: 0.0640 - accuracy: 0.9839 - val_loss: 0.1337 - val_accuracy: 0.9534
Epoch 7/10
469/469 - 33s - loss: 0.0483 - accuracy: 0.9873 - val_loss: 0.1265 - val_accuracy: 0.9550
Epoch 8/10
469/469 - 33s - loss: 0.0375 - accuracy: 0.9905 - val_loss: 0.1187 - val_accuracy: 0.9579
Epoch 9/10
469/469 - 33s - loss: 0.0291 - accuracy: 0.9933 - val_loss: 0.1154 - val_accuracy: 0.9590
Epoch 10/10
469/469 - 33s - loss: 0.0234 - accuracy: 0.9947 - val_loss: 0.1133 - val_accuracy: 0.9597


In [10]:
nest_asyncio.apply()
session = HTMLSession()

# Fetch the CNN opinions landing page and parse its HTML.
r = session.get('https://edition.cnn.com/opinions')
html_str = r.text
soup = bs(html_str, "html.parser")

# Article links are the anchors carrying exactly this class string.
content = soup.find_all('a', class_="container__link container__link--type-article container_lead-plus-headlines__link container_lead-plus-headlines__left container_lead-plus-headlines__light")

# Each href is appended to the site root to form the article URL.
urls = ['https://edition.cnn.com' + anchor['href'] for anchor in content]


In [11]:
# Start a fresh frame for the scraped articles, one row per URL.
df = pd.DataFrame({'urls': urls})

In [12]:
titles = []
nest_asyncio.apply()
session = HTMLSession()

for url in urls:
    r = session.get(url)
    html_str = r.text
    soup = bs(html_str, "html.parser")

    # First matching headline element, or None when the page has none.
    headline = soup.find("h1", class_="headline__text inline-placeholder vossi-headline-primary-core-light")

    # Record exactly one entry per URL so `titles` always lines up with the
    # rows of `df`; collecting every match (or none) made the column
    # assignment below fail with a length mismatch.
    # NOTE(review): [16:] drops the first 16 characters of the heading text,
    # presumably leading whitespace padding. Confirm against the live page.
    titles.append(headline.text[16:] if headline is not None else "")

df['titles'] = titles


In [14]:
# Persist the trained model to 'sentiment_analysis_model.h5' in the working directory.
model.save('sentiment_analysis_model.h5')

In [15]:
# Rebind `model` to the copy read back from 'sentiment_analysis_model.h5'.
model = load_model('sentiment_analysis_model.h5')

In [16]:
# Lower-case each scraped title and tokenize it with clean_text.
cleaned = [clean_text(title.lower()) for title in df['titles']]
df['cleaned_titles'] = cleaned

# Snapshot of the frame as it stands at this point.
df.to_csv("predictions.csv")


In [17]:
text = list(df['cleaned_titles'])

# Deliberately do NOT create or fit a new Tokenizer here. The embedding layer
# learned its weights for the word ids assigned by the tokenizer fitted on
# `training_text`. A tokenizer fitted on the scraped titles would assign
# different ids to the same words and make the model's output meaningless.
# The existing `tokenizer` is therefore left in place for the prediction step.
# NOTE(review): the titles must be rendered to text the same way the training
# headlines were before tokenizing. Confirm the two formats agree.


In [24]:
# Render every cleaned title (a list of tokens) as one lower-case string.
title_texts = [" ".join(tokens).lower() for tokens in df['cleaned_titles']]

if title_texts:
    sequences = tokenizer.texts_to_sequences(title_texts)
    padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')

    # Score all titles in a single batched call instead of one predict() per
    # title; the model emits one sigmoid probability per row.
    probabilities = model.predict(padded_seqs, verbose=0)

    # Threshold at 0.5: strictly greater gives 1, otherwise 0. This matches
    # the previous round(), which also sends exactly 0.5 to 0.
    predictions = [int(probability > 0.5) for probability in probabilities[:, 0]]
else:
    # Nothing was scraped: skip predict(), which cannot run on an empty batch.
    predictions = []

In [26]:
# Attach one predicted label per row; `predictions` must have exactly len(df) entries.
df['results'] = predictions

In [27]:
# Write the frame, now including the `results` column, over the earlier predictions.csv.
df.to_csv("predictions.csv")