In [6]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Dataset Overview

In [7]:
df = pd.read_csv("/Users/enrique/code/EFRdev/08-Final-Project/SolarSoundBytes/raw_data/guardian_environment_news.csv")
df.head()

Unnamed: 0,Title,Intro Text,Authors,Article Text,Date Published
0,Liz Truss ‘will approve more oil drilling if ...,Tory leadership candidate criticised by campai...,"['Rob Davies', '@ByRobDavies']",Liz Truss will sign off on a push for more oil...,2022-08-30
1,Renewed Highland golf course plan has environm...,Scottish government rejected a new links at Co...,"['Ewan Murray', '@mrewanmurray']",It is an area so tranquil that the notion of b...,2021-03-22
2,Visiting green spaces deters mental health dr...,Positive effects were stronger among those rep...,"['Damien Gayle', '@damiengayle']","Visits to parks, community gardens and other u...",2023-01-17
3,Bought too much red cabbage? Turn it into a fe...,This fantastic vegan centrepiece makes full us...,['Tom Hunt'],"I devised today’s nut roast for Oddbox, a veg ...",2023-12-22
4,‘This year has been very good’: readers’ UK bu...,Readers share their favourite sightings over t...,['Guardian readers'],‘Constant companions to our gardening’A peacoc...,2023-12-19


In [8]:
df.shape

(30059, 5)

# Delete or Impute Nulls Values

In [9]:
print(df.isnull().sum())

Title              948
Intro Text          82
Authors           4570
Article Text       368
Date Published    2441
dtype: int64


In [10]:
df = df.dropna(subset =['Article Text', 'Date Published', 'Authors'])
df.shape

(23404, 5)

In [11]:
df['Title'] = df['Title'].fillna('No Title')
df['Intro Text'] = df['Intro Text'].fillna('No Intro Text')

In [12]:
print(df.isnull().sum())

df.shape

Title             0
Intro Text        0
Authors           0
Article Text      0
Date Published    0
dtype: int64


(23404, 5)

In [13]:
df.head()

Unnamed: 0,Title,Intro Text,Authors,Article Text,Date Published
0,Liz Truss ‘will approve more oil drilling if ...,Tory leadership candidate criticised by campai...,"['Rob Davies', '@ByRobDavies']",Liz Truss will sign off on a push for more oil...,2022-08-30
1,Renewed Highland golf course plan has environm...,Scottish government rejected a new links at Co...,"['Ewan Murray', '@mrewanmurray']",It is an area so tranquil that the notion of b...,2021-03-22
2,Visiting green spaces deters mental health dr...,Positive effects were stronger among those rep...,"['Damien Gayle', '@damiengayle']","Visits to parks, community gardens and other u...",2023-01-17
3,Bought too much red cabbage? Turn it into a fe...,This fantastic vegan centrepiece makes full us...,['Tom Hunt'],"I devised today’s nut roast for Oddbox, a veg ...",2023-12-22
4,‘This year has been very good’: readers’ UK bu...,Readers share their favourite sightings over t...,['Guardian readers'],‘Constant companions to our gardening’A peacoc...,2023-12-19


# Text Cleaning:
**Preprocessing:** lowercase, delete numbers, punctuation and symbols (#"*!&%), splitting, tokenizing?, removing stopwords, lemmatizing

In [14]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("","", string.punctuation))
    text = ''.join(char for char in text if not char.isdigit())

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

df['Clean Title'] = df['Title'].apply(preprocess_text)
df['Clean Intro Text'] = df['Intro Text'].apply(preprocess_text)
df['Clean Article Text'] = df['Article Text'].apply(preprocess_text)
df['Author_Domain'] = df['Authors'].apply(preprocess_text)
df_clean = df.copy()

df_clean = df.drop(columns=['Title','Intro Text','Article Text', 'Authors'])
df_clean.head()

Unnamed: 0,Date Published,Clean Title,Clean Intro Text,Clean Article Text,Author_Domain
0,2022-08-30,liz truss ‘will approve oil drilling becomes pm’,tory leadership candidate criticised campaigne...,liz truss sign push oil drilling north sea win...,rob davy byrobdavies
1,2021-03-22,renewed highland golf course plan environmenta...,scottish government rejected new link coul pro...,area tranquil notion bitter dispute hugely ano...,ewan murray mrewanmurray
2,2023-01-17,visiting green space deters mental health drug...,positive effect stronger among reporting lowes...,visit park community garden urban green space ...,damien gayle damiengayle
3,2023-12-22,bought much red cabbage turn festive nut roast...,fantastic vegan centrepiece make full use oute...,devised today’s nut roast oddbox veg box outfi...,tom hunt
4,2023-12-19,‘this year good’ readers’ uk butterfly sighting,reader share favourite sighting summer news nu...,‘constant companion gardening’a peacock butter...,guardian reader


In [15]:
df_clean['Date Published'] = pd.to_datetime(df_clean['Date Published'], dayfirst=True, errors='coerce')
print(df_clean['Date Published'].dtype)

datetime64[ns]


  df_clean['Date Published'] = pd.to_datetime(df_clean['Date Published'], dayfirst=True, errors='coerce')


In [17]:
df_clean.head()

Unnamed: 0,Date Published,Clean Title,Clean Intro Text,Clean Article Text,Author_Domain
0,2022-08-30,liz truss ‘will approve oil drilling becomes pm’,tory leadership candidate criticised campaigne...,liz truss sign push oil drilling north sea win...,rob davy byrobdavies
1,2021-03-22,renewed highland golf course plan environmenta...,scottish government rejected new link coul pro...,area tranquil notion bitter dispute hugely ano...,ewan murray mrewanmurray
2,2023-01-17,visiting green space deters mental health drug...,positive effect stronger among reporting lowes...,visit park community garden urban green space ...,damien gayle damiengayle
3,2023-12-22,bought much red cabbage turn festive nut roast...,fantastic vegan centrepiece make full use oute...,devised today’s nut roast oddbox veg box outfi...,tom hunt
4,2023-12-19,‘this year good’ readers’ uk butterfly sighting,reader share favourite sighting summer news nu...,‘constant companion gardening’a peacock butter...,guardian reader


# Sentiment Analysis

In [18]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [19]:
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


Device set to use mps:0


In [20]:
def analyze_sentiment(text):
    try:
        result = sentiment_pipeline(text[:500])[0]  # Evitar textos muy largos
        return pd.Series([result['label'], result['score']])
    except:
        return pd.Series([None, None])

In [23]:
df_clean[['sentiment', 'sentiment_conf']] = df_clean['Clean Article Text'].apply(analyze_sentiment)
df_clean.head()

Unnamed: 0,Date Published,Clean Title,Clean Intro Text,Clean Article Text,Author_Domain,sentiment,sentiment_conf
0,2022-08-30,liz truss ‘will approve oil drilling becomes pm’,tory leadership candidate criticised campaigne...,liz truss sign push oil drilling north sea win...,rob davy byrobdavies,NEGATIVE,0.988882
1,2021-03-22,renewed highland golf course plan environmenta...,scottish government rejected new link coul pro...,area tranquil notion bitter dispute hugely ano...,ewan murray mrewanmurray,NEGATIVE,0.98986
2,2023-01-17,visiting green space deters mental health drug...,positive effect stronger among reporting lowes...,visit park community garden urban green space ...,damien gayle damiengayle,NEGATIVE,0.973724
3,2023-12-22,bought much red cabbage turn festive nut roast...,fantastic vegan centrepiece make full use oute...,devised today’s nut roast oddbox veg box outfi...,tom hunt,POSITIVE,0.982985
4,2023-12-19,‘this year good’ readers’ uk butterfly sighting,reader share favourite sighting summer news nu...,‘constant companion gardening’a peacock butter...,guardian reader,POSITIVE,0.994466


In [25]:
df_clean['sentiment'].value_counts()

sentiment
NEGATIVE    19784
POSITIVE     3620
Name: count, dtype: int64

In [28]:
df_clean.sample(5)

Unnamed: 0,Date Published,Clean Title,Clean Intro Text,Clean Article Text,Author_Domain,sentiment,sentiment_conf
7781,2022-08-25,solar panel fix energy bill sun shine,demand go roof struggling british household tr...,british household racing install rooftop solar...,patrick collinson,NEGATIVE,0.577317
16559,2019-11-24,uk weather north england brace heavy rainfall,weather warning place lincolnshire nottinghams...,part northern england midland devastated sever...,josh halliday north england correspondent,NEGATIVE,0.993934
4016,2022-10-19,fish oil fishmeal industry harming food securi...,campaigner say sector lead overexploitation st...,un’s food agency warned “overexploitation” fis...,seascape state ocean supported content karen m...,NEGATIVE,0.998585
7317,2021-10-31,australia’s net zero plan could cost far bn al...,energy minister refuse detail full cost reachi...,coalition’s “technology taxes” plan net zero e...,katharine murphy paul karp,NEGATIVE,0.992748
18899,2020-08-25,pupils’ climate change strike threat pose dile...,thousand pupil set absent february putting sch...,headteachers across country week faced tricky ...,jamie doward,NEGATIVE,0.993159
