In [5]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime


# Dataset Overview

In [6]:
df = pd.read_csv("/Users/enrique/code/EFRdev/08-Final-Project/SolarSoundBytes/raw_data/cleantech_media_dataset_v3_2024-10-28.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,93320,"XPeng Delivered ~100,000 Vehicles In 2021",2022-01-02,,['Chinese automotive startup XPeng has shown o...,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...
1,93321,Green Hydrogen: Drop In Bucket Or Big Splash?,2022-01-02,,['Sinopec has laid plans to build the largest ...,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...
2,98159,World’ s largest floating PV plant goes online...,2022-01-03,,['Huaneng Power International has switched on ...,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...
3,98158,Iran wants to deploy 10 GW of renewables over ...,2022-01-03,,"['According to the Iranian authorities, there ...",pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...
4,31128,Eastern Interconnection Power Grid Said ‘ Bein...,2022-01-03,,['Sign in to get the best natural gas news and...,naturalgasintel,https://www.naturalgasintel.com/eastern-interc...


In [7]:
df.shape


(20111, 7)

In [8]:
df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')


In [9]:
df['date'].isna().sum()

12966

In [10]:
unique_domains = df['domain'].unique()
unique_domains

array(['cleantechnica', 'pv-magazine', 'naturalgasintel', 'energyvoice',
       'solarpowerworldonline', 'solarindustrymag', 'thinkgeoenergy',
       'energy-xprt', 'azocleantech', 'pv-tech', 'rechargenews',
       'solarpowerportal.co', 'energyintel', 'greenprophet', 'ecofriend',
       'eurosolar', 'greenairnews', 'all-energy', 'iea', 'decarbxpo',
       'biofuels-news', 'solarquarter', 'storagesummit', 'indorenergy',
       'bex-asia'], dtype=object)

In [11]:
uk_domains = ['energyvoice', 'solarpowerportal.co', 'biofuels-news', 'pv-magazine', 'thinkgeoenergy', 'cleantechnica']
df_uk = df[df['domain'].isin(uk_domains)].copy()
df_uk

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,93320,"XPeng Delivered ~100,000 Vehicles In 2021",2022-02-01,,['Chinese automotive startup XPeng has shown o...,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...
1,93321,Green Hydrogen: Drop In Bucket Or Big Splash?,2022-02-01,,['Sinopec has laid plans to build the largest ...,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...
2,98159,World’ s largest floating PV plant goes online...,2022-03-01,,['Huaneng Power International has switched on ...,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...
3,98158,Iran wants to deploy 10 GW of renewables over ...,2022-03-01,,"['According to the Iranian authorities, there ...",pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...
6,42776,BP: Commitment to Scotland is ‘ unique differe...,2022-03-01,,['BP’ s “ long-term ” commitment to Scotland i...,energyvoice,https://www.energyvoice.com/renewables-energy-...
...,...,...,...,...,...,...,...
20101,101432,European Commission offers share of €4.8 billi...,NaT,,['The European Commission is offering 85 net-z...,pv-magazine,https://www.pv-magazine.com/2024/10/24/europea...
20102,101433,Sunwind Energy’ s new PV planter combines cult...,NaT,,['Specializing in the design of custom solar s...,pv-magazine,https://www.pv-magazine.com/2024/10/24/sunwind...
20108,101434,Australia has 7.8 GW of utility-scale batterie...,NaT,,['The volume of large-scale battery energy sto...,pv-magazine,https://www.pv-magazine.com/2024/10/24/austral...
20109,101428,Residential PV prices in Germany drop 25% with...,NaT,,"['The comparison site Selfmade Energy shows, i...",pv-magazine,https://www.pv-magazine.com/2024/10/24/residen...


In [12]:
df_uk.shape

(7913, 7)

In [13]:
df_uk['date'].isna().sum()

4547

## There are NOT a lot of dates in this dataset! 
Luckly this dataset contains URL column where I can fetch the date in the format "YYYY/MM/DD"

In [14]:
def extract_date_from_url(url):
    # Match YYYY/MM/DD pattern
    match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
    if match:
        try:
            # Format to standard datetime or string
            return datetime.strptime('/'.join(match.groups()), '%Y/%m/%d').date()
        except ValueError:
            return None
    return None

# Apply only to missing dates
df_uk['date'] = df_uk.apply(
    lambda row: extract_date_from_url(row['url']) if pd.isna(row['date']) else row['date'],
    axis=1
)

In [15]:
df_uk[df_uk['date'].isna()]['url'].head

<bound method NDFrame.head of 150      https://www.solarpowerportal.co.uk/cfd_portfol...
154      https://www.thinkgeoenergy.com/drilling-begins...
156      https://www.thinkgeoenergy.com/bc-canada-first...
170      https://www.energyvoice.com/renewables-energy-...
176      https://www.energyvoice.com/renewables-energy-...
                               ...                        
20082    https://www.pv-magazine.com/press-releases/sol...
20086    https://www.pv-magazine.com/press-releases/jin...
20091    https://www.pv-magazine.com/press-releases/hua...
20094    https://www.pv-magazine.com/press-releases/atw...
20095    https://www.pv-magazine.com/webinars/the-impac...
Name: url, Length: 2093, dtype: object>

In [16]:
df_uk['date'].isna().sum()


2093

There are still 2093 articles without date... we are going to drop them.

In [17]:
df_uk = df_uk.dropna(subset=['date']).copy()
df_uk.shape

(5820, 7)

# Delete or Impute Nulls Values

In [18]:
df_uk.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,93320,"XPeng Delivered ~100,000 Vehicles In 2021",2022-02-01 00:00:00,,['Chinese automotive startup XPeng has shown o...,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...
1,93321,Green Hydrogen: Drop In Bucket Or Big Splash?,2022-02-01 00:00:00,,['Sinopec has laid plans to build the largest ...,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...
2,98159,World’ s largest floating PV plant goes online...,2022-03-01 00:00:00,,['Huaneng Power International has switched on ...,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...
3,98158,Iran wants to deploy 10 GW of renewables over ...,2022-03-01 00:00:00,,"['According to the Iranian authorities, there ...",pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...
6,42776,BP: Commitment to Scotland is ‘ unique differe...,2022-03-01 00:00:00,,['BP’ s “ long-term ” commitment to Scotland i...,energyvoice,https://www.energyvoice.com/renewables-energy-...


In [19]:
df_uk = df_uk.drop(columns=['author'])

In [20]:
print(df_uk.isnull().sum())
df_uk.shape

Unnamed: 0    0
title         0
date          0
content       0
domain        0
url           0
dtype: int64


(5820, 6)

# Text Cleaning:
**Preprocessing:** lowercase, delete numbers, punctuation and symbols (#"*!&%), splitting, tokenizing?, removing stopwords, lemmatizing

In [81]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("","", string.punctuation))
    text = ''.join(char for char in text if not char.isdigit())

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

df_uk['Clean Title'] = df_uk['title'].apply(preprocess_text)
df_uk['Clean Content'] = df_uk['content'].apply(preprocess_text)
df_uk_clean = df_uk.copy()

df_uk_clean = df_uk.drop(columns=['title','content','url', 'Unnamed: 0'])
df_uk_clean.head()

Unnamed: 0,date,domain,Clean Title,Clean Content
0,2022-02-01 00:00:00,cleantechnica,xpeng delivered vehicle,chinese automotive startup xpeng shown one dra...
1,2022-02-01 00:00:00,cleantechnica,green hydrogen drop bucket big splash,sinopec laid plan build largest green hydrogen...
2,2022-03-01 00:00:00,pv-magazine,world’ largest floating pv plant go online chi...,huaneng power international switched mw floati...
3,2022-03-01 00:00:00,pv-magazine,iran want deploy gw renewables next four year ...,according iranian authority currently gw renew...
6,2022-03-01 00:00:00,energyvoice,bp commitment scotland ‘ unique differentiator...,bp’ “ longterm ” commitment scotland key selli...


### Date column needs to be **dtype** format!

In [82]:
print(df_uk_clean['date'].dtype)

object


In [83]:
df_uk_clean['date'] = pd.to_datetime(df_uk_clean['date'], dayfirst=True, errors='coerce')
print(df_uk_clean['date'].dtype)

datetime64[ns]


In [84]:
df_uk_clean.head()

Unnamed: 0,date,domain,Clean Title,Clean Content
0,2022-02-01,cleantechnica,xpeng delivered vehicle,chinese automotive startup xpeng shown one dra...
1,2022-02-01,cleantechnica,green hydrogen drop bucket big splash,sinopec laid plan build largest green hydrogen...
2,2022-03-01,pv-magazine,world’ largest floating pv plant go online chi...,huaneng power international switched mw floati...
3,2022-03-01,pv-magazine,iran want deploy gw renewables next four year ...,according iranian authority currently gw renew...
6,2022-03-01,energyvoice,bp commitment scotland ‘ unique differentiator...,bp’ “ longterm ” commitment scotland key selli...


# Change the column names to match "TheGuardian Dataset":
"Date Published, Clean Title, Clean Article Text, Author/Domain"

In [120]:
df_uk_clean = df_uk_clean.rename(columns={
    'date' : 'Date Published',
    'Clean Content' : 'Clean Article Text',
    'domain' : 'Author_Domain'
    })
df_uk_clean

Unnamed: 0,Date Published,Author_Domain,Clean Title,Clean Article Text,sentiment
0,2022-02-01,cleantechnica,xpeng delivered vehicle,chinese automotive startup xpeng shown one dra...,chinese automotive startup xpeng shown one dra...
1,2022-02-01,cleantechnica,green hydrogen drop bucket big splash,sinopec laid plan build largest green hydrogen...,sinopec laid plan build largest green hydrogen...
2,2022-03-01,pv-magazine,world’ largest floating pv plant go online chi...,huaneng power international switched mw floati...,huaneng power international switched mw floati...
3,2022-03-01,pv-magazine,iran want deploy gw renewables next four year ...,according iranian authority currently gw renew...,according iranian authority currently gw renew...
6,2022-03-01,energyvoice,bp commitment scotland ‘ unique differentiator...,bp’ “ longterm ” commitment scotland key selli...,bp’ “ longterm ” commitment scotland key selli...
...,...,...,...,...,...
20101,2024-10-24,pv-magazine,european commission offer share € billion fund...,european commission offering netzero project s...,european commission offering netzero project s...
20102,2024-10-24,pv-magazine,sunwind energy’ new pv planter combine cultiva...,specializing design custom solar system sunwin...,specializing design custom solar system sunwin...
20108,2024-10-24,pv-magazine,australia gw utilityscale battery construction...,volume largescale battery energy storage proje...,volume largescale battery energy storage proje...
20109,2024-10-24,pv-magazine,residential pv price germany drop within month...,comparison site selfmade energy show addition ...,comparison site selfmade energy show addition ...


# Sentiment Analysis Testing
Let's try different models, starting with the basic from "Your first Transformers Challenge" -- **"twitter-roberta-base-sentiment-latest"**

In [91]:
!pip install transformers torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [109]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [122]:
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
sentiment_pipeline

sentiment_pipeline("Transformers are awesome!")[0]['label']

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


'positive'

In [121]:
df_uk_clean['sentiment'] = df_uk_clean['Clean Article Text']
df_uk_clean.head(1)

Unnamed: 0,Date Published,Author_Domain,Clean Title,Clean Article Text,sentiment
0,2022-02-01,cleantechnica,xpeng delivered vehicle,chinese automotive startup xpeng shown one dra...,chinese automotive startup xpeng shown one dra...


In [119]:
df_uk_clean['sentiment'] = df_uk_clean['sentiment'].map(lambda x : sentiment_pipeline(x[:500])[0]['label'])
df_uk_clean.head()

KeyboardInterrupt: 

In [79]:
df_uk_clean['sentiment_conf'] = df_uk_clean['sentiment']

In [80]:
df_uk_clean['sentiment_conf'] = df_uk_clean['sentiment_conf'].map(lambda x : x['score'])
df_uk_clean['sentiment'] = df_uk_clean['sentiment'].map(lambda x : x['label'])

TypeError: string indices must be integers

In [65]:
print(df_uk_clean[df_uk_clean['sentiment'] == 'NEGATIVE'].sample(10)['Clean Title'].values)

['weekend read chaotic leadership leaf south african dark – pv magazine international'
 'big solar project negative impact property value say u study – pv magazine international'
 'new technique repair solder interconnection failure solar panel – pv magazine international'
 'afghanistan – pv magazine international'
 'aerial survey explore geothermal potential lower saxony germany'
 'iea pvps – pv magazine international'
 'college team compete wind energy water energy tech — winner announced'
 'jamaican utility launch solarplusstorage wind project tender – pv magazine international'
 'breakthrough producing perovskite solar cell ai – pv magazine international'
 'hemetsberger wage battle gender parity solarpower europe – pv magazine international']


In [58]:
df_uk_clean['sentiment'].value_counts(normalize=True)

sentiment
NEGATIVE    0.618385
POSITIVE    0.381615
Name: proportion, dtype: float64

In [None]:


tokenizer = AutoTokenizer.from_pretrained("fhamborg/roberta-targeted-sentiment-classification-newsarticles")
model = AutoModelForSequenceClassification.from_pretrained("fhamborg/roberta-targeted-sentiment-classification-newsarticles")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at fhamborg/roberta-targeted-sentiment-classification-newsarticles and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.word_embeddings.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention

In [107]:
def analyze_sentiment(text, target):
    text = text[:500]
    # Formatear la entrada como: [CLS] texto [SEP] objetivo [SEP]
    inputs = tokenizer(text, target, return_tensors="pt", truncation=True)
    outputs = model(**inputs)

    # Calcular las probabilidades
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Obtener la etiqueta de sentimiento
    sentiment = torch.argmax(probs).item()
    labels = ['negative', 'neutral', 'positive']
    return labels[sentiment], probs[0][sentiment].item()

In [108]:
target_entities = ['renewable energy', 'clean technology', 'solar power', 'wind energy', 'electric vehicles', 'renewable energy']

sentiments = []
confidences = []
targets = []
article_texts = []

for idx, row in df_uk_clean.iterrows():
    article_text = row['Clean Article Text']
    for target in target_entities:
        sentiment, confidence = analyze_sentiment(article_text, target)
        sentiments.append(sentiment)
        confidences.append(confidence)
        targets.append(target)
        article_texts.append(article_text)

results_df = pd.DataFrame({
    'Article Text': article_texts,
    'Target Entity': targets,
    'Sentiment': sentiments,
    'Confidence': confidences
})

KeyboardInterrupt: 

In [None]:
results_df.head())