In [4]:
import pandas as pd
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime


# Dataset Overview

In [5]:
df = pd.read_csv("/Users/enrique/code/EFRdev/08-Final-Project/SolarSoundBytes/raw_data/cleantech_media_dataset_v3_2024-10-28.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,93320,"XPeng Delivered ~100,000 Vehicles In 2021",2022-01-02,,['Chinese automotive startup XPeng has shown o...,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...
1,93321,Green Hydrogen: Drop In Bucket Or Big Splash?,2022-01-02,,['Sinopec has laid plans to build the largest ...,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...
2,98159,World’ s largest floating PV plant goes online...,2022-01-03,,['Huaneng Power International has switched on ...,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...
3,98158,Iran wants to deploy 10 GW of renewables over ...,2022-01-03,,"['According to the Iranian authorities, there ...",pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...
4,31128,Eastern Interconnection Power Grid Said ‘ Bein...,2022-01-03,,['Sign in to get the best natural gas news and...,naturalgasintel,https://www.naturalgasintel.com/eastern-interc...


In [6]:
df.shape


(20111, 7)

### Date column needs to be **dtype** format!

In [7]:
df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')
print(df['date'].dtype)

datetime64[ns]


In [8]:
unique_domains = df['domain'].unique()
unique_domains

array(['cleantechnica', 'pv-magazine', 'naturalgasintel', 'energyvoice',
       'solarpowerworldonline', 'solarindustrymag', 'thinkgeoenergy',
       'energy-xprt', 'azocleantech', 'pv-tech', 'rechargenews',
       'solarpowerportal.co', 'energyintel', 'greenprophet', 'ecofriend',
       'eurosolar', 'greenairnews', 'all-energy', 'iea', 'decarbxpo',
       'biofuels-news', 'solarquarter', 'storagesummit', 'indorenergy',
       'bex-asia'], dtype=object)

## There are NOT a lot of dates in this dataset! 
Luckly this dataset contains URL column where I can fetch the date in the format "YYYY/MM/DD"

In [9]:
df['date'].isna().sum()


12966

In [10]:
df[df['date'].isna()]['url'].head

<bound method NDFrame.head of 150      https://www.solarpowerportal.co.uk/cfd_portfol...
151      https://www.azocleantech.com/news.aspx?newsID=...
152          https://www.rechargenews.com/news/2-1-1144119
153      https://www.rechargenews.com/energy-transition...
154      https://www.thinkgeoenergy.com/drilling-begins...
                               ...                        
20106    https://www.pv-tech.org/us-treasury-finalises-...
20107    https://www.pv-tech.org/edp-trials-robotic-con...
20108    https://www.pv-magazine.com/2024/10/24/austral...
20109    https://www.pv-magazine.com/2024/10/24/residen...
20110    https://www.pv-magazine.com/2024/10/24/kaust-h...
Name: url, Length: 12966, dtype: object>

In [11]:
def extract_date_from_url(url):
    # Match YYYY/MM/DD pattern
    match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
    if match:
        try:
            # Format to standard datetime or string
            return datetime.strptime('/'.join(match.groups()), '%Y/%m/%d').date()
        except ValueError:
            return None
    return None

# Apply only to missing dates
df['date'] = df.apply(
    lambda row: extract_date_from_url(row['url']) if pd.isna(row['date']) else row['date'],
    axis=1
)

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,93320,"XPeng Delivered ~100,000 Vehicles In 2021",2022-02-01 00:00:00,,['Chinese automotive startup XPeng has shown o...,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...
1,93321,Green Hydrogen: Drop In Bucket Or Big Splash?,2022-02-01 00:00:00,,['Sinopec has laid plans to build the largest ...,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...
2,98159,World’ s largest floating PV plant goes online...,2022-03-01 00:00:00,,['Huaneng Power International has switched on ...,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...
3,98158,Iran wants to deploy 10 GW of renewables over ...,2022-03-01 00:00:00,,"['According to the Iranian authorities, there ...",pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...
4,31128,Eastern Interconnection Power Grid Said ‘ Bein...,2022-03-01 00:00:00,,['Sign in to get the best natural gas news and...,naturalgasintel,https://www.naturalgasintel.com/eastern-interc...


In [13]:
df['date'].isna().sum()

10173

There are still 10173 articles without date... we are going to drop them.

In [14]:
df = df.dropna(subset=['date']).copy()
df.shape

(9938, 7)

### There are **9,938 Articles** that have a date and we can work with!

# Delete or Impute Nulls Values

In [15]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,93320,"XPeng Delivered ~100,000 Vehicles In 2021",2022-02-01 00:00:00,,['Chinese automotive startup XPeng has shown o...,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...
1,93321,Green Hydrogen: Drop In Bucket Or Big Splash?,2022-02-01 00:00:00,,['Sinopec has laid plans to build the largest ...,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...
2,98159,World’ s largest floating PV plant goes online...,2022-03-01 00:00:00,,['Huaneng Power International has switched on ...,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...
3,98158,Iran wants to deploy 10 GW of renewables over ...,2022-03-01 00:00:00,,"['According to the Iranian authorities, there ...",pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...
4,31128,Eastern Interconnection Power Grid Said ‘ Bein...,2022-03-01 00:00:00,,['Sign in to get the best natural gas news and...,naturalgasintel,https://www.naturalgasintel.com/eastern-interc...


In [16]:
print(df.isnull().sum())
df.shape

Unnamed: 0       0
title            0
date             0
author        9938
content          0
domain           0
url              0
dtype: int64


(9938, 7)

# Text Cleaning:
**Preprocessing:** lowercase, delete numbers, punctuation and symbols (#"*!&%), splitting, tokenizing?, removing stopwords, lemmatizing

In [17]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))#takes 2 arguments "", "", for string.punctuation
    #text = ''.join(char for char in text if not char.isdigit())

    tokens = text.split()
    #tokens = [word for word in tokens if word not in stop_words]
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

df['Clean Title'] = df['title'].apply(preprocess_text)
df['Clean Content'] = df['content'].apply(preprocess_text)
df_clean = df.copy()

df_clean = df.drop(columns=['title','content', 'Unnamed: 0', "author"])
df_clean.head()

Unnamed: 0,date,domain,url,Clean Title,Clean Content
0,2022-02-01 00:00:00,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...,xpeng delivered 100000 vehicles in 2021,chinese automotive startup xpeng has shown one...
1,2022-02-01 00:00:00,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...,green hydrogen drop in bucket or big splash,sinopec has laid plans to build the largest gr...
2,2022-03-01 00:00:00,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...,world’ s largest floating pv plant goes online...,huaneng power international has switched on a ...
3,2022-03-01 00:00:00,pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...,iran wants to deploy 10 gw of renewables over ...,according to the iranian authorities there are...
4,2022-03-01 00:00:00,naturalgasintel,https://www.naturalgasintel.com/eastern-interc...,eastern interconnection power grid said ‘ bein...,sign in to get the best natural gas news and d...


# Change the column names:
"Date Published, Clean Title, Clean Article Text, Author"

In [18]:
df_clean = df_clean.rename(columns={
    'date' : 'Date Published',
    'Clean Content' : 'Clean Article Text',
    'domain' : 'Author',
    'url' : 'URL'
    })
df_clean.head()

Unnamed: 0,Date Published,Author,URL,Clean Title,Clean Article Text
0,2022-02-01 00:00:00,cleantechnica,https://cleantechnica.com/2022/01/02/xpeng-del...,xpeng delivered 100000 vehicles in 2021,chinese automotive startup xpeng has shown one...
1,2022-02-01 00:00:00,cleantechnica,https://cleantechnica.com/2022/01/02/its-a-gre...,green hydrogen drop in bucket or big splash,sinopec has laid plans to build the largest gr...
2,2022-03-01 00:00:00,pv-magazine,https://www.pv-magazine.com/2022/01/03/worlds-...,world’ s largest floating pv plant goes online...,huaneng power international has switched on a ...
3,2022-03-01 00:00:00,pv-magazine,https://www.pv-magazine.com/2022/01/03/iran-wa...,iran wants to deploy 10 gw of renewables over ...,according to the iranian authorities there are...
4,2022-03-01 00:00:00,naturalgasintel,https://www.naturalgasintel.com/eastern-interc...,eastern interconnection power grid said ‘ bein...,sign in to get the best natural gas news and d...


# Sentiment Analysis Testing
Let's try different models, starting with the basic from "Your first Transformers Challenge" -- **"twitter-roberta-base-sentiment-latest"**

In [19]:
!pip install transformers torch
!pip install datasets
!pip install transformers[torch]


python(37488) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.




python(37489) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


zsh:1: no matches found: transformers[torch]


python(37507) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [20]:
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, ClassLabel
from transformers import TrainingArguments
from transformers import Trainer


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/enrique/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Setting up the model

In [21]:
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
df_sample = df_clean.sample(n=100, random_state=42).copy()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [25]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
sentiment_pipeline

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x317220e50>

We need to devide full articles into ***chunks*** of a **N** of sentences.

In [42]:
def split_into_chunks(text, max_sentences=5):
    "Devide a text in chunks of N sentences"
    sentences = sent_tokenize(text)
    return [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]


In [43]:

def analyze_sentiment_chunked(text):
    "Analyse sentiment of chunks and labels 'mixed' if there is a meaningful draw"
    try:
        chunks = split_into_chunks(text[:500])
        results = [sentiment_pipeline(chunk, truncation=True)[0] for chunk in chunks] # Truncar textos muy largos en chunk por chunk

        labels = [r['label'] for r in results]
        scores = [r['score'] for r in results]

        counter = Counter(labels)
        majority_label, count = counter.most_common(1)[0]
        avg_score = sum([s for l, s in zip(labels, scores) if l == majority_label]) / count

        if avg_score < 0.4:
            majority_label = 'NEUTRAL'

        return pd.Series([majority_label, avg_score])
    except Exception as e:
        print(f"Error en analyze_sentiment_chunked: {e}")
        return pd.Series([None, None])

In [44]:
df_sample[['sentiment', 'sentiment_conf']] = df_sample['Clean Article Text'].apply(analyze_sentiment_chunked)
df_sample.head()
#df_sample.iloc[0]['Clean Article Text']

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Date Published,Author,URL,Clean Title,Clean Article Text,sentiment,sentiment_conf
19871,2024-04-10 00:00:00,solarpowerworldonline,https://www.solarpowerworldonline.com/2024/10/...,energy storage set to be ineligible for lowinc...,on october 3 the solar energy industries assoc...,neutral,0.713261
18081,2024-05-28,pv-magazine,https://www.pv-magazine.com/2024/05/28/brazils...,brazil’ s new pv additions hit 6 gw in january...,brazil added 6 gw of new pv capacity between j...,neutral,0.789715
2873,2022-08-30,cleantechnica,https://cleantechnica.com/2022/08/30/tesla-won...,tesla won big in california in 2nd quarter,yesterday we looked at the tesla model y’ s an...,neutral,0.767114
5967,2023-03-23,cleantechnica,https://cleantechnica.com/2023/03/23/freewire-...,freewire helps road ranger get ev charging sta...,installing ev fast chargers is an expensive pr...,negative,0.599376
7016,2023-04-05 00:00:00,azocleantech,https://www.azocleantech.com/book.aspx?SaleID=118,biomass gasification pyrolysis and torrefaction,by clicking allow all you agree to the storing...,positive,0.649786


In [45]:
df_sample['sentiment'].value_counts()

sentiment
neutral     61
positive    31
negative     8
Name: count, dtype: int64

In [46]:
print(df_sample[df_sample['sentiment'] == 'negative'].sample()['Clean Article Text'].values)

['four years on from the birth of its solar boom vietnam’ s largescale solar industry has been stunted by a lack of generic power purchase agreements ppa while the rooftop sector through strong in the commercial and industrial c i segment also faces hindrances from government the main growth in gridconnected pv took place between 2018 and 2020 shooting well past the government’ s initial expectations by a factor of four or more but the rug has been pulled out from under the feet of the bold developers who plunged into the southeast asian market despite all its welldocumented bankability risks since the boom there have been some retroactive forces to limit solar development which is understandable to some extent says gavin smith director of ukbased consultancy clean energy advisors who specialises in vietnam the sudden advancements caused considerable technical financial and political problems for the government adding stress to its management of energy markets mobilising enough capital

# Fine-Tuning

In [47]:
df_train = pd.read_csv('/Users/enrique/code/EFRdev/08-Final-Project/SolarSoundBytes/raw_data/ForTraining_news_sentiment_analysis.csv')
df_train.head(2)

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business


In [48]:
df_train.shape

(3500, 8)

In [49]:
df_train['Published At'].dtype

dtype('O')

In [50]:
#df_train['Published At'] = pd.to_datetime(df_train['Published At'], dayfirst=True, errors='coerce')
df_train['Published At'] = df_train['Published At'].astype(str)


In [51]:
df_train.columns = df_train.columns.str.strip()
print(df_train.columns)

Index(['Source', 'Author', 'Title', 'Description', 'URL', 'Published At',
       'Sentiment', 'Type'],
      dtype='object')


In [58]:
label_list = ['negative', 'neutral', 'positive']
label_to_id = {l: i for i, l in enumerate(label_list)}
df_train['label_id'] = df_train['Sentiment'].map(label_to_id)

#Turning Dataset into HuggingFace object
dataset = Dataset.from_pandas(df_train)


In [60]:
print(tokenized_dataset.column_names)

['Source', 'Author', 'Title', 'Description', 'URL', 'Published At', 'Sentiment', 'Type', 'label_id', 'input_ids', 'attention_mask']


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list))

#Tokenizing
def tokenize_function(text):
    return tokenizer(text["Description"], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

#Split dataset train and val
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split['train']
val_dataset = split['test']

#Evaluation Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

#Training set up & Training
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
)

trainer = Trainer(
    model=model, args = training_args, train_dataset=train_dataset, eval_dataset = val_dataset, compute_metrics=compute_metrics)
trainer.train()


NameError: name 'AutoTokenizer' is not defined

In [269]:
import transformers
print(transformers.__file__)

/Users/enrique/.pyenv/versions/3.10.6/envs/SolarSoundBytes/lib/python3.10/site-packages/transformers/__init__.py


In [268]:
import transformers
print(transformers.__version__)

4.52.4


In [49]:
import accelerate
print(accelerate.__version__)


1.7.0


In [50]:
import sys
print(sys.executable)

/Users/enrique/.pyenv/versions/3.10.6/envs/SolarSoundBytes/bin/python


In [None]:
trainer.save_model("./modelo_finetuned")
tokenizer.save_pretrained("./modelo_finetuned")