# imports

In [1]:
import pandas as pd
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime

# clean data

In [2]:
# df = pd.read_csv("../../data/csv/twitter_scraped_df.csv", nrows=100)
df = pd.read_csv("../../data/csv/twitter_scraped_df.csv")
print(df.shape)
df.head(2)

(27610, 31)


Unnamed: 0,createdAt,id,url,text,source,date,time of day,location,retweetCount,replyCount,...,user_mentions_indices_0,user_mentions_indices_1,user_mentions_name,user_mentions_screen_name,reply_to_user_results,quoted_tweet_results,quoted_tweet,retweeted_tweet,isConversationControlled,searchTermIndex
0,2022-01-02 00:00:46+00:00,1477429624208564226,https://x.com/KlausRieneck/status/147742962420...,The decision to phase out nuclear power and sh...,,2022-01-02,0.0,,0.0,0.0,...,,,,,"{'rest_id': '891466309619380224', 'result': {'...",,,,False,0.0
1,2022-01-02 00:00:50+00:00,1477429640675446785,https://x.com/EINRenewables/status/14774296406...,"EU Moves to Label Nuclear, Natural Gas Energy ...",,2022-01-02,0.0,,0.0,0.0,...,,,,,,,,,False,0.0


## ensure **createdAt** in **dtype** format

In [3]:
df['createdAt'] = pd.to_datetime(df['createdAt'], dayfirst=True, errors='coerce')
print(df['createdAt'].dtype)
print(df['createdAt'][0:2])

datetime64[ns, UTC]
0   2022-02-01 00:00:46+00:00
1   2022-02-01 00:00:50+00:00
Name: createdAt, dtype: datetime64[ns, UTC]


## extract author from url and insert as new column after 'id'

In [None]:
# extract author from url using regex
# https://x.com/EINRenewables/status/14774296406
# https://x.com/KlausRieneck/status/147742962420
df['author'] = df['url'].str.extract(r'x\.com/([^/]+)/status')

In [4]:
# move 'author' column after 'id'
cols = list(df.columns)
if 'author' in cols:
    author_idx = cols.index('id') + 1
    cols.insert(author_idx, cols.pop(cols.index('author')))
    df = df[cols]
# author_idx = cols.index('id') + 1
# cols.insert(author_idx, cols.pop(cols.index('author')))
# df = df[cols]
# df.head(2)

## drop columns if only contain 1 unique value or if all values are NaN

In [5]:
columns = df.columns
for col in columns:
    if df[col].nunique(dropna=False) == 1 or df[col].isna().all():
        df = df.drop(columns=[col])
df.head(2)

Unnamed: 0,createdAt,id,url,text,date,time of day,retweetCount,replyCount,likeCount,quoteCount,...,user_mentions_id_str,user_mentions_indices_0,user_mentions_indices_1,user_mentions_name,user_mentions_screen_name,reply_to_user_results,quoted_tweet_results,quoted_tweet,isConversationControlled,searchTermIndex
0,2022-02-01 00:00:46+00:00,1477429624208564226,https://x.com/KlausRieneck/status/147742962420...,The decision to phase out nuclear power and sh...,2022-01-02,0.0,0.0,0.0,0.0,0.0,...,,,,,,"{'rest_id': '891466309619380224', 'result': {'...",,,False,0.0
1,2022-02-01 00:00:50+00:00,1477429640675446785,https://x.com/EINRenewables/status/14774296406...,"EU Moves to Label Nuclear, Natural Gas Energy ...",2022-01-02,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,False,0.0


## get unique tweet authors

In [None]:
unique_authors = df['author'].dropna().unique()
print(unique_authors[0:5])

KeyError: 'author'

# preprocess

## focus on tweet content, ignore twitter metrics columns for now

In [None]:
df = df[['createdAt', 'id', 'author', 'url', 'text']]

## check for missing values --> no need to delete or impute 😄

In [None]:
print(df.isnull().sum())
df.shape

## clean text

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df.head(2)

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    '''
    # remove whitespace
    # lowercase
    # remove hyperlinks
    # punctuation and symbols like #"*!&%
    '''
    # remove whitespace
    text = text.strip(' ')

    # lowercase
    text = text.lower()

    # use regex to remove hyperlinks starting with http
    text = re.sub(r'http\S+', '', text)

    # remove punctuation and symbols like #"*!&%
    text = text.translate(str.maketrans('', '', string.punctuation))

    # tokenize
    tokens = word_tokenize(text)

    # sentiment analysis --> keep stopwords
    # to identify difference between 'not happy' and the separate words 'not', 'happy'

    # lemmatize to group words by their meaning instead of their exact form
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    lemmatized_string = ' '.join(lemmatized_words)  # join back to string

    return lemmatized_string

df['text_clean'] = df['text'].apply(preprocess_text)

pd.set_option('display.max_colwidth', None)
print(df['text'][0:2])
print(df['text_clean'][0:2])

# Change the column names:

In [None]:
# df_clean = df_clean.rename(columns={
#     'date' : 'Date Published',
#     'Clean Content' : 'Clean Article Text',
#     'domain' : 'Author',
#     'url' : 'URL'
#     })
# df_clean.head()

# Sentiment Analysis Testing
Let's try different models, starting with the basic from "Your first Transformers Challenge" -- **"twitter-roberta-base-sentiment-latest"**

In [None]:
%pip install transformers torch
%pip install datasets

# %pip install transformers[torch]
%pip install torch

In [None]:
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, ClassLabel
from transformers import TrainingArguments
from transformers import Trainer


Setting up the model

In [None]:
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
df_sample = df['text', 'text_clean'].sample(n=100, random_state=42).copy()

In [None]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
sentiment_pipeline

divide text into ***chunks*** of a **N** of sentences.

In [None]:
def split_into_chunks(text, max_sentences=5):
    "Devide a text in chunks of N sentences"
    sentences = sent_tokenize(text)
    return [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]


In [None]:

def analyze_sentiment_chunked(text):
    "Analyse sentiment of chunks and labels 'mixed' if there is a meaningful draw"
    try:
        chunks = split_into_chunks(text[:500])
        results = [sentiment_pipeline(chunk, truncation=True)[0] for chunk in chunks] # Truncar textos muy largos en chunk por chunk

        labels = [r['label'] for r in results]
        scores = [r['score'] for r in results]

        counter = Counter(labels)
        majority_label, count = counter.most_common(1)[0]
        avg_score = sum([s for l, s in zip(labels, scores) if l == majority_label]) / count

        if avg_score < 0.4:
            majority_label = 'NEUTRAL'

        return pd.Series([majority_label, avg_score])
    except Exception as e:
        print(f"Error en analyze_sentiment_chunked: {e}")
        return pd.Series([None, None])

In [None]:
print(df_sample['text_clean'].apply(analyze_sentiment_chunked))

In [None]:
df_sample['sentiment'].value_counts()

In [None]:
print(df_sample[df_sample['sentiment'] == 'negative'].sample()['Clean Article Text'].values)

# Fine-Tuning

In [None]:
df_train = pd.read_csv('/Users/enrique/code/EFRdev/08-Final-Project/SolarSoundBytes/raw_data/ForTraining_news_sentiment_analysis.csv')
df_train.head(2)

In [None]:
df_train.shape

In [None]:
df_train['Published At'].dtype

In [None]:
#df_train['Published At'] = pd.to_datetime(df_train['Published At'], dayfirst=True, errors='coerce')
df_train['Published At'] = df_train['Published At'].astype(str)


In [None]:
df_train.columns = df_train.columns.str.strip()
print(df_train.columns)

In [None]:
label_list = ['negative', 'neutral', 'positive']
label_to_id = {l: i for i, l in enumerate(label_list)}
df_train['label_id'] = df_train['Sentiment'].map(label_to_id)

#Turning Dataset into HuggingFace object
dataset = Dataset.from_pandas(df_train)


In [None]:
print(tokenized_dataset.column_names)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list))

#Tokenizing
def tokenize_function(text):
    return tokenizer(text["Description"], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

#Split dataset train and val
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split['train']
val_dataset = split['test']

#Evaluation Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

#Training set up & Training
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
)

trainer = Trainer(
    model=model, args = training_args, train_dataset=train_dataset, eval_dataset = val_dataset, compute_metrics=compute_metrics)
trainer.train()


In [None]:
import transformers
print(transformers.__file__)

In [None]:
import transformers
print(transformers.__version__)

In [None]:
import accelerate
print(accelerate.__version__)


In [None]:
import sys
print(sys.executable)

In [None]:
trainer.save_model("./modelo_finetuned")
tokenizer.save_pretrained("./modelo_finetuned")