##### **Before starting ...**

Checking we are using **frugal-notebooks-env** conda environment

In [1]:
!which python

/home/avm/Workspace/octo/frugal-ai-challenge/.venv/bin/python


Checking the python version is 3.9 (compatibility with frugal AI codebase)

In [2]:
!python --version

Python 3.10.6


# LSTM

## 1. **Dataset Exploration**

In [3]:
import numpy as np
import pandas as pd

**Loading the dataset**

In [4]:
df = pd.read_parquet("hf://datasets/QuotaClimat/frugalaichallenge-text-train/train.parquet")

In [5]:
df.head()

Unnamed: 0,quote,label,source,url,language,subsource,id
0,"There is clear, compelling evidence that many ...",5_science_unreliable,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,
1,"For most of the Holocene (last 10k years), sea...",1_not_happening,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,hamburg_test1,
2,"China, which hosts U.N. climate talks next wee...",4_solutions_harmful_unnecessary,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,
3,And the fabricated documents (which Dr. Mann a...,0_not_relevant,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,CARDS,
4,It's going to be 42 here today and the hottest...,1_not_happening,FLICC,https://huggingface.co/datasets/fzanartu/FLICC...,en,hamburg_test3,


In [6]:
df = df[['quote', 'label']]
df.head()

Unnamed: 0,quote,label
0,"There is clear, compelling evidence that many ...",5_science_unreliable
1,"For most of the Holocene (last 10k years), sea...",1_not_happening
2,"China, which hosts U.N. climate talks next wee...",4_solutions_harmful_unnecessary
3,And the fabricated documents (which Dr. Mann a...,0_not_relevant
4,It's going to be 42 here today and the hottest...,1_not_happening


**The label have 8 classes**

In [7]:
print('unique values:')
print('label\n', df['label'].unique())

unique values:
label
 ['5_science_unreliable' '1_not_happening'
 '4_solutions_harmful_unnecessary' '0_not_relevant' '6_proponents_biased'
 '7_fossil_fuels_needed' '2_not_human' '3_not_bad']


**But we have a bit of unbalanced classes (that we will have to take care of)**

In [8]:
class_distribution = df['label'].value_counts().sort_index()
df['label'].value_counts(normalize=True).sort_index()

label
0_not_relevant                     0.265638
1_not_happening                    0.121655
2_not_human                        0.115252
3_not_bad                          0.063372
4_solutions_harmful_unnecessary    0.127073
5_science_unreliable               0.131505
6_proponents_biased                0.128386
7_fossil_fuels_needed              0.047119
Name: proportion, dtype: float64

In [9]:
import plotly.express as px

fig = px.bar(x=class_distribution.index, y=class_distribution.values,
             labels={'x': 'Label', 'y': 'Count'})
fig.update_layout(width=900, height=400, title="Class Distribution")
fig.show(renderer='iframe')

**We will need to crop some quotes**

In [10]:
df['text_length'] = df['quote'].str.len()
print(df['text_length'].describe())

count    6091.000000
mean      293.528485
std       258.330755
min        19.000000
25%       139.000000
50%       228.000000
75%       365.000000
max      4703.000000
Name: text_length, dtype: float64


In [11]:
fig = px.histogram(df, x='text_length', nbins=50)
fig.update_layout(width=800, height=300, title="Sentence Length Distribution")
fig.show(renderer='iframe')

In [12]:
threshold = np.percentile(df['text_length'], 95)
threshold

np.float64(722.5)

In [13]:
long_quotes = df[df['text_length'] > threshold]
print('Number of quotes above threshold: ', long_quotes.shape[0])

Number of quotes above threshold:  305


## **2. Preprocessing**

### **Features and target**

In [14]:
X = df['quote']
y = df['label']
X.shape, y.shape, type(X), type(y)

((6091,), (6091,), pandas.core.series.Series, pandas.core.series.Series)

In [15]:
X.head()

0    There is clear, compelling evidence that many ...
1    For most of the Holocene (last 10k years), sea...
2    China, which hosts U.N. climate talks next wee...
3    And the fabricated documents (which Dr. Mann a...
4    It's going to be 42 here today and the hottest...
Name: quote, dtype: object

In [16]:
y.head()

0               5_science_unreliable
1                    1_not_happening
2    4_solutions_harmful_unnecessary
3                     0_not_relevant
4                    1_not_happening
Name: label, dtype: object

### **Cleaning**

In [17]:
import string
import re
import contractions
import html
import unicodedata
from wordsegment import load, segment

load()

def segment_unseparated(sentence, threshold=20):
    return " ".join(
        word if len(word) > threshold else " ".join(segment(word))
        for word in sentence.split()
    )

def basic_cleaning(sentence:str, threshold:int=15):
    sentence = html.unescape(sentence)                        # convert html entities
    sentence = re.sub(r'http\S+|www\S+', '', sentence)        # remove URLs
    sentence = unicodedata.normalize('NFKC', sentence)        # normalize Unicode
    sentence = sentence.encode("ascii", "ignore").decode()    # remove non ASCII
    sentence = contractions.fix(sentence)                     # expand contractions
    sentence = sentence.lower()                               # lowercase
    sentence = re.sub(r'\d+', '', sentence)                   # remove digits
    punctuation = string.punctuation.replace("-", "")         # keep hyphens
    translator = str.maketrans('', '', punctuation)
    sentence = sentence.translate(translator)                 # remove punctuation
    sentence = re.sub(r'\s-\s', ' ', sentence)                # removes " - " (hyphen surrounded by spaces)
    sentence = re.sub(r'\s-', ' ', sentence)                  # removes leading hyphens
    sentence = re.sub(r'-\s', ' ', sentence)                  # removes trailing hyphens
    sentence = segment_unseparated(sentence, threshold=threshold) # separate words that should be separated
    sentence = ' '.join(sentence.split()).strip()             # remove whitespace
    return sentence

In [18]:
X_clean = X.apply(basic_cleaning)

In [19]:
X_clean.shape, type(X_clean)

((6091,), pandas.core.series.Series)

In [20]:
X_clean.head()

0    there is clear compelling evidence that many o...
1    for most of the holocene last k years sea leve...
2    china which hosts you n climate talks next wee...
3    and the fabricated documents which dr mann app...
4    it is going to be here today and the hottest s...
Name: quote, dtype: object

### **Preprocessing**
- tokenize
- stopwords
- lemmatize
- rejoin

In [21]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk_path = "/Users/a.villa.massone/miniconda3/envs/frugal-notebooks-env/nltk_data" , download_dir=nltk_path
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

stop_words = set(stopwords.words('english'))
    
def preproc(sentence):
    tokens = word_tokenize(sentence)
    tokens = [w for w in tokens if w.lower() not in stop_words]              # remove stop_words
    tokens = [WordNetLemmatizer().lemmatize(w, pos = "v") for w in tokens]   # lemmatize
    return " ".join(tokens)

[nltk_data] Downloading package punkt_tab to /home/avm/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/avm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/avm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/avm/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [22]:
X_preproc = X_clean.apply(preproc)

In [23]:
X_preproc.shape, type(X_preproc)

((6091,), pandas.core.series.Series)

In [24]:
X_preproc.head()

0    clear compel evidence many major conclusions i...
1    holocene last k years sea level rise rate arou...
2    china host n climate talk next week first time...
3    fabricate document dr mann apparently still th...
4    go today hottest summer record iirc sure globa...
Name: quote, dtype: object

In [26]:
# vocabulary = vectorizer.get_feature_names_out()

# word_occurrences = (X_vectorized > 0).sum(axis=0)
# word_count = dict(zip(vocabulary, np.asarray(word_occurrences).flatten()))

# print(f"\nVocabulary size: {len(vocabulary)}")
# print()

# from nltk.corpus import words
# english_vocab = set(words.words())
# oov_words = [word for word in vocabulary if word.lower() not in english_vocab]
# print(f"Out-of-vocabulary words: {len(oov_words)}")
# # X_vectorized.head(3)
# # print('feature shape:', X_vectorized.shape)

### **Checking quality of preprocessing**

In [None]:
# non alphanum
alphanum_error = [word for word in vocabulary if re.search(r"[^a-zA-Z0-9'-]", word)]
print("preprocessing alphanum errors:", alphanum_error[:20])
print('---------------')

# case-sensitive duplicates
vocab_lower = set(word.lower() for word in vocabulary)
if len(vocab_lower) != len(vocabulary):
    print("Warning: Vocabulary contains case-sensitive duplicates!")
print('---------------')

# short words and long words
short_words = [word for word in vocabulary if len(word) <= 2]
long_words = [word for word in vocabulary if len(word) > 15]

print('Longest and shortest words:')
print(len(short_words), "short words (≤2 chars):\n", short_words[:20])
print(len(long_words), "long words (>15 chars):\n", long_words[:20])
print('---------------')

# most / least frequent
from collections import Counter

word_count_counter = Counter(word_count)
print("Most common words:\n", word_count_counter.most_common(10))
print("Least common words:\n", word_count_counter.most_common()[-10:])
print('---------------')

# out of vocabulary - not english
#nltk.download('words', download_dir=nltk_path)
from nltk.corpus import words

english_vocab = set(words.words())

oov_words = [word for word in vocabulary if word.lower() not in english_vocab]
print(f"Out-of-vocabulary words: {len(oov_words)}")
print("Sample OOV words:", oov_words[:20])
print('---------------')

In [None]:
long_words = [word for word in vocabulary if len(word) > 15 and len(word) < 20]
test_df = pd.DataFrame({
    'word' : long_words,
    'length': [len(word) for word in long_words],
    'separated' : [segment_unseparated(word) for word in long_words]
} )
# test_df.sort_values(by='length', ascending=False)


**Let's have a look at the most frequent words**

In [None]:
#word_count_counter

In [None]:
import matplotlib.pyplot as plt

top_20_words = word_count_counter.most_common(40)
df_word_freq = pd.DataFrame(top_20_words, columns=['Word', 'Frequency'])

plt.figure(figsize=(12, 6))
plt.bar(df_word_freq['Word'], df_word_freq['Frequency'])
plt.title('Top 40 Most Frequent Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

total_words = sum(word_count_counter.values())
unique_words = len(vocabulary)
average_freq = total_words / unique_words

print(f"\nTotal words: {total_words}")
print(f"Unique words: {unique_words}")
print(f"Average word frequency: {average_freq:.2f}")
print()
print()

### **Encode target**

In [30]:
from sklearn.preprocessing import LabelEncoder

def encode_target(y):
    le = LabelEncoder()
    y_cat = le.fit_transform(y)
    return y_cat

In [None]:
y_cat = encode_target(y)

print(le.classes_)
print()
print('y_cat :', y_cat)

## **3. Baseline**

### **Feature and target**

In [None]:
X_vectorized

In [None]:
y_cat

In [None]:
X_vectorized.shape, y_cat.shape, type(X_vectorized), type(y_cat)

**Train test split**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_cat, test_size=0.2, stratify=y_cat, random_state=42)

print('types', type(X_train), type(X_test), type(y_train), type(y_test))
'Shapes', X_train.shape, X_test.shape, y_train.shape, y_test.shape

### **Resampling for unbalanced classes**

In [None]:
print(pd.Series(y_train).value_counts(normalize=True))
print(pd.Series(y_train).value_counts())

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

def resampling(X, y):
    y = pd.Series(y)

    mean_count = int(max(y.value_counts()) / 2)
    smote_strategy = {
        label: mean_count if count < mean_count else count
        for label, count in y.value_counts().items()
    }
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    undersample_strategy = {label: mean_count for label in y.value_counts().keys()}
    undersample = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=42)
    X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

X_train, y_train = resampling(X_train, y_train)

print(pd.Series(y_train).value_counts(normalize=False))

**Check the category split**

In [None]:
total_category_counts = pd.DataFrame(y).value_counts().sort_index()
train_category_counts = pd.DataFrame(y_train).value_counts().sort_index()
test_category_counts = pd.DataFrame(y_test).value_counts().sort_index()

total_proportions = pd.DataFrame(y).value_counts(normalize=True).sort_index().round(2)
train_proportions = pd.DataFrame(y_train).value_counts(normalize=True).sort_index().round(2)
test_proportions = pd.DataFrame(y_test).value_counts(normalize=True).sort_index().round(2)

category_distribution_df = pd.DataFrame({
    "total": total_category_counts.values,
    "train": train_category_counts.values,
    "test": test_category_counts.values,
    "total%": total_proportions.values,
    "train%": train_proportions.values,
    "test%": test_proportions.values
}, index=total_category_counts.index)

print(category_distribution_df)

### 

### **Model : LSTM**

In [27]:
df = pd.read_parquet("hf://datasets/QuotaClimat/frugalaichallenge-text-train/train.parquet")

In [28]:
X = df['quote']
y = df['label']

In [31]:
y_cat = encode_target(y)

In [32]:
X_clean = X.apply(basic_cleaning)
X_preproc = X_clean.apply(preproc)
X_clean.shape, X_preproc.shape

((6091,), (6091,))

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X_preproc, y_cat, test_size=0.2, random_state=42)

In [34]:
from tensorflow.keras.preprocessing.text import Tokenizer
from collections import Counter

num_words = 15000
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)
# X_train_tokenized = tokenizer.texts_to_sequences(X_train)
# X_test_tokenized = tokenizer.texts_to_sequences(X_test)
len(X_tokenized)

6091

In [35]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 750

X_pad = pad_sequences(X_tokenized, dtype='float32', padding='post', truncating='post', value=0.0)
# X_train_pad = pad_sequences(X_train_tokenized, dtype='float32', padding='post', truncating='post', value=0.0)
# X_test_pad = pad_sequences(X_test_tokenized, dtype='float32', padding='post', truncating='post', value=0.0)
X_pad.shape

(6091, 790)

In [36]:
print(f'Sentences : {X.shape[0]}')
print(f'Original pre-cleaning vocab : {len(set([w for s in X for w in s.split()]))}')
print(f'Original pre-cleaning max tokens : {max([len(s) for s in X])}')

print('\n->Cleaning :\n\tconvert html entities, \n\tremove URLs, \n\tnormalize Unicode, \n\tremove non ASCII, \n\texpand contractions, \n\tlowercase, \n\tremove digits, \n\tkeep hyphens, \n\tseparate stuckup words, \n\tremove whitespace')
print(f'Sentences : {X_clean.shape[0]}')
print(f'Original post-cleaning vocab : {len(set([w for s in X_clean for w in s.split()]))}')
print(f'Original post-cleaning max tokens : {max([len(s) for s in X_clean])}')

print('\n->Preprocessing :\n\tremove stop_words, \n\tlemmatize')
print(f'Sentences : {X_preproc.shape[0]}')
print(f'Original post-preproc vocab : {len(set([w for s in X_preproc for w in s.split()]))}')
print(f'Original post-preproc max tokens : {max([len(s) for s in X_preproc])}')

print('\n->Tokenizing')
print(f'Original vocab : {len(tokenizer.word_index)}')
print(f'Actual vocab included : {len(set([w for s in X_tokenized for w in s]))}')
print(f'Max tokens : {max([len(s) for s in X_tokenized])}')

print('\n->Padding')
print(f'Actual vocab included : {len(set([w for s in X_pad for w in s]))}')
print(f'Max tokens : {max([len(s) for s in X_pad])}')

Sentences : 6091
Original pre-cleaning vocab : 30807
Original pre-cleaning max tokens : 4703

->Cleaning :
	convert html entities, 
	remove URLs, 
	normalize Unicode, 
	remove non ASCII, 
	expand contractions, 
	lowercase, 
	remove digits, 
	keep hyphens, 
	separate stuckup words, 
	remove whitespace
Sentences : 6091
Original post-cleaning vocab : 15507
Original post-cleaning max tokens : 4571

->Preprocessing :
	remove stop_words, 
	lemmatize
Sentences : 6091
Original post-preproc vocab : 12259
Original post-preproc max tokens : 2958

->Tokenizing
Original vocab : 18587
Actual vocab included : 14999
Max tokens : 790

->Padding
Actual vocab included : 15000
Max tokens : 790


In [37]:
# Counter([w for s in X_tokenized for w in s])
# dict(sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True))
# dict(sorted(tokenizer.word_docs.items(), key=lambda item: item[1], reverse=True))

In [38]:
# import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, InputLayer, Embedding

def model_init(vocab_size, embedding_dimension):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size + 1, output_dim=embedding_dimension, mask_zero=True))
    model.add(LSTM(50, activation='tanh'))
    model.add(Dense(8, activation='softmax'))
    
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [39]:
from tensorflow.keras.callbacks import EarlyStopping

def model_fit(model, X_train, y_train):
    
    es = EarlyStopping(patience=2, restore_best_weights=True)
    
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks = [es],
        verbose=1
    )
    
    return model, history

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pad, y_cat, test_size=0.2, random_state=42)

In [41]:
vocab_size = num_words
embedding_dimension = 20
# max_len = 

baseline_model = model_init(vocab_size, embedding_dimension)

2025-02-01 17:22:38.824887: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [42]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4872, 790), (1219, 790), (4872,), (1219,))

In [43]:
baseline_model , history = model_fit(baseline_model, X_train, y_train)

Epoch 1/100


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 8)

In [None]:
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")
predictions = model.predict(X_test)

**Baseline : Cross validation**

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)
baseline_accuracy = cross_val_score(baseline_model, X_train, y_train, cv=skf).mean()

print("Baseline")
print(f"Cross-validation accuracy on 5 folds: {round(baseline_accuracy, 2)}")

## 5. **Efficiency metrics : Tracking energy consumption**

### **During training**

In [None]:
X_train.shape, y_train.shape

In [None]:
from codecarbon import EmissionsTracker

def monitor_training(model, X_train, y_train):
    tracker = EmissionsTracker(log_level="error")
    tracker.start()

    model.fit(X_train, y_train)

    training_emissions = tracker.stop()
    training_time = tracker.final_emissions_data.duration
    training_energy_conso = tracker.final_emissions_data.energy_consumed

    train_efficiency_metrics = {
        "total_latency_sec": training_time,
        "sample_latency_sec": training_time / X_train.shape[0],
        "total_energy_conso_kWh": training_energy_conso,
        "sample_energy_conso_kWh": training_energy_conso / X_train.shape[0],
        "total_emissions_kgCO₂eq": training_emissions,
#        "tracker_data": tracker.final_emissions_data
    }
    train_efficiency_metrics_df = pd.DataFrame.from_dict(train_efficiency_metrics, columns=['metrics'], orient='index')

    return model, train_efficiency_metrics_df

baseline_model, train_efficiency_metrics_df = monitor_training(baseline_model, X_train, y_train)

print(train_efficiency_metrics_df)
print()

### **During inference**

In [None]:
X_test.shape

In [None]:
from codecarbon import EmissionsTracker

def monitor_inference(model, X_test):
    tracker = EmissionsTracker(log_level="error")
    tracker.start()

    y_pred = model.predict(X_test)

    inference_emissions = tracker.stop()
    inference_time = tracker.final_emissions_data.duration
    inference_energy_conso = tracker.final_emissions_data.energy_consumed

    inf_efficiency_metrics = {
        "total_latency_sec": inference_time,
        "sample_latency_sec": inference_time / X_test.shape[0],
        "total_energy_conso_kWh": inference_energy_conso,
        "sample_energy_conso_kWh": inference_energy_conso / X_test.shape[0],
        "total_emissions_kgCO₂eq": inference_emissions,
#        "tracker_data": tracker.final_emissions_data
    }
    inf_efficiency_metrics_df = pd.DataFrame.from_dict(inf_efficiency_metrics, columns=['metrics'], orient='index')

    return y_pred, inf_efficiency_metrics_df

y_pred, inf_efficiency_metrics_df = monitor_inference(baseline_model, X_test)

print(inf_efficiency_metrics_df)
print()

### **Summary table**

In [None]:
df_combined = pd.concat([train_efficiency_metrics_df, inf_efficiency_metrics_df], axis=1)
df_combined.columns = ['training', 'inference']
df_combined

## 4. **Performance metrics**

**Metrics**  
evaluation with :  
- accuracy  

monitor:  
- class_accuracy
- precision
- recall
- f1

### **functions**

**compute metrics**

In [None]:
y_test.shape, y_pred.shape

In [None]:
from sklearn.metrics import accuracy_score     # tp + tn / all  : maximize correct predictions
from sklearn.metrics import precision_score    # tp / (tp + fp) : minimize false positives
from sklearn.metrics import recall_score       # tp / (tp + fn) : maximize true positives
from sklearn.metrics import f1_score           # harmonic mean of the precision and recall

def compute_class_accuracy(y_test, y_pred):
    df = pd.DataFrame({'label': y_test, 'correct': y_test == y_pred})
    return df.groupby('label')['correct'].mean()

def evaluation(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)

    labels = np.unique(y_pred)
    class_accuracy = compute_class_accuracy(y_test, y_pred)
    class_precision = precision_score(y_test, y_pred, average=None, labels=labels)
    class_recall = recall_score(y_test, y_pred, average=None, labels=labels)
    class_f1 = f1_score(y_test, y_pred, average=None, labels=labels)

    eval_perf_metrics_df = pd.DataFrame({
        "Category": labels,
        "Accuracy": class_accuracy.values,
        "Precision": class_precision,
        "Recall": class_recall,
        "F1 Score": class_f1
    })

    return accuracy, eval_perf_metrics_df

**Plot metrics**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_metrics(accuracy, metrics_df):
    '''
        metrics_df : df with following format
            metrics_df = pd.DataFrame({
                "Category": labels,
                "Accuracy": class_accuracy.values,
                "Precision": class_precision,
                "Recall": class_recall,
                "F1 Score": class_f1
            })
    '''
    categories = metrics_df['Category']

    plt.figure(figsize=(8, 5))

    plt.plot(categories, metrics_df['F1 Score'], marker='o', label='F1 Score')
    plt.plot(categories, metrics_df['Precision'], marker='s', label='Precision')
    plt.plot(categories, metrics_df['Recall'], marker='^', label='Recall')

    plt.axhline(y=accuracy, color='r', linestyle='--', label=f'Accuracy ({accuracy:.2f})')

    plt.xlabel("Category")
    plt.ylabel("Score")
    plt.title("Evaluation Metrics per Category")
    plt.xticks(categories, categories, rotation=45, ha="right")
    plt.legend()
    plt.grid(True)

    print("Category distribution")
    print(category_distribution_df)
    print()
    print("Category metrics")
    print(metrics_df.round(2))
    print()
    plt.show()
    print()

**Confusion matrix**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

### ****

In [None]:
baseline_test_accuracy, eval_perf_metrics_df = evaluation(y_test, y_pred)

print('\nBaseline Test accuracy:', round(baseline_test_accuracy, 3), '\n')
print('Class metrics:\n')
eval_perf_metrics_df.round(2)

In [None]:
plot_metrics(baseline_test_accuracy, eval_perf_metrics_df)

In [None]:
plot_confusion_matrix(y_test, y_pred)

## 6. **Finding the best params with a Randomized Search**

##### **Pipeline and randomized search- Details**

**Preproc functions**

In [None]:
clean_X = lambda X, threshold=20: [basic_cleaning(sentence, threshold) for sentence in X]
preproc_X = lambda X, threshold=750: [preproc(sentence, threshold) for sentence in X]

**Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from functools import partial

resampling_wrapper = lambda X, y=None: resampling(X, y) if y is not None else X

pipeline = Pipeline([
    ('clean', FunctionTransformer(clean_X)),
    ('preproc', FunctionTransformer(preproc_X)),
    ('tfidf', TfidfVectorizer()),
    ('resample', FunctionTransformer(partial(resampling_wrapper))),
    ('estimator', RandomForestClassifier())
])

In [None]:
# debug
# pipeline.named_steps.keys()

In [None]:
#for param, value in pipeline.get_params().items():
#    print(f"{param}: {value}")

**Params to evaluate**

In [None]:
from itertools import product

param_grid = {
    'clean__kw_args': [{"threshold": 17}],
    'preproc__kw_args': [{"threshold": 750}],

    'tfidf__max_df' : [0.9],                             # limit high frequency words
    'tfidf__max_features': [7500],    # vocabulary size

    'estimator__n_estimators': [300],
    'estimator__min_samples_split': [5],             # min samples required to split
    'estimator__class_weight' : ['balanced'],
    'estimator__random_state' : [42]
}

num_configurations = len(list(product(*param_grid.values())))
grid_estimated_time = 3 * num_configurations * train_efficiency_metrics_df["metrics"].loc["total_latency_sec"]
random_estimated_time = 3 * 20 * train_efficiency_metrics_df["metrics"].loc["total_latency_sec"]

print(f"Total configurations: {num_configurations}")
print(f'Estimated time to complete with grid search : {round(grid_estimated_time, 2)} sec')
print(f'Estimated time to complete with randomized search : {round(random_estimated_time, 2)} sec')

**Search**

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode labels
y = df['label']
le = LabelEncoder()
y_cat = le.fit_transform(y)

# Split data
X = df['quote']
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, stratify=y_cat, random_state=42
)

# Run grid search
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
search.fit(X_train, y_train)

# Evaluate
print('Best parameters:')
print(search.best_params_)
print(f"\nBest accuracy: {round(search.best_score_, 3)}\n")

##### **Best params**

In [None]:
print('Best parameters:')
[print(f'{k}:{v}') for (k, v) in search.best_params_.items()]
print(f"\nBest accuracy: {round(search.best_score_, 3)}\n")

## 7. **Evaluating the best model**

##### **Prepare data**

**Feature and target**

In [None]:
X = df['quote']
y = df['label']

**Encode labels**

In [None]:
# Encode labels
le = LabelEncoder()
y_cat = le.fit_transform(y)

**Clean, preproc, vectorize X**

In [None]:
X_clean = X.apply(basic_cleaning)
X_preproc = X_clean.apply(preproc)

In [None]:
search.best_params_

In [None]:
tfidf_params = {k.replace("tfidf__", ""): v for k, v in search.best_params_.items() if k.startswith("tfidf__")}
#tfidf_params

In [None]:
vectorizer = TfidfVectorizer(**tfidf_params)

X_vectorized = pd.DataFrame(
    vectorizer.fit_transform(X_preproc).toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=X_preproc.index
)

In [None]:
#print("TfidfVectorizer Parameters:", vectorizer.get_params())

**Split data**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y_cat, test_size=0.2, stratify=y_cat, random_state=42
)
print('types', type(X_train), type(X_test), type(y_train), type(y_test))
'Shapes', X_train.shape, X_test.shape, y_train.shape, y_test.shape

##### **Monitor best model energy consumption during training and inference - details**

**Monitor best model during training**

In [None]:
search.best_params_

In [None]:
estimator_params = {k.replace("estimator__", ""): v for k, v in search.best_params_.items() if k.startswith("estimator__")}
#estimator_params

In [None]:
best_model = RandomForestClassifier(**estimator_params)
#print("RandomForestClassifier Parameters:", best_model.get_params())

In [None]:
best_model, best_model_train_metrics_df = monitor_training(best_model, X_train, y_train)
print(best_model_train_metrics_df)
print()

**Monitor best model during inference**

In [None]:
y_pred, best_model_inf_metrics_df = monitor_inference(best_model, X_test)

print(best_model_inf_metrics_df)
print()

##### **Monitor best model energy consumption during training and inference - summary**

In [None]:
df_combined = pd.concat([train_efficiency_metrics_df, inf_efficiency_metrics_df], axis=1)
df_combined.columns = ['training', 'inference']
df_combined

##### **Evaluation**

In [None]:
best_model_test_accuracy, best_model_eval_perf_metrics_df = evaluation(y_test, y_pred)

print('\nAccuracy:', round(best_model_test_accuracy, 3), '\n')
print('Class metrics:')
best_model_eval_perf_metrics_df.round(2)

In [None]:
plot_metrics(best_model_test_accuracy, best_model_eval_perf_metrics_df)

**Confusion matrix**

In [None]:
plot_confusion_matrix(y_test, y_pred)

## 8. **Save metadata**

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
timestamp

In [None]:
note = "Baseline ML, improved preproc, param search"
model_name = 'RandomForestClassifier'

In [None]:
filename = (
    "metrics/"
    f"{'_'.join(model_name.split('/'))}"
    f"_accuracy_{int(best_model_test_accuracy * 100)}"
    f"_dt_{timestamp.replace(':', '').replace('-', '')}"
    ".json"
)
filename

In [None]:
import os

os.makedirs("metrics", exist_ok=True)

data = {
    "model_name": model_name,
    "timestamp": timestamp,
    "note": note,

    "baseline_test_accuracy": baseline_test_accuracy,
    "best_model_test_accuracy": best_model_test_accuracy,

    "total_training_latency_sec": train_efficiency_metrics_df['metrics'].loc['total_latency_sec'],
    "total_training_energy_conso_kWh": train_efficiency_metrics_df['metrics'].loc['total_energy_conso_kWh'],

    "total_inference_latency_sec": inf_efficiency_metrics_df['metrics'].loc['total_latency_sec'],
    "total_inference_energy_conso_kWh": inf_efficiency_metrics_df['metrics'].loc['total_energy_conso_kWh'],

    "sample_inference_latency_sec": inf_efficiency_metrics_df['metrics'].loc['sample_latency_sec'],
    "sample_inference_energy_conso_kWh": inf_efficiency_metrics_df['metrics'].loc['sample_energy_conso_kWh'],

    "train_size" : y_train.shape[0],
    "test_size" :  y_test.shape[0],
    "class_performance_metrics": eval_perf_metrics_df.to_dict(orient="records"),
    "search_best_params" : search.best_params_,
    "training_efficiency_metrics": train_efficiency_metrics_df.to_dict(orient="records"),
    "inference_efficiency_metrics": inf_efficiency_metrics_df.to_dict(orient="records")
}
data

In [None]:
import json

with open(filename, "w") as f:
    json.dump(data, f, indent=4)

print(f"Metrics saved to {filename}")