# Comparison with Baselines

## === Setup ===

### Importing Libraries

In [None]:
import sys
import time

import pandas
import sklearn.ensemble
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.model_selection
import transformers

sys.path.append("../source")
import document
import data_preprocessing
import transformer_model

# Lift pandas' display limits so that previewed frames show every row,
# every column and the full (untruncated) text of each cell.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pandas.set_option(display_option, None)

### Loading the Datasets

In [None]:
# Read the green-claims CSV, report its (rows, columns) shape and preview
# the first rows as the cell output.
green_claims_csv = "../datasets/green_claims.csv"
df_gc = pandas.read_csv(green_claims_csv)
print(f"Dataset Size: {df_gc.shape}")
df_gc.head()

In [None]:
# Read the sustainability-goals CSV, then discard rows with a missing text
# block or label and rows whose text block repeats an earlier one.
df_so = (
    pandas.read_csv("../datasets/sustainability_goals.csv")
    .dropna(subset=["Text Blocks", "Goal"])
    .drop_duplicates(subset=["Text Blocks"])
)

# Report the cleaned shape and the sum of the "Goal" column, then preview.
goal_total = df_so["Goal"].sum()
print(f"Dataset Size: {df_so.shape}")
print("The Number of Goals:", goal_total)
df_so.head()

### Preprocessing the Datasets

In [None]:
# Keywords passed to the text-block filter as `keep_only_keywords`.
sustainability_keywords = [
    # Environment-related terms
    "green",
    "environment",
    "carbon",
    "footprint",
    "co2",
    "emission",
    "pollution",
    "recycle",
    "waste",
    "plant",
    "energy",
    "renewable",
    "water",
    "electricity",
    # People-related terms
    "diversity",
    "employee",
    "women",
    "female",
    "human",
    "inclusion",
    "health",
    "safety",
    "security",
    # Currently disabled candidates: "goal", "sustainable", "zero", "right"
]
# Create the preprocessing helper from the project-local `data_preprocessing`
# module; the sustainability-goals cell calls its `filter_text_blocks` method.
data_preprocessor = data_preprocessing.DataPreprocessing()

In [None]:
# Reduce the green-claims frame to the two columns used downstream:
# "text" (the tweet) and "labels" (0 = not_green, 1 = green_claim).
df_gc["text"] = df_gc["tweet"].copy()
# Optional heavy text cleaning, currently disabled:
# df_gc = data_preprocessor.clean_text_blocks(df_gc, "text", level="heavy")

# Encode the string classes explicitly. `map` + `astype` does not depend on
# the silent object -> int downcast that `Series.replace` performed (deprecated
# in pandas 2.2), and it raises immediately if a row carries a label other
# than the two expected ones instead of leaving a string in the column.
label_ids = {"not_green": 0, "green_claim": 1}
df_gc["labels"] = df_gc["label_binary"].map(label_ids).astype("int64")
df_gc = df_gc[["text", "labels"]]

In [None]:
# Reduce the sustainability-goals frame to the columns "text" and "labels".
# Copy the raw text blocks into a "text" column.
df_so["text"] = df_so["Text Blocks"].copy()
# Optional heavy text cleaning, currently disabled:
# df_so = data_preprocessor.clean_text_blocks(df_so, "text", level="heavy")
# NOTE(review): presumably keeps only rows whose "text" size lies within
# (0, 300) and that mention at least one sustainability keyword. The unit of
# "size" and the matching rule live in data_preprocessing.filter_text_blocks
# and are not visible here - confirm before relying on them.
df_so = data_preprocessor.filter_text_blocks(df_so, "text", keep_only_size=(0, 300), keep_only_keywords=sustainability_keywords)
# Labels are taken from "Goal" after filtering, so they align with the kept rows.
df_so["labels"] = df_so["Goal"].copy()
df_so = df_so[["text", "labels"]]

### Splitting the Data

In [None]:
# Stratified 80/20 split of the green-claims data with a fixed seed, followed
# by the shape and class balance of each part.
df_gc_train, df_gc_test = sklearn.model_selection.train_test_split(
    df_gc, test_size=0.2, random_state=7, stratify=df_gc["labels"]
)
for split_name, split_frame in (("Train", df_gc_train), ("Test", df_gc_test)):
    print(f"{split_name} Set Size:", split_frame.shape)
    print(split_frame["labels"].value_counts())

In [None]:
# Stratified 80/20 split of the sustainability-goals data with a fixed seed,
# followed by the shape and class balance of each part.
df_so_train, df_so_test = sklearn.model_selection.train_test_split(
    df_so, test_size=0.2, random_state=7, stratify=df_so["labels"]
)
for split_name, split_frame in (("Train", df_so_train), ("Test", df_so_test)):
    print(f"{split_name} Set Size:", split_frame.shape)
    print(split_frame["labels"].value_counts())

## === Baseline 1: BERTClaimBuster ===

In [None]:
# Baseline 1 on the green-claims test split: apply the pretrained ClaimBuster
# BERT classifier without any further training.
t = time.time()
checkpoint = "Nithiwat/bert-base_claimbuster"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
# return_all_scores=True yields, per text, one {"label", "score"} dict per
# class; the code below indexes these by position (0, 1, 2).
pipe = transformers.TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# The printed time covers loading the tokenizer/model only, not inference.
print(time.time() - t)

# truncation=True clips over-long inputs to the model's maximum sequence
# length instead of letting them fail inside the model.
predictions = pipe(df_gc_test["text"].tolist(), truncation=True)
# A text is predicted positive (1) when class index 2 scores at least as high
# as both other classes; ints keep the predictions the same type as the labels.
# NOTE(review): assumes index 2 is the checkpoint's "claim" class - confirm
# against model.config.id2label.
y_predicted = [int(p[2]["score"] >= max(p[0]["score"], p[1]["score"])) for p in predictions]
evaluation_metrics = sklearn.metrics.classification_report(df_gc_test["labels"], y_predicted)
print(evaluation_metrics)

In [None]:
# Baseline 1 on the sustainability-goals test split: apply the pretrained
# ClaimBuster BERT classifier without any further training.
t = time.time()
checkpoint = "Nithiwat/bert-base_claimbuster"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
# return_all_scores=True yields, per text, one {"label", "score"} dict per
# class; the code below indexes these by position (0, 1, 2).
pipe = transformers.TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# The printed time covers loading the tokenizer/model only, not inference.
print(time.time() - t)

# truncation=True clips over-long text blocks to the model's maximum sequence
# length instead of letting them fail inside the model.
predictions = pipe(df_so_test["text"].tolist(), truncation=True)
# A text is predicted positive (1) when class index 2 scores at least as high
# as both other classes; ints keep the predictions the same type as the labels.
# NOTE(review): assumes index 2 is the checkpoint's "claim" class - confirm
# against model.config.id2label.
y_predicted = [int(p[2]["score"] >= max(p[0]["score"], p[1]["score"])) for p in predictions]
evaluation_metrics = sklearn.metrics.classification_report(df_so_test["labels"], y_predicted)
print(evaluation_metrics)

## === Baseline 2: TFIDF + Random Forest ===

In [None]:
# Baseline 2 on the green-claims data: TF-IDF word features fed to a random
# forest whose number of trees is chosen by 4-fold grid search (macro F1).
t = time.time()
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(min_df=3, max_df=0.5, analyzer="word")
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with it.
x_train_vectorized = vectorizer.fit_transform(df_gc_train["text"])
x_test_vectorized = vectorizer.transform(df_gc_test["text"])
parameters_grid = {"n_estimators": range(50, 550, 50)}
# Seed the forest with the same value used for the train/test split so the
# selected hyperparameters and the reported scores are reproducible.
model = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.RandomForestClassifier(random_state=7),
    parameters_grid,
    scoring="f1_macro",
    cv=4,
    n_jobs=-1,
)
model.fit(x_train_vectorized, df_gc_train["labels"])
# Elapsed time covers vectorization plus the whole grid search.
print(time.time() - t)
print("Best found hyperparameters of Random Forest classfier = {}".format(model.best_params_))
y_predicted = model.predict(x_test_vectorized)
evaluation_metrics = sklearn.metrics.classification_report(df_gc_test["labels"], y_predicted)
print(evaluation_metrics)

In [None]:
# Baseline 2 on the sustainability-goals data: TF-IDF word features fed to a
# random forest whose number of trees is chosen by 4-fold grid search (macro F1).
t = time.time()
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(min_df=3, max_df=0.5, analyzer="word")
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with it.
x_train_vectorized = vectorizer.fit_transform(df_so_train["text"])
x_test_vectorized = vectorizer.transform(df_so_test["text"])
parameters_grid = {"n_estimators": range(50, 550, 50)}
# Seed the forest with the same value used for the train/test split so the
# selected hyperparameters and the reported scores are reproducible.
model = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.RandomForestClassifier(random_state=7),
    parameters_grid,
    scoring="f1_macro",
    cv=4,
    n_jobs=-1,
)
model.fit(x_train_vectorized, df_so_train["labels"])
# Elapsed time covers vectorization plus the whole grid search; it is printed
# right after fitting, in the same order as the green-claims cell.
print(time.time() - t)
print("Best found hyperparameters of Random Forest classfier = {}".format(model.best_params_))
y_predicted = model.predict(x_test_vectorized)
evaluation_metrics = sklearn.metrics.classification_report(df_so_test["labels"], y_predicted)
print(evaluation_metrics)

## === Baseline 3: Bin_RoBERTa ===

In [None]:
# Baseline 3 on the green-claims data: fit the project's TextClassification
# wrapper configured with "roberta-base" and print the elapsed seconds.
t = time.time()
target_values = ["Not Green", "Green"]
roberta_settings = {
    "name": "roberta-base",
    "epochs": 10,
    "learning_rate": 2e-5,
    "batch_size": 32,
    "weight_decay": 0.0,
    # NOTE(review): every training cell in this notebook passes the same
    # save_to path; presumably later runs overwrite earlier artifacts - confirm.
    "save_to": "../models/temp",
}
model = transformer_model.TextClassification(target_values, **roberta_settings)
# NOTE(review): the test split is handed to fit() as its second argument;
# confirm in transformer_model that it is used for evaluation only.
model.fit(df_gc_train, df_gc_test)
elapsed_seconds = time.time() - t
print(elapsed_seconds)

In [None]:
# Baseline 3 on the sustainability-goals data: fit the project's
# TextClassification wrapper configured with "roberta-base" and print the
# elapsed seconds.
t = time.time()
target_values = ["Not Green", "Green"]
roberta_settings = {
    "name": "roberta-base",
    "epochs": 10,
    "learning_rate": 2e-5,
    "batch_size": 32,
    "weight_decay": 0.0,
    # NOTE(review): every training cell in this notebook passes the same
    # save_to path; presumably later runs overwrite earlier artifacts - confirm.
    "save_to": "../models/temp",
}
model = transformer_model.TextClassification(target_values, **roberta_settings)
# NOTE(review): the test split is handed to fit() as its second argument;
# confirm in transformer_model that it is used for evaluation only.
model.fit(df_so_train, df_so_test)
elapsed_seconds = time.time() - t
print(elapsed_seconds)

## === Training and Testing Our Model on the Datasets ===

In [None]:
# Our model on the green-claims data: fit the project's TextClassification
# wrapper configured with "distilroberta-base" and print the elapsed seconds.
t = time.time()
target_values = ["Not Green", "Green"]
distilroberta_settings = {
    "name": "distilroberta-base",
    "epochs": 10,
    "learning_rate": 5e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    # NOTE(review): every training cell in this notebook passes the same
    # save_to path; presumably later runs overwrite earlier artifacts - confirm.
    "save_to": "../models/temp",
}
model = transformer_model.TextClassification(target_values, **distilroberta_settings)
# NOTE(review): the test split is handed to fit() as its second argument;
# confirm in transformer_model that it is used for evaluation only.
model.fit(df_gc_train, df_gc_test)
elapsed_seconds = time.time() - t
print(elapsed_seconds)

In [None]:
# Our model on the sustainability-goals data: fit the project's
# TextClassification wrapper configured with "distilroberta-base" and print
# the elapsed seconds.
t = time.time()
target_values = ["Not Green", "Green"]
distilroberta_settings = {
    "name": "distilroberta-base",
    "epochs": 10,
    "learning_rate": 5e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    # NOTE(review): every training cell in this notebook passes the same
    # save_to path; presumably later runs overwrite earlier artifacts - confirm.
    "save_to": "../models/temp",
}
model = transformer_model.TextClassification(target_values, **distilroberta_settings)
# NOTE(review): the test split is handed to fit() as its second argument;
# confirm in transformer_model that it is used for evaluation only.
model.fit(df_so_train, df_so_test)
elapsed_seconds = time.time() - t
print(elapsed_seconds)