In [1]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import spacy
spacy.cli.download("en_core_web_sm")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import re

import emoji


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn import svm


import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold#, GridSearchCV

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shared spaCy English pipeline; TextPreprocessor reads this module-level name.
nlp = spacy.load("en_core_web_sm")

# Tab-separated tweet dataset; the first row carries the column names.
df_path = "../../scitweets.tsv"
df = pd.read_csv(df_path, sep='\t', header=0)

# Drop every column whose name contains "Unnamed", plus the tweet identifier.
columns_to_drop = [name for name in df.columns if "Unnamed" in name]
columns_to_drop.append("tweet_id")
df = df.drop(columns=columns_to_drop)

# Cast the tweet text to str so the text transformers always receive strings.
df["text"] = df["text"].astype(str)

In [3]:
class EmojiExtractor(BaseEstimator, TransformerMixin):
	"""Add an ``emoji_count`` column and replace emojis in ``text`` by their names.

	The transformer is stateless: ``fit`` learns nothing and ``transform``
	works on a copy, so the caller's DataFrame is left untouched.
	"""

	def fit(self, X, y=None):
		"""Nothing to learn; return ``self`` to satisfy the estimator API."""
		return self

	def transform(self, X):
		"""Return a copy of ``X`` with emoji features applied.

		Parameters
		----------
		X : DataFrame with a string ``text`` column.

		Returns
		-------
		DataFrame
			Copy of ``X`` with a new ``emoji_count`` column and with every
			emoji in ``text`` replaced by its name wrapped in spaces.
		"""
		X = X.copy()

		def count_emojis(text):
			# Count whole emojis rather than code points: a per-character
			# lookup in EMOJI_DATA miscounts multi-code-point emojis
			# (skin-tone modifiers, ZWJ sequences, flags).
			return emoji.emoji_count(text)

		def replace_emojis(text):
			# Space delimiters make the emoji name a standalone token.
			return emoji.demojize(text, delimiters=(" ", " "))

		# Count before replacing: after demojize there are no emojis left.
		X["emoji_count"] = X["text"].apply(count_emojis)
		X["text"] = X["text"].apply(replace_emojis)

		return X

class LinkExtractor(BaseEstimator, TransformerMixin):
	"""Add a ``link_count`` column and replace every URL in ``text`` by ``[url]``.

	The transformer is stateless: ``fit`` learns nothing and ``transform``
	works on a copy of its input.
	"""

	# A single pattern is used for both counting and replacing so the two can
	# never disagree. ``\S+`` stops at the first whitespace, so several links
	# on the same line are counted individually (a greedy ``.*`` would swallow
	# the rest of the line and report them as one link).
	_URL_PATTERN = re.compile(r'https?:\/\/\S+')

	def fit(self, X, y=None):
		"""Nothing to learn; return ``self`` to satisfy the estimator API."""
		return self

	def transform(self, X):
		"""Return a copy of ``X`` with link features applied.

		Parameters
		----------
		X : DataFrame with a string ``text`` column.

		Returns
		-------
		DataFrame
			Copy of ``X`` with a new ``link_count`` column (number of
			http/https URLs found) and with each URL in ``text`` replaced
			by the literal token ``[url]``.
		"""
		X = X.copy()
		url_pattern = self._URL_PATTERN

		def count_links(text):
			return len(url_pattern.findall(text))

		def remove_links(text):
			return url_pattern.sub('[url]', text)

		# Count before replacing: afterwards the URLs are gone from the text.
		X["link_count"] = X["text"].apply(count_links)
		X["text"] = X["text"].apply(remove_links)

		return X
	
class TextPreprocessor(BaseEstimator, TransformerMixin):
	"""Lowercase and lemmatise ``text``, discarding stop words and punctuation.

	Relies on the module-level spaCy pipeline ``nlp``.
	"""

	def fit(self, X, y=None):
		"""Stateless transformer: return ``self`` unchanged."""
		return self

	def transform(self, X):
		"""Return a copy of ``X`` whose ``text`` column holds space-joined lemmas."""

		def preprocess(text):
			kept_lemmas = []
			for token in nlp(text.lower()):
				# Skip tokens spaCy flags as stop words or punctuation.
				if token.is_stop or token.is_punct:
					continue
				kept_lemmas.append(token.lemma_)
			return " ".join(kept_lemmas)

		result = X.copy()
		result["text"] = result["text"].apply(preprocess)
		return result

## Define the pipeline for the model

In [4]:
# Remove the three `scientific_*` columns, then split off "science_related" as the target.
dropped_columns = ["scientific_claim", "scientific_reference", "scientific_context"]
X = df.drop(columns=dropped_columns)
y = X.pop("science_related")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

# Widen the column display so more of each tweet is visible in the preview.
pd.set_option('display.max_colwidth', 110)
X_train.head()

Unnamed: 0,text
938,.@Frontier_Usyd is hosting a Frontotemporal Dementia (FTD) Information & Support Day for Families and Care...
470,If only people would stop interfering in other people's business 😜😜😜😜😜😜😜😜
326,when will the underwear stop flying
406,@martinhanratty @AtyHans @thenotimer I listened to a gospel radio station for a couple of hours--the confi...
953,when truth untold stops playing and the lights turn green.... https://t.co/lBFhLHyUbn


In [5]:
def display_result(predicted_y, true_y ):
    """Show a confusion-matrix heatmap next to the text classification report.

    Parameters
    ----------
    predicted_y : array-like of predicted labels (0 or 1).
    true_y : array-like of ground-truth labels (0 or 1).

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    class_labels = [0, 1]
    target_names = ['Non science related', 'Science related']

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Pass the labels explicitly so the matrix is always 2x2 and lines up with
    # the two tick labels (and with the report below), even when one class is
    # missing from both true_y and predicted_y.
    cm = confusion_matrix(true_y, predicted_y, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non science related", "Science related"],
            yticklabels=["Non science related", "Science related"], ax=axes[0])

    axes[0].set_xlabel("Predicted Label")
    axes[0].set_ylabel("True Label")
    axes[0].set_title("Confusion Matrix")

    class_report = classification_report(true_y, predicted_y, labels=class_labels, target_names=target_names)

    # The report is plain text; draw it in a monospace font on an axis-free panel.
    axes[1].text(0, 0.5, class_report, fontsize=12, family='monospace')
    axes[1].axis("off")
    axes[1].set_title("Classification Report")

    plt.tight_layout()
    plt.show()

In [6]:
# TF-IDF over 1- to 4-grams of the "text" column (English stop words removed).
tfidf_step = ("tfidf", TfidfVectorizer(stop_words='english', ngram_range=(1, 4)), "text")

# The two engineered count columns are forwarded unchanged.
count_step = ("pass_features", "passthrough", ["emoji_count", "link_count"])

column_transformer = ColumnTransformer(transformers=[tfidf_step, count_step])


def getPipeline(model):
    """Build the full preprocessing + classification pipeline around ``model``.

    Parameters
    ----------
    model : estimator used as the final ``"classifier"`` step.

    Returns
    -------
    Pipeline
        Steps, in order: ``emoji_processing``, ``link_processing``,
        ``text_processing``, ``feature_vectorizer``, ``classifier``.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from sklearn.base import clone

    return Pipeline([
        ("emoji_processing", EmojiExtractor()),
        ("link_processing", LinkExtractor()),
        ("text_processing", TextPreprocessor()),
        # Each pipeline gets its own unfitted copy of the column transformer.
        # Sharing the single module-level instance would make fitting one
        # pipeline silently refit the vectorizer inside every other pipeline.
        ("feature_vectorizer", clone(column_transformer)),
        ("classifier", model),
    ])

In [None]:
def objective(trial):
  """Optuna objective for the SVM: mean 3-fold macro-F1 on the training split.

  Samples ``C`` and ``kernel``; ``gamma`` is sampled only for the rbf kernel
  and is otherwise left at ``'scale'``.
  """
  # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
  # and samples from the same log-uniform range.
  C = trial.suggest_float('C', 1, 10.0, log=True)

  kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
  gamma = trial.suggest_float('gamma', 7, 10.0, log=True) if kernel == 'rbf' else 'scale'
  model = svm.SVC(C=C, kernel=kernel, gamma=gamma, random_state=1, class_weight='balanced')

  pipeline = getPipeline(model)

  # Shuffled 3-fold CV with a fixed seed so every trial sees the same folds.
  kfold = KFold(n_splits=3, random_state=1, shuffle=True)
  score = cross_val_score(pipeline, X=X_train, y=y_train,  cv=kfold, scoring='f1_macro').mean()

  return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=4)

[I 2025-05-01 13:25:52,972] A new study created in memory with name: no-name-937213c1-b13b-47e5-808a-31507f5355a6
  C = trial.suggest_loguniform('C', 1, 10.0)


In [None]:
# Report the best trial: header line, parameter dict, then the score to 3 decimals.
print(
    "Meilleurs hyperparamètres :",
    study.best_params,
    f"Meilleur score : {study.best_value:.3f}",
    sep="\n",
)


In [None]:
# Rebuild the SVM from *every* tuned hyperparameter. best_params always holds
# 'C' and 'kernel', and additionally 'gamma' when the rbf kernel won; picking
# only C and kernel would silently discard the tuned gamma. When the linear
# kernel won, gamma is absent and SVC falls back to its default, matching the
# 'scale' value used during the search.
model = svm.SVC(
  **study.best_params,
  random_state=1,
  class_weight='balanced'
)
pipeline = getPipeline(model)
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and plot the confusion matrix and report.
predictions = pipeline.predict(X_test)
display_result(predicted_y=predictions, true_y=y_test)

In [None]:
def objective(trial):
    """Optuna objective for the random forest: mean 3-fold macro-F1 on the training split."""
    # Sampled in the same order as the keys are listed.
    forest_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
    }
    model = RandomForestClassifier(
        random_state=1,
        class_weight='balanced',
        **forest_params,
    )

    # Shuffled 3-fold CV with a fixed seed so every trial sees the same folds.
    splitter = KFold(n_splits=3, random_state=1, shuffle=True)
    fold_scores = cross_val_score(
        getPipeline(model), X=X_train, y=y_train, cv=splitter, scoring='f1_macro'
    )
    return fold_scores.mean()

# Note: this rebinds `objective` and `study`, replacing the SVM search objects.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=6)

In [None]:
# Refit a random forest with the four hyperparameters tuned by the study
# (n_estimators, max_depth, min_samples_split, min_samples_leaf).
model = RandomForestClassifier(
    **study.best_params,
    random_state=1,
    class_weight='balanced',
)
pipeline = getPipeline(model)
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and plot the confusion matrix and report.
predictions = pipeline.predict(X_test)
display_result(predicted_y=predictions, true_y=y_test)

In [None]:
# Positions (not index labels) of the test rows the model got wrong.
misclassified_indices = np.where(predictions != y_test)[0]

# Collect the misclassified tweets with their true and predicted labels.
misclassified_tweets = X_test['text'].iloc[misclassified_indices].tolist()
misclassified_true = y_test.iloc[misclassified_indices].tolist()
misclassified_pred = predictions[misclassified_indices].tolist()

misclassified_df = pd.DataFrame(dict(
    tweet=misclassified_tweets,
    true_label=misclassified_true,
    predicted_label=misclassified_pred,
))
misclassified_df.head(20)

In [None]:
fitted_steps = pipeline.named_steps
tfidf_vectorizer = fitted_steps['feature_vectorizer'].named_transformers_['tfidf']

# The vectorizer was fitted on text that had already gone through the emoji,
# link and lemmatisation steps. Feeding it the raw tweets would score terms
# against a vocabulary they were never matched to, so replay the same fitted
# preprocessing steps first.
preprocessed = X
for step_name in ("emoji_processing", "link_processing", "text_processing"):
    preprocessed = fitted_steps[step_name].transform(preprocessed)

feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_matrix = tfidf_vectorizer.transform(preprocessed['text'])

# Sum each term's TF-IDF weight over all tweets to get one score per term.
feature_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

feature_df = pd.DataFrame({'word': feature_names, 'tfidf_score': feature_scores})

feature_df.sort_values(by='tfidf_score', ascending=False).head(20)
