In [2]:
import pandas as pd

In [3]:
# Paths to the tab-separated train / dev files (no header row)
train_file = '../data/train.txt'
dev_file = '../data/dev.txt'

# Both splits are parsed with the same options so they share the same two columns.
# NOTE(review): the statistics shown further down report fewer labels than texts
# for the training split, so some lines parse without a label — worth confirming
# against the raw file.
tsv_kwargs = {'sep': '\t', 'header': None, 'names': ['text', 'label']}
train_df = pd.read_csv(train_file, **tsv_kwargs)
dev_df = pd.read_csv(dev_file, **tsv_kwargs)

# Peek at the first training rows
print(train_df.head())

                                                text label
0  Saint-Nazaire se rêve en capitale des #énergie...     =
1  4eme Conférence internationale sur le changeme...     =
2  Rencontres #windustry 2014 Sascha Wiesner décr...     =
3  #Photos :Dans l’Ouest américain,les stigmates ...     -
4  Parc #éolien: entente conclut entre Port-Carti...     +


In [4]:
# Per-column summary of the training frame (count, unique, top, freq);
# a lower count for 'label' than for 'text' means some rows have no label.
train_df.describe()

Unnamed: 0,text,label
count,6925,6920
unique,6921,3
top,"J'aime une vidéo @YouTube : ""Fabriquer une éol...",=
freq,2,3102


In [5]:
# Per-column summary of the development frame (count, unique, top, freq)
dev_df.describe()

Unnamed: 0,text,label
count,688,688
unique,688,3
top,"Les Éd. De #logiciel, un écosystème plein de v...",=
freq,1,323


In [6]:
# (rows, columns) of the training frame
train_df.shape

(6925, 2)

In [7]:
# (rows, columns) of the development frame
dev_df.shape

(688, 2)

In [17]:
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stopwords

In [18]:
# Download the small French spaCy model that is loaded in the next cell.
# NOTE(review): `python` is resolved by the shell, which may not be the
# kernel's interpreter — confirm the model is installed where the kernel looks.
!python -m spacy download fr_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m:02[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [19]:
# Load the small French pipeline without the parser and NER, which the
# preprocessing function does not use. "tagger" was removed from the disable
# list: the fr_core_news_sm 3.x pipeline has no component with that name (it
# ships a morphologizer, which the lemmatizer relies on), so listing it had
# no effect and only made the intent harder to read.
nlp = spacy.load("fr_core_news_sm", disable=["parser", "ner"])
# Keep the imported stop words as a set for constant-time membership tests
fr_stopwords = set(fr_stopwords)

In [20]:
def preprocess_french_text(text):
    """Tokenize a French text and return its cleaned, lowercased lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is dropped when it is punctuation, when it is whitespace-only, or when
    its lowercased surface form is in ``fr_stopwords``. The lemma of every
    remaining token is lowercased and collected in document order.

    Args:
        text: Raw text to process. A value that is not a string (for example
            the NaN pandas uses for a missing cell) yields an empty list
            instead of raising inside spaCy.

    Returns:
        list[str]: Lowercased lemmas of the retained tokens.
    """
    # Guard against missing / non-text cells so Series.apply never crashes
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    return [
        token.lemma_.lower()
        for token in doc
        # Whitespace runs are emitted as tokens by spaCy; they carry no
        # lexical content, so they are filtered along with punctuation.
        if not (token.is_punct or token.is_space)
        and token.text.lower() not in fr_stopwords
    ]

In [21]:
# Add a 'tokens' column (the preprocessed token list of each text) to both splits
for split_df in (train_df, dev_df):
    split_df['tokens'] = split_df['text'].apply(preprocess_french_text)

In [22]:
# Inspect the first training rows, now including the 'tokens' column
train_df.head()

Unnamed: 0,text,label,tokens
0,Saint-Nazaire se rêve en capitale des #énergie...,=,"[saint-nazaire, rêve, capitale, énergie, marin..."
1,4eme Conférence internationale sur le changeme...,=,"[4eme, conférence, international, changement, ..."
2,Rencontres #windustry 2014 Sascha Wiesner décr...,=,"[rencontres, windustry, 2014, sascha, wiesner,..."
3,"#Photos :Dans l’Ouest américain,les stigmates ...",-,"[photo, ouest, américain, stigmate, sécheresse..."
4,Parc #éolien: entente conclut entre Port-Carti...,+,"[parc, éolien, entente, conclure, port-cartier..."


In [24]:
# Locate the training rows whose label is missing (mask computed once, reused below)
missing_label_mask = train_df['label'].isna()
print("Nombre de valeurs manquantes dans la colonne 'label':", missing_label_mask.sum())
print("Indices des lignes avec des labels manquants:", train_df.index[missing_label_mask].tolist())

# Keep only the labelled rows in a new frame; train_df itself is left untouched
train_df_clean = train_df.dropna(subset=['label'])
print(f"Dimensions d'origine: {train_df.shape}, Nouvelles dimensions: {train_df_clean.shape}")

Nombre de valeurs manquantes dans la colonne 'label': 5
Indices des lignes avec des labels manquants: [301, 2553, 2554, 2555, 2662]
Dimensions d'origine: (6925, 3), Nouvelles dimensions: (6920, 3)


In [25]:
# Utiliser les données nettoyées pour créer X_train et y_train
X_train = train_df_clean['tokens'].apply(lambda x: ' '.join(x))  # Join tokens into a single string
X_dev = dev_df['tokens'].apply(lambda x: ' '.join(x))

# Labels
y_train = train_df_clean['label']
y_dev = dev_df['label']

In [26]:
# train_df_clean was already built by the dropna() in an earlier cell, and
# X_train / y_train were derived from it, so recomputing it here was redundant.
# Verify instead that the cleaned frame and the derived series are consistent.
assert train_df_clean['label'].notna().all(), "train_df_clean still contains missing labels"
assert len(X_train) == len(y_train) == len(train_df_clean), "features and labels are misaligned"
print(f"Dimensions d'origine: {train_df.shape}, Nouvelles dimensions: {train_df_clean.shape}")

Dimensions d'origine: (6925, 3), Nouvelles dimensions: (6920, 3)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer with its default settings.
# NOTE(review): the inputs are already tokenized and re-joined with spaces; the
# vectorizer will tokenize them again with its own defaults — confirm that is intended.
vectorizer = TfidfVectorizer()

# Fit on the training strings only and produce the training feature matrix
X_train_tfidf = vectorizer.fit_transform(X_train)

# Encode the development strings with the already-fitted vectorizer
# (transform only, so nothing is fitted on the dev data)
X_dev_tfidf = vectorizer.transform(X_dev)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression with the iteration cap raised to 1000
model = LogisticRegression(max_iter=1000)

# Fit on the TF-IDF features of the cleaned (fully labelled) training set
model.fit(X_train_tfidf, y_train)

# Evaluate on the development set
y_pred = model.predict(X_dev_tfidf)
# accuracy_score is the share of correct predictions; it is labelled
# "Exactitude" so it is not confused with the per-class "precision" column
# of the classification report printed right below.
print(f"Exactitude (accuracy): {accuracy_score(y_dev, y_pred)*100:.2f}%")
print(classification_report(y_dev, y_pred))

Précision: 66.28%
              precision    recall  f1-score   support

           +       0.65      0.55      0.60       208
           -       0.70      0.52      0.59       157
           =       0.66      0.81      0.72       323

    accuracy                           0.66       688
   macro avg       0.67      0.62      0.64       688
weighted avg       0.66      0.66      0.66       688

