In [4]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [5]:
path = "./archive/stemmed_cleaned_data.csv"

df = pd.read_csv(path, delimiter=",", quotechar='"')

# print(df.head())

In [6]:
# Prétraitement des tags
tags = [
    "Blockchain", "Data Science", "Technology", "Programming", "Poetry",
    "Cryptocurrency", "Machine Learning", "Life", "Bitcoin", "Writing",
    "Politics", "Startup", "Life Lessons", "Self Improvement", "Covid 19",
    "Software Development", "Love", "Python", "Business", "Health"
]

top_20_tags = tags[:20].copy()

df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df = df[df['tags'].apply(lambda tags: any(tag in top_20_tags for tag in tags))].copy()

df['main_tag'] = df['tags'].apply(lambda tags: next((tag for tag in tags if tag in top_20_tags), None))

df = df.dropna(subset=['clean_text'])  

In [7]:
# %% Transformation TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['clean_text'])

In [8]:
#Réduction de Dimension avec LDA (LDA nécessite y catégorique)
y = df['main_tag'].astype('category').cat.codes  # Encodage des classes
lda = LDA(n_components=min(len(np.unique(y))-1, 19))  # LDA limité à nb_classes-1
X_lda = lda.fit_transform(X_tfidf.toarray(), y)

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# %% Division des données en train/test
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42)

# %% Liste des valeurs de C à tester
C_values = [0.01, 0.1, 1, 10, 100]
results = []

# %% Test du SVM pour différentes valeurs de C
for C in C_values:
    try:
        print(f"\n🔹 Entraînement avec C = {C}", flush=True)  # Forçage affichage
        svm_model = SVC(kernel='rbf', C=C)  
        svm_model.fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        results.append(acc)  # Stocke la précision pour chaque C

        print(f"✔ C = {C} → Accuracy: {acc:.4f}", flush=True)
        print(classification_report(y_test, y_pred))

    except Exception as e:
        print(f"❌ Erreur avec C = {C} : {e}", flush=True)

# %% Moyenne des précisions
mean_accuracy = np.mean(results)
print(f"\n📌 Moyenne de l'accuracy sur toutes les valeurs de C: {mean_accuracy:.4f}")
print(f"📊 Résultats complets : {results}")




🔹 Entraînement avec C = 0.01
✔ C = 0.01 → Accuracy: 0.6283
              precision    recall  f1-score   support

           0       0.57      0.44      0.50       535
           1       0.62      0.75      0.68      1012
           2       0.56      0.56      0.56       548
           3       0.70      0.72      0.71       687
           4       0.55      0.46      0.50       666
           5       0.56      0.60      0.58       799
           6       0.79      0.65      0.72       586
           7       0.42      0.44      0.43       677
           8       0.50      0.48      0.49       744
           9       0.61      0.56      0.58       527
          10       0.69      0.73      0.71       925
          11       0.67      0.83      0.74      1037
          12       0.84      0.86      0.85       943
          13       0.61      0.73      0.67       864
          14       0.59      0.53      0.56       536
          15       0.52      0.53      0.52       632
          16       0.