In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [3]:
# Location of the raw dataset (CSV).
# The default is the original author's machine-specific absolute path; set the
# TRENDIFY_DATA_PATH environment variable to point at the file on any other
# machine without editing the notebook. An unset or empty variable falls back
# to the default, so existing behaviour is unchanged.
DATA_PATH = (
    os.environ.get("TRENDIFY_DATA_PATH")
    or r"C:\Users\meddh\Trendify\Data\social_media_viral_content_dataset.csv"
)

# Load the full dataset and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,post_id,platform,content_type,topic,language,region,post_datetime,hashtags,views,likes,comments,shares,engagement_rate,sentiment_score,is_viral
0,SM_100000,Instagram,text,Sports,ur,UK,2024-12-10 00:00:00,#tech #funny #music,2319102,122058,15800,861,0.0598,0.464,1
1,SM_100001,Instagram,carousel,Sports,ur,Brazil,2024-10-13 00:00:00,#news #fyp #funny #ai #trending,2538464,110368,11289,54887,0.0695,-0.8,1
2,SM_100002,YouTube Shorts,video,Technology,ur,UK,2024-05-03 00:00:00,#ai #news,1051176,87598,47196,44132,0.1702,0.416,0
3,SM_100003,X,text,Politics,ur,US,2024-08-04 00:00:00,#ai #funny,5271440,329465,774,59736,0.074,0.877,1
4,SM_100004,YouTube Shorts,text,Education,es,US,2024-03-28 00:00:00,#news #ai #viral #funny #fyp,3186256,199141,5316,83105,0.0903,0.223,1


In [4]:
# Parse the timestamp column once and store it back as a datetime column,
# then derive the calendar features from the parsed values.
# NOTE(review): every row in the preview above has a 00:00:00 time component,
# so post_hour may be constant across the dataset -- verify before relying on it.
post_timestamps = pd.to_datetime(df["post_datetime"])
df["post_datetime"] = post_timestamps

timestamp_parts = post_timestamps.dt
df["post_hour"] = timestamp_parts.hour    # hour of day
df["post_day"] = timestamp_parts.day      # day of month
df["post_month"] = timestamp_parts.month  # month number


In [5]:
# Name of the label column to predict.
TARGET = "is_viral"

# Columns treated as categorical inputs.
categorical_features = ["platform", "content_type", "topic", "language", "region"]

# Columns treated as numeric inputs.
# NOTE(review): views / likes / shares / comments / engagement_rate look like
# post-publication engagement figures; if is_viral was derived from them this
# leaks the target into the features -- confirm how the label was built.
numerical_features = [
    "sentiment_score",
    "post_hour", "post_day", "post_month",
    "views", "likes", "shares", "comments",
    "engagement_rate",
]

# Full model input: categorical columns first, then numeric ones.
features = [*categorical_features, *numerical_features]

# Split the frame into the model inputs (X) and the label column (y).
X, y = df[features], df[TARGET]


In [6]:
from sklearn.model_selection import StratifiedKFold, cross_validate


In [7]:
# 5-fold stratified splitter; shuffling with a fixed seed keeps the folds
# reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [8]:
# One-hot encode the categorical columns as a dense array; categories not seen
# during fitting are ignored instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Standardise the numeric columns.
numeric_scaler = StandardScaler()

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ("cat", categorical_encoder, categorical_features),
    ("num", numeric_scaler, numerical_features),
])


In [9]:
# Preprocessing followed by a Gaussian Naive Bayes classifier, bundled as a
# single estimator so both steps are fitted together.
naive_bayes_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", GaussianNB()),
])


In [10]:
# Metrics to evaluate; each result key maps to the scorer of the same name.
scoring = {
    metric_name: metric_name
    for metric_name in ("accuracy", "precision", "recall", "f1")
}

# Evaluate the pipeline on the stratified folds, collecting only the
# held-out (test) scores for each metric in `scoring`.
cv_results = cross_validate(
    estimator=naive_bayes_pipeline,
    X=X,
    y=y,
    scoring=scoring,
    cv=skf,
    return_train_score=False,
)


In [11]:
# Average each metric over the cross-validation folds.
# Pairs are (key in the summary dict, scorer name used in cv_results).
metrics_naive_bayes = {
    summary_key: cv_results[f"test_{scorer_name}"].mean()
    for summary_key, scorer_name in (
        ("accuracy", "accuracy"),
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1_score", "f1"),
    )
}

# Display the averaged metrics as the cell output.
metrics_naive_bayes


{'accuracy': np.float64(0.96),
 'precision': np.float64(0.947751928919843),
 'recall': np.float64(0.9978545826932924),
 'f1_score': np.float64(0.9721419151024415)}

In [12]:
# Final training on the full dataset (the cross-validation above only fitted
# per-fold copies of the pipeline).
naive_bayes_pipeline.fit(X, y)

# Save the trained pipeline to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; the previous inline open() call never closed it.
with open("../../models/trained/naive_bayes.pkl", "wb") as model_file:
    pickle.dump(naive_bayes_pipeline, model_file)
