In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score

df = pd.read_csv(r'C:\Users\erich\Desktop\DS_project\data\cleaned_data.csv')

df = df[df['HTML_Content'].isin(['Teen And Up Audiences', 'Mature'])]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['TXT_Content'])
y = df['HTML_Content']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifiers = {
    "Naive Bayes": MultinomialNB(alpha=0.5, fit_prior=True),
    "Random Forest": RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=400),
    "SVM": SVC(C=1, gamma=1, kernel='rbf', probability=True)
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, pos_label='Mature'):.4f}")
    print(f"F1-score: {f1_score(y_test, y_pred, pos_label='Mature'):.4f}")
    print("-" * 50)




Classifier: Naive Bayes
Accuracy: 0.6768
Recall: 0.0104
F1-score: 0.0205
--------------------------------------------------
Classifier: Random Forest
Accuracy: 0.7056
Recall: 0.1510
F1-score: 0.2500
--------------------------------------------------
Classifier: SVM
Accuracy: 0.6920
Recall: 0.0990
F1-score: 0.1727
--------------------------------------------------
