In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

lemmatizer = WordNetLemmatizer()

def clean_data(df):

    df = df[df['HTML_Content'].isin(['Explicit', 'General Audiences', 'Mature', 'Not Rated', 'Teen And Up Audiences'])]
    df = df.dropna(subset=['TXT_Content'])

    def clean_text(text):
        if not isinstance(text, str):
            text = str(text)
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
        return ' '.join(cleaned_text)
    df['TXT_Content'] = df['TXT_Content'].apply(clean_text)
    return df

dir_path = 'C:/Users/erich/Desktop/DS_project/data'

for filename in os.listdir(dir_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(dir_path, filename)
        data = pd.read_csv(file_path)
        cleaned_data = clean_data(data)

        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(cleaned_data['TXT_Content'])
        y = cleaned_data['HTML_Content']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = MultinomialNB(alpha=0.5, fit_prior=True)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        print("File:", filename)
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Recall:", recall_score(y_test, y_pred, average='macro'))
        print("F1:", f1_score(y_test, y_pred, average='macro'))
        print("\n")


File: data_10_sentences.csv
Accuracy: 0.34388009991673607
Recall: 0.22337086727658545
F1: 0.14536482754307165


File: data_15_sentences.csv
Accuracy: 0.33388842631140714
Recall: 0.2144187210282343
F1: 0.1313703828406652


File: data_20_sentences.csv
Accuracy: 0.33222314737718567
Recall: 0.21279669721871047
F1: 0.12828503652126427


File: data_30_sentences.csv
Accuracy: 0.33166666666666667
Recall: 0.21563768812233622
F1: 0.12967825241781578


File: data_40_sentences.csv
Accuracy: 0.32416666666666666
Recall: 0.21289780254922563
F1: 0.12321401379943167


File: data_50_sentences.csv
Accuracy: 0.33139050791007496
Recall: 0.2117182890855457
F1: 0.12554596495864895


File: data_5_sentences.csv
Accuracy: 0.3480432972522898
Recall: 0.2284277549515381
F1: 0.1565751918341684


File: data_60_sentences.csv
Accuracy: 0.33305578684429643
Recall: 0.21333412581008732
F1: 0.1268285293834604


