In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

lemmatizer = WordNetLemmatizer()

def clean_data(df):
    df = df[df['HTML_Content'].isin(['Explicit', 'General Audiences', 'Mature', 'Not Rated', 'Teen And Up Audiences'])]
    df = df.dropna(subset=['TXT_Content'])

    def clean_text(text):
        if not isinstance(text, str):
            text = str(text)
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
        return ' '.join(cleaned_text)
    df['TXT_Content'] = df['TXT_Content'].apply(clean_text)
    return df

dir_path = 'C:/Users/erich/Desktop/DS_project/data'

for filename in os.listdir(dir_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(dir_path, filename)
        data = pd.read_csv(file_path)
        cleaned_data = clean_data(data)

        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(cleaned_data['TXT_Content'])
        y = cleaned_data['HTML_Content']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=400)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        print("File:", filename)
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Recall:", recall_score(y_test, y_pred, average='macro'))
        print("F1:", f1_score(y_test, y_pred, average='macro'))
        print("\n")


File: data_10_sentences.csv
Accuracy: 0.37385512073272276
Recall: 0.27231282622512404
F1: 0.24428873820402738


File: data_15_sentences.csv
Accuracy: 0.37801831806827646
Recall: 0.2748652411207571
F1: 0.24572018069255072


File: data_20_sentences.csv
Accuracy: 0.3871773522064946
Recall: 0.28453950979796144
F1: 0.260130409620739


File: data_30_sentences.csv
Accuracy: 0.38416666666666666
Recall: 0.2850876397699747
F1: 0.258843358812463


File: data_40_sentences.csv
Accuracy: 0.39166666666666666
Recall: 0.29628878504542466
F1: 0.2735422985017405


File: data_50_sentences.csv
Accuracy: 0.4013322231473772
Recall: 0.30120467956409275
F1: 0.28261070530357524


File: data_5_sentences.csv
Accuracy: 0.37801831806827646
Recall: 0.2780095301763804
F1: 0.2540882979158484


File: data_60_sentences.csv
Accuracy: 0.4013322231473772
Recall: 0.3049930059840989
F1: 0.2902996466421465


