In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

lemmatizer = WordNetLemmatizer()

def clean_data(df):

    df = df[df['HTML_Content'].isin(['Explicit', 'General Audiences', 'Mature', 'Not Rated', 'Teen And Up Audiences'])]
    df = df.dropna(subset=['TXT_Content'])

    def clean_text(text):
        if not isinstance(text, str):
            text = str(text)
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
        return ' '.join(cleaned_text)

    df['TXT_Content'] = df['TXT_Content'].apply(clean_text)
    return df

dir_path = 'C:/Users/erich/Desktop/DS_project/data'

for filename in os.listdir(dir_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(dir_path, filename)
        data = pd.read_csv(file_path)
        cleaned_data = clean_data(data)
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(cleaned_data['TXT_Content'])
        y = cleaned_data['HTML_Content']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = svm.SVC(C=1, gamma=1, kernel='rbf')

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        print("File:", filename)
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Recall:", recall_score(y_test, y_pred, average='macro'))
        print("F1:", f1_score(y_test, y_pred, average='macro'))
        print("\n")


File: data_10_sentences.csv
Accuracy: 0.37385512073272276
Recall: 0.2661005608913996
F1: 0.23021419143742108


File: data_15_sentences.csv
Accuracy: 0.38467943380516234
Recall: 0.27792286180734893
F1: 0.24687846605574712


File: data_20_sentences.csv
Accuracy: 0.39050791007493757
Recall: 0.28803638546601384
F1: 0.26391068340409507


File: data_30_sentences.csv
Accuracy: 0.39666666666666667
Recall: 0.29859566330877685
F1: 0.27898821888311487


File: data_40_sentences.csv
Accuracy: 0.39666666666666667
Recall: 0.3033771033096844
F1: 0.28791245160017054


File: data_50_sentences.csv
Accuracy: 0.39217318900915904
Recall: 0.2976025873433687
F1: 0.2812248522663511


File: data_5_sentences.csv
Accuracy: 0.37468776019983346
Recall: 0.2660953857955118
F1: 0.22841471501185415


File: data_60_sentences.csv
Accuracy: 0.39633638634471274
Recall: 0.30518368747859204
F1: 0.291014736586415


