In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

data = pd.read_csv(r'C:\Users\erich\Desktop\DS_project\data\data_30_sentences.csv')

lemmatizer = WordNetLemmatizer()
def clean_data(df):
    df = df[df['HTML_Content'] != 'Not Rated']
    df = df.dropna(subset=['TXT_Content'])

    def clean_text(text):
        if not isinstance(text, str):
            text = str(text)
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
        return ' '.join(cleaned_text)

    df['TXT_Content'] = df['TXT_Content'].apply(clean_text)
    return df

cleaned_data = clean_data(data)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_data['TXT_Content'])
y = cleaned_data['HTML_Content']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Naive Bayes': MultinomialNB(alpha=0.5, fit_prior=True),
    'Random Forest': RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=400),
    'SVM': SVC(C=1, gamma=1, kernel='rbf', probability=True)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"{name} - Accuracy: {accuracy}, Recall: {recall}, F1-Score: {f1}")




Naive Bayes - Accuracy: 0.37488284910965325, Recall: 0.37488284910965325, F1-Score: 0.23541070124916727
Random Forest - Accuracy: 0.4329896907216495, Recall: 0.4329896907216495, F1-Score: 0.35207640174548743
SVM - Accuracy: 0.42642924086223055, Recall: 0.42642924086223055, F1-Score: 0.36859497060248664


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

data = pd.read_csv(r'C:\Users\erich\Desktop\DS_project\data\30_topic.csv')

data = data[data['HTML_Content'] != 'Not Rated']
data = data.dropna(subset=['TXT_Content'])

lemmatizer = WordNetLemmatizer()
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokenized_text = word_tokenize(text)
    cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
    return ' '.join(cleaned_text)

data['TXT_Content'] = data['TXT_Content'].apply(clean_text)

vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(data['TXT_Content'])

mlb = MultiLabelBinarizer()
X_topic = mlb.fit_transform(data['Topic'])

weight_factor = 0.5
X_text_weighted = X_text.multiply(weight_factor)
X_topic_weighted = X_topic * weight_factor

X = hstack([X_text_weighted, X_topic_weighted])

le = LabelEncoder()
y = le.fit_transform(data['HTML_Content'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:

models = {
    'Naive Bayes': MultinomialNB(alpha=0.5, fit_prior=True),
    'Random Forest': RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=400),
    'SVM': SVC(C=1, gamma=1, kernel='rbf', probability=True)
}


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"{name} - Accuracy: {accuracy}, Recall: {recall}, F1-Score: {f1}")

Naive Bayes - Accuracy: 0.396231, Recall: 0.396231, F1-Score: 0.245807
Random Forest - Accuracy: 0.455482, Recall: 0.455482, F1-Score: 0.38085
SVM - Accuracy: 0.439241, Recall: 0.439241, F1-Score: 0.376052
