In [None]:
import pandas as pd
import re
import numpy as np

In [None]:
# Load the training and validation splits from the local data/ directory
train, val = map(pd.read_csv, ('data/train.csv', 'data/val.csv'))

In [None]:
# Helper function for cleaning text
def clean_html(text):
    """Strip HTML markup from a single value and return plain text.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted with ``str`` and then:

    1. HTML tags are removed, including tags that span several lines.
    2. Runs of whitespace are collapsed to one space and the ends trimmed.
    3. The entities ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;``/``&#34;`` and
       ``&apos;``/``&#39;`` are decoded exactly once.

    Parameters
    ----------
    text : object
        Scalar to clean, e.g. one cell of a DataFrame.

    Returns
    -------
    object
        ``text`` itself when it is missing, otherwise the cleaned ``str``.
    """
    if pd.isna(text):
        return text
    # `[^>]*` also matches newlines, so a tag broken across lines is removed
    # (the previous `<.*?>` stopped at the first line break).
    clean = re.sub(r'<[^>]*>', '', str(text))
    # Collapse runs of whitespace and trim both ends
    clean = re.sub(r'\s+', ' ', clean).strip()
    # Decode all supported entities in ONE pass. Sequential substitutions that
    # start with `&amp;` rescan their own output and double-decode: the
    # escaped text `&amp;lt;` must become `&lt;`, not `<`.
    entities = {
        'amp': '&',
        'lt': '<',
        'gt': '>',
        'quot': '"',
        '#34': '"',
        'apos': "'",
        '#39': "'",
    }
    clean = re.sub(
        r'&(amp|lt|gt|quot|apos|#34|#39);',
        lambda match: entities[match.group(1)],
        clean,
    )
    return clean

In [None]:
# Cleaning data: run clean_html element-wise over every cell of both frames
train, val = (frame.map(clean_html) for frame in (train, val))

In [None]:
# Pre-processing for Count Vectorizer: bag-of-words term counts with English
# stop words removed. The vocabulary is learned from the training snippets
# only and then reused to encode the validation snippets.
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
# clean_html passes missing values through unchanged, and the vectorizer
# raises on a NaN document. Treat a missing snippet as an empty document so
# row alignment with the 'channel' labels is preserved.
train_features_cv = count_vectorizer.fit_transform(train['snip'].fillna(''))
val_features_cv = count_vectorizer.transform(val['snip'].fillna(''))

In [None]:
# Training LR model for Count Vectorizer features, then predicting the
# channel of every validation snippet.
from sklearn.linear_model import LogisticRegression
# The saga solver shuffles the data stochastically; pin random_state so that
# Restart Kernel -> Run All reproduces the same model and accuracy.
LR_cv = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
LR_cv.fit(train_features_cv, train['channel'])
predictions_cv = LR_cv.predict(val_features_cv)

In [None]:
# Pre-processing for TF-IDF Vectorizer: TF-IDF weighted term features with
# English stop words removed. The vocabulary and IDF weights are learned from
# the training snippets only and then reused for the validation snippets.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# clean_html passes missing values through unchanged, and the vectorizer
# raises on a NaN document. Treat a missing snippet as an empty document so
# row alignment with the 'channel' labels is preserved.
train_features_tv = tfidf_vectorizer.fit_transform(train['snip'].fillna(''))
val_features_tv = tfidf_vectorizer.transform(val['snip'].fillna(''))

In [None]:
# Training LR model for TF-IDF Vectorizer features, then predicting the
# channel of every validation snippet.
from sklearn.linear_model import LogisticRegression
# The saga solver shuffles the data stochastically; pin random_state so that
# Restart Kernel -> Run All reproduces the same model and accuracy.
LR_tv = LogisticRegression(solver='saga', max_iter=10000, random_state=42)
LR_tv.fit(train_features_tv, train['channel'])
predictions_tv = LR_tv.predict(val_features_tv)

In [None]:
# Measuring overall accuracy and per-class precision/recall/F1 of both models
# against the validation labels.
from sklearn.metrics import accuracy_score, classification_report
true = val['channel'].to_numpy()
accuracy_cv = accuracy_score(true, predictions_cv)
print(f"The accuracy of the Count Vectorizer Logistic Regressor is {accuracy_cv}")
accuracy_tv = accuracy_score(true, predictions_tv)
print(f"The accuracy of the TF-IDF Vectorizer Logistic Regressor is {accuracy_tv}")

# classification_report lists precision, recall, F1 and support per class
# (not accuracy), so the headings name those metrics.
# zero_division=np.nan reports undefined scores (e.g. a class that is never
# predicted) as NaN rather than warning and substituting 0.
print("The per-class precision, recall and F1 of the Count Vectorizer Logistic Regressor are as below:")
print(classification_report(true, predictions_cv, zero_division=np.nan))
print("The per-class precision, recall and F1 of the TF-IDF Vectorizer Logistic Regressor are as below:")
print(classification_report(true, predictions_tv, zero_division=np.nan))