In [3]:
from enum import Enum

import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, PredefinedSplit


class OutputEnum(Enum):
    """Label columns that the classifiers in this notebook predict.

    Each member's value is used verbatim as a DataFrame column name when the
    target matrix is selected (``[o.value for o in OutputEnum]``), so the
    declaration order below also fixes the order of the target columns.
    """

    # --- labels whose column name starts with "misleading" ---
    MISLEADING_OTHER = "misleadingOther"
    MISLEADING_FACTUAL_ERROR = "misleadingFactualError"
    MISLEADING_MANIPULATED_MEDIA = "misleadingManipulatedMedia"
    MISLEADING_OUTDATED_INFORMATION = "misleadingOutdatedInformation"
    MISLEADING_MISSING_IMPORTANT_CONTEXT = "misleadingMissingImportantContext"
    MISLEADING_UNVERIFIED_CLAIM_AS_FACT = "misleadingUnverifiedClaimAsFact"
    MISLEADING_SATIRE = "misleadingSatire"

    # --- stand-alone label ---
    TRUSTWORTHY_SOURCES = "trustworthySources"

    # --- labels whose column name starts with "notMisleading" ---
    NOT_MISLEADING_FACTUALLY_CORRECT = "notMisleadingFactuallyCorrect"
    NOT_MISLEADING_OUTDATED_BUT_NOT_WHEN_WRITTEN = (
        "notMisleadingOutdatedButNotWhenWritten"
    )
    NOT_MISLEADING_CLEARLY_SATIRE = "notMisleadingClearlySatire"
    NOT_MISLEADING_PERSONAL_OPINION = "notMisleadingPersonalOpinion"


# Fixed seed so the row subsamples drawn below are reproducible across runs.
SEED = 42


def _read_split(path, n_rows=None):
    """Read one tab-separated split file, optionally subsampling its rows.

    Args:
        path: Location of the TSV file to read.
        n_rows: Number of rows to draw at random (seeded with ``SEED``);
            ``None`` keeps every row of the file.

    Returns:
        The loaded, and possibly subsampled, ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, sep="\t")
    if n_rows is None:
        return frame
    return frame.sample(n_rows, random_state=SEED)


# Subsample of the train split; these rows are never scored against during
# the grid search.
df_train = _read_split("train.tsv", n_rows=500_000)

# Subsample of the test split; every grid-search candidate is scored on it.
# NOTE(review): this count was spelled `75_00` (= 7,500) -- confirm that
# 75_000 was not the intended size.
df_test = _read_split("test.tsv", n_rows=7_500)

# A single predefined fold: -1 keeps a row out of every evaluation fold,
# 0 assigns it to evaluation fold 0.
fold_of_row = [-1] * len(df_train) + [0] * len(df_test)
ps = PredefinedSplit(fold_of_row)

# Row order here must match `fold_of_row`: train rows first, then test rows.
df_train_test = pd.concat([df_train, df_test])

# Full validation split, used only for the final classification reports.
df_val = _read_split("validation.tsv")

# Both vectorizers share the same tokenisation: lowercase the text, drop
# scikit-learn's built-in English stop words, and keep only runs of four or
# more lowercase ASCII letters as tokens.
vectorizer_options = dict(
    lowercase=True,
    stop_words="english",
    token_pattern=r"[a-z]{4,}",
)

# Raw term counts -> ridge classifier.
bow_pipeline = make_pipeline(
    CountVectorizer(**vectorizer_options),
    RidgeClassifier(),
)

# TF-IDF weighted term counts -> ridge classifier.
tfidf_pipeline = make_pipeline(
    TfidfVectorizer(**vectorizer_options),
    RidgeClassifier(),
)

In [4]:
# Hyper-parameter grids for the two pipelines. Keys follow scikit-learn's
# "<step name>__<parameter>" convention; each grid spans
# 2 * 3 * 3 * 3 = 54 candidate settings.

bow_grid = {
    # Unigrams only, or unigrams plus bigrams.
    "countvectorizer__ngram_range": [(1, 1), (1, 2)],
    # Ignore terms that appear in more than this fraction of documents.
    "countvectorizer__max_df": [0.8, 0.9, 0.99],
    # Ignore terms that appear in fewer than this many documents.
    "countvectorizer__min_df": [1, 2, 5],
    # Regularisation strength of the ridge classifier.
    "ridgeclassifier__alpha": [0.1, 1.0, 10.0],
}

tfidf_grid = {
    # Unigrams only, or unigrams plus bigrams.
    "tfidfvectorizer__ngram_range": [(1, 1), (1, 2)],
    # Ignore terms that appear in more than this fraction of documents.
    "tfidfvectorizer__max_df": [0.8, 0.9, 0.99],
    # Ignore terms that appear in fewer than this many documents.
    "tfidfvectorizer__min_df": [1, 2, 5],
    # Regularisation strength of the ridge classifier.
    "ridgeclassifier__alpha": [0.1, 1.0, 10.0],
}


In [None]:
# Target columns, in OutputEnum declaration order. Built once so that fitting,
# evaluation and the report's row names are guaranteed to agree.
label_columns = [o.value for o in OutputEnum]

# Exhaustive search over `bow_grid` using the single predefined fold in `ps`.
# No `scoring` is passed, so candidates are ranked by the pipeline's own
# default `score`; `refit=True` then refits the best candidate on everything
# passed to `fit`, which is what `predict` below uses.
bow_search = GridSearchCV(
    bow_pipeline, bow_grid, n_jobs=-1, verbose=2, refit=True, cv=ps
)
bow_search.fit(df_train_test["summary"], df_train_test[label_columns])
print(bow_search.best_params_)

# Final evaluation on the validation split. `target_names` labels each report
# row with its column name instead of a bare positional index (0..11).
print(
    classification_report(
        df_val[label_columns],
        bow_search.predict(df_val["summary"]),
        target_names=label_columns,
    )
)

In [None]:
# Target columns, in OutputEnum declaration order. Built once so that fitting,
# evaluation and the report's row names are guaranteed to agree.
label_columns = [o.value for o in OutputEnum]

# Exhaustive search over `tfidf_grid` using the single predefined fold in `ps`.
# No `scoring` is passed, so candidates are ranked by the pipeline's own
# default `score`; `refit=True` then refits the best candidate on everything
# passed to `fit`, which is what `predict` below uses.
tfidf_search = GridSearchCV(
    tfidf_pipeline, tfidf_grid, n_jobs=-1, verbose=2, refit=True, cv=ps
)
tfidf_search.fit(df_train_test["summary"], df_train_test[label_columns])
print(tfidf_search.best_params_)

# Final evaluation on the validation split. `target_names` labels each report
# row with its column name instead of a bare positional index (0..11).
print(
    classification_report(
        df_val[label_columns],
        tfidf_search.predict(df_val["summary"]),
        target_names=label_columns,
    )
)