In [2]:
# 1. Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import joblib

# 2. Load cleaned data
# The first CSV column has no header and holds 0, 1, 2, ... (it would
# otherwise be loaded as a column called "Unnamed: 0"), which looks like a
# DataFrame index that was written out with the data. Reading it back as the
# index keeps it from lingering as a meaningless extra column.
df = pd.read_csv("cleaned_data.csv", index_col=0)

# Inspect the first rows (last expression of the cell -> rich display)
df.head()


Unnamed: 0,Question,Answer,Category,text,clean_text
0,"South Korea's longest river, the Nakdong, flow...",Busan or Pusan,Geography,"South Korea's longest river, the Nakdong, flow...",south koreas longest river the nakdong flows t...
1,Developed by Francophone writers and politicia...,Aime Cesaire,Art and Literature,Developed by Francophone writers and politicia...,developed by francophone writers and politicia...
2,"In Argonauts of the Western Pacific, Bronislaw...",Kula or Kula ring or Kula exchange,Geography,"In Argonauts of the Western Pacific, Bronislaw...",in argonauts of the western pacific bronislaw ...
3,One of the deficiencies of the Standard Model ...,weak and gravity,Science and Nature,One of the deficiencies of the Standard Model ...,one of the deficiencies of the standard model ...
4,Appearing in the title of a 1982 book by evolu...,phenotype,Science and Nature,Appearing in the title of a 1982 book by evolu...,appearing in the title of a 1982 book by evolu...


In [3]:
# 3. Extract features and labels
# `clean_text` is the model input; `Category` is the prediction target.
X_raw = df["clean_text"]
y_raw = df["Category"]

# Encode the string categories as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Show which integer each category received.
# LabelEncoder maps every class to its position in `classes_`, so enumerating
# `classes_` reproduces the mapping without a second transform() call. The
# values are plain Python ints, so the printed dict stays readable whatever
# NumPy version is installed.
label_mapping = {category: code for code, category in enumerate(label_encoder.classes_)}
print("Label Mapping:", label_mapping)


Label Mapping: {'Art and Literature': 0, 'Entertainment': 1, 'Geography': 2, 'History': 3, 'Lifestyle': 4, 'Music': 5, 'Science and Nature': 6, 'Sport': 7}


In [4]:
# 4. Split into train/test sets
# 20% of the rows are held out for testing. The split is stratified on the
# encoded labels and uses a fixed random_state so it is repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, **split_options)


In [5]:
# 5. Tokenize + Vectorize + Save

# Two bag-of-words variants are built so they can be compared later:
# TF-IDF weighting and raw term counts.
vectorizer_types = {
    "tfidf": TfidfVectorizer,
    "count": CountVectorizer
}

# Vocabulary-size caps to try for each vectorizer
max_features_list = [5000, 10000, 20000]

# Flatten the (vectorizer kind x cap) grid, in the order a nested loop
# over the two collections would visit it.
sweep = [
    (vtype, VectClass, n)
    for vtype, VectClass in vectorizer_types.items()
    for n in max_features_list
]

for vtype, VectClass, n in sweep:
    # Unigrams + bigrams, English stop words removed, vocabulary capped at n
    vect = VectClass(stop_words="english", ngram_range=(1, 2), max_features=n)

    # The vocabulary is fitted on the training split only; the test split
    # is merely transformed with it.
    X_train = vect.fit_transform(X_train_raw)
    X_test = vect.transform(X_test_raw)

    # One file per artifact, keyed by its destination path
    artifacts = {
        f"vectorizer_{vtype}_{n}.joblib": vect,
        f"X_train_{vtype}_{n}.joblib": X_train,
        f"X_test_{vtype}_{n}.joblib": X_test,
    }
    for path, obj in artifacts.items():
        joblib.dump(obj, path)

    print(f"Saved {vtype} vectorizer and X for max_features={n}")

# The labels do not depend on the vectorizer, so they are written once
joblib.dump(y_train, "y_train.joblib")
joblib.dump(y_test, "y_test.joblib")

Saved tfidf vectorizer and X for max_features=5000
Saved tfidf vectorizer and X for max_features=10000
Saved tfidf vectorizer and X for max_features=20000
Saved count vectorizer and X for max_features=5000
Saved count vectorizer and X for max_features=10000
Saved count vectorizer and X for max_features=20000


['y_test.joblib']

In [9]:
# Second sweep: larger vocabulary caps, again for every vectorizer kind.
# NOTE: `vectorizer_types` is not defined in this cell; it must already
# exist in the kernel from the cell that runs the first sweep.
max_features_list = [30000, 40000, 50000]

for vtype, VectClass in vectorizer_types.items():
    for n in max_features_list:
        vect = VectClass(stop_words="english", ngram_range=(1, 2), max_features=n)

        # Fit on the training text, then reuse that vocabulary for the test text
        X_train = vect.fit_transform(X_train_raw)
        X_test = vect.transform(X_test_raw)

        # Save the fitted vectorizer and both matrices
        outputs = [
            (f"vectorizer_{vtype}_{n}.joblib", vect),
            (f"X_train_{vtype}_{n}.joblib", X_train),
            (f"X_test_{vtype}_{n}.joblib", X_test),
        ]
        for filename, payload in outputs:
            joblib.dump(payload, filename)

        print(f"Saved {vtype} vectorizer and X for max_features={n}")

Saved tfidf vectorizer and X for max_features=30000
Saved tfidf vectorizer and X for max_features=40000
Saved tfidf vectorizer and X for max_features=50000
Saved count vectorizer and X for max_features=30000
Saved count vectorizer and X for max_features=40000
Saved count vectorizer and X for max_features=50000


In [14]:
# Higher feature limits for further testing (TF-IDF only in this sweep)

max_features_list = [75000, 100000, 150000, 200000, 250000, 300000, 350000, 400000]

# Settings shared by every run; only the vocabulary cap varies
tfidf_options = {"ngram_range": (1, 2), "stop_words": "english"}

for n in max_features_list:
    vectorizer = TfidfVectorizer(max_features=n, **tfidf_options)

    # Vocabulary and IDF weights come from the training split only
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)

    # Save vectorizer and transformed matrices, keyed by destination file
    outputs = {
        f"vectorizer_tfidf_{n}.joblib": vectorizer,
        f"X_train_tfidf_{n}.joblib": X_train,
        f"X_test_tfidf_{n}.joblib": X_test,
    }
    for filename, payload in outputs.items():
        joblib.dump(payload, filename)

    print(f"✅ Saved TF-IDF vectorizer and matrices for max_features={n}")


✅ Saved TF-IDF vectorizer and matrices for max_features=75000
✅ Saved TF-IDF vectorizer and matrices for max_features=100000
✅ Saved TF-IDF vectorizer and matrices for max_features=150000
✅ Saved TF-IDF vectorizer and matrices for max_features=200000
✅ Saved TF-IDF vectorizer and matrices for max_features=250000
✅ Saved TF-IDF vectorizer and matrices for max_features=300000
✅ Saved TF-IDF vectorizer and matrices for max_features=350000
✅ Saved TF-IDF vectorizer and matrices for max_features=400000


In [13]:
# Unlimited features

# Same n-gram range and stop-word setting as the capped TF-IDF runs, but
# max_features is deliberately left unset so no vocabulary cap is applied.
full_tfidf_options = {"ngram_range": (1, 2), "stop_words": "english"}
vectorizer = TfidfVectorizer(**full_tfidf_options)

X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Number of columns in the fitted training matrix (427,290 in the recorded run)
print(f"Full feature count: {X_train.shape[1]}")

# Save under the "full" suffix, mirroring the capped runs' file naming
joblib.dump(vectorizer, "vectorizer_tfidf_full.joblib")
joblib.dump(X_train, "X_train_tfidf_full.joblib")
joblib.dump(X_test, "X_test_tfidf_full.joblib")


Full feature count: 427290


['X_test_tfidf_full.joblib']

In [8]:
# 6. Save the fitted label encoder
# This stores the LabelEncoder object itself (not the encoded label arrays),
# so integer predictions can later be mapped back to category names.
encoder_path = "label_encoder.joblib"
joblib.dump(label_encoder, encoder_path)


['label_encoder.joblib']