In [2]:
# =============================
# Core Libraries
# =============================
import json
import re
import string
import warnings

import joblib
import numpy as np
import pandas as pd

# NOTE(review): with no category/module filter this silences every warning for the
# whole kernel session, including deprecation warnings raised by the cells below.
warnings.filterwarnings("ignore")

# =============================
# Visualization
# =============================
import matplotlib.pyplot as plt

%matplotlib inline

# =============================
# NLP (NLTK)
# =============================
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# NLTK resources needed by the preprocessing cells:
#   stopwords        -> stopwords.words('english')
#   punkt, punkt_tab -> sent_tokenize / word_tokenize (recent NLTK releases load
#                       "punkt_tab"; older ones load "punkt", so fetch both)
#   wordnet          -> WordNetLemmatizer
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

# =============================
# Scikit-learn: Preprocessing
# =============================
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MinMaxScaler, StandardScaler

# =============================
# Feature Extraction
# =============================
from sklearn.feature_extraction.text import TfidfVectorizer



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zakar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zakar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zakar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Fetch the "omw-1.4" NLTK package (Open Multilingual WordNet).
# NOTE(review): presumably needed alongside "wordnet" for WordNetLemmatizer on this
# NLTK version — confirm, and consider moving it next to the other downloads.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\zakar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Read the training split from disk; the parsed JSON is kept as-is in `train_data`.
with open("news_multilabel_train.json", encoding="utf-8") as train_file:
    train_data = json.load(train_file)

In [5]:
# Read the validation split from disk; the parsed JSON is kept as-is in `val_data`.
with open("news_multilabel_validation.json", encoding="utf-8") as val_file:
    val_data = json.load(val_file)

In [6]:
# Read the test split from disk; the parsed JSON is kept as-is in `test_data`.
with open("news_multilabel_test.json", encoding="utf-8") as test_file:
    test_data = json.load(test_file)

In [None]:
# ============================================================================
# TEXT PREPROCESSING FUNCTIONS
# ============================================================================
# Stop-word list used by preprocessing_text: NLTK's English list plus extra
# high-frequency filler words.
# The filter compares one token at a time, so only single-word entries can ever
# match; the former two-word entry 'year old' was dead and has been dropped
# ('year' and 'old' are both listed individually). The duplicate 'make' is also gone.
default_stopwords = stopwords.words('english') + [
    'said', 'would', 'even', 'according', 'could', 'year',
    'years', 'also', 'new', 'people', 'old', 'one', 'two', 'time',
    'first', 'last', 'say', 'make', 'best', 'get', 'three',
    'told', 'made', 'like', 'take', 'many', 'set', 'number',
    'month', 'week', 'well', 'back',
]
# Raw strings keep the regex escapes (\d, \[, \], \|) out of Python's own string-escape
# processing; the non-raw originals relied on invalid escape sequences, which newer
# Python versions warn about. The compiled patterns are unchanged.

# Anything that is not an ASCII letter, a comma or a digit.
BAD_SYMBOLS_RE = re.compile(r"[^a-zA-Z,\d]")
# Dotted-quad shapes such as 192.168.0.1 (octet ranges are not validated).
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
# Separator punctuation that is turned into a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')

# Per-token scrub pattern: every ASCII punctuation character except '-', plus the digits 0-9.
_TOKEN_JUNK_RE = re.compile('[{}]'.format(re.escape(string.punctuation.replace('-', '') + '0123456789')))
_URL_RE = re.compile(r'http\S+')
_DIGIT_RUN_RE = re.compile(r'\d+')
# Single-character substitutions applied while cleaning: newline, non-breaking space,
# hyphen and apostrophe become spaces; three accented letters are folded to ASCII.
_CHAR_FOLD_TABLE = str.maketrans({
    '\n': ' ', '\xa0': ' ', '-': ' ', "'": ' ',
    'ó': 'o', 'ğ': 'g', 'á': 'a',
})


def _tokenize_text(text):
    """Split text into NLTK word tokens, keeping only tokens of 3+ characters."""
    return [w for s in sent_tokenize(text) for w in word_tokenize(s) if len(w) >= 3]


def _clean_text(text):
    """Lower-case text and strip URLs, IP addresses, digits and symbols; keep words longer than 3 chars."""
    text = text.lower()
    # URLs and IP addresses have to be removed while they are still intact. Stripping
    # digits or replacing hyphens first would destroy what these patterns match,
    # leaving the IP pattern unable to match and URL fragments behind as words.
    text = _URL_RE.sub(' ', text)
    text = REPLACE_IP_ADDRESS.sub(' ', text)
    text = text.translate(_CHAR_FOLD_TABLE)
    text = _DIGIT_RUN_RE.sub('', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    return ' '.join(word for word in text.split() if len(word) > 3)


def preprocessing_text(text):
    """Normalise one raw document into a space-separated string of cleaned tokens.

    Pipeline, in order:
      1. strip surrounding spaces and run ``_clean_text`` (lower-casing, URL/IP/digit/
         symbol removal, words of 3 or fewer characters dropped);
      2. tokenize and strip punctuation/digits from each token;
      3. tokenize and lemmatize each token with ``WordNetLemmatizer``;
      4. tokenize and drop tokens present in ``default_stopwords``.
    Every tokenization pass keeps only tokens of at least 3 characters.

    Args:
        text (str): raw document text.

    Returns:
        str: the processed tokens joined by single spaces ('' if nothing survives).
    """
    # Set membership is O(1) per token; scanning the stop-word list for every token is not.
    stop_words = frozenset(default_stopwords)
    lemmatizer = WordNetLemmatizer()

    text = _clean_text(text.strip(' '))

    # Each stage re-tokenizes the joined output of the previous one, exactly as the
    # original nested helpers did, so the 3+ character filter is applied after every step.
    scrubbed = (_TOKEN_JUNK_RE.sub('', token) for token in _tokenize_text(text))
    text = ' '.join(filter(None, scrubbed))
    text = ' '.join(lemmatizer.lemmatize(token) for token in _tokenize_text(text))
    return ' '.join(token for token in _tokenize_text(text) if token not in stop_words)

# ============================================================================
# PREPROCESS TRAIN DATA
# ============================================================================
print("\n" + "=" * 60, "PREPROCESSING TRAIN DATA", "=" * 60, sep="\n")

# Clean every training document and collect its labels in the same record order.
train_texts = [preprocessing_text(datum["text"]) for datum in train_data]
train_labels = [datum["labels"] for datum in train_data]

print(f"Train samples: {len(train_texts)}")

# ============================================================================
# PREPROCESS VALIDATION DATA
# ============================================================================
print("\n" + "=" * 60, "PREPROCESSING VALIDATION DATA", "=" * 60, sep="\n")

# Clean every validation document and collect its labels in the same record order.
val_texts = [preprocessing_text(datum["text"]) for datum in val_data]
val_labels = [datum["labels"] for datum in val_data]

print(f"Validation samples: {len(val_texts)}")

# ============================================================================
# TF-IDF VECTORIZATION
# ============================================================================
print("\n" + "=" * 60, "TF-IDF VECTORIZATION", "=" * 60, sep="\n")

# Unigram, L2-normalised TF-IDF capped at 8192 terms. stop_words='english' applies
# scikit-learn's built-in English list in addition to the stop-word filtering already
# done inside preprocessing_text.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    lowercase=True,
    stop_words='english',
    max_features=8192,
    norm='l2',
)

# The vocabulary and IDF weights are learned from the training texts only;
# the validation texts are projected onto that fitted vocabulary.
X_train = tfidf_vectorizer.fit_transform(train_texts)
X_val = tfidf_vectorizer.transform(val_texts)

# Label lists -> integer arrays.
y_train, y_val = (np.array(labels, dtype=int) for labels in (train_labels, val_labels))

print(
    f"\nVocabulary size: {len(tfidf_vectorizer.vocabulary_)}",
    f"Train feature matrix shape: {X_train.shape}",
    f"Val feature matrix shape: {X_val.shape}",
    f"Train labels shape: {y_train.shape}",
    f"Val labels shape: {y_val.shape}",
    sep="\n",
)



PREPROCESSING TRAIN DATA
Train samples: 55000

PREPROCESSING VALIDATION DATA
Validation samples: 5000

TF-IDF VECTORIZATION

Vocabulary size: 8192
Train feature matrix shape: (55000, 8192)
Val feature matrix shape: (5000, 8192)
Train labels shape: (55000, 14)
Val labels shape: (5000, 14)


In [12]:
# Clean the test split with the same pipeline used for train/validation.
test_texts = [preprocessing_text(datum["text"]) for datum in test_data]
test_labels = [datum["labels"] for datum in test_data]

print(f"\nTest samples: {len(test_texts)}")

# Transform only: reuse the vocabulary/IDF weights already fitted on the training texts.
X_test = tfidf_vectorizer.transform(test_texts)
print(f"Test feature matrix shape: {X_test.shape}")
y_test = np.array(test_labels, dtype=int)



Test samples: 5000
Test feature matrix shape: (5000, 8192)


In [10]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.sparse import issparse

print("=" * 60, "TruncatedSVD Feature Selection for Sparse Data", "=" * 60, sep="\n")

# ----------------------------------------------------------------------------
# TruncatedSVD projection of the TF-IDF matrix.
# Note: despite the banner wording, this step does not pick a subset of the original
# TF-IDF columns; it projects them onto n_components new dimensions.
# ----------------------------------------------------------------------------
print("\n" + "=" * 60, "Method 1: TruncatedSVD Feature Importance", "=" * 60, sep="\n")

# Output dimensionality of the projection.
n_components = 1024
svd = TruncatedSVD(
    n_components=n_components,
    algorithm='randomized',
    n_iter=10,
    random_state=42,  # fixed seed for the randomized solver
)

print(f"\nFitting TruncatedSVD with {n_components} components...")
# Fit on the training matrix only, then reuse the fitted projection for validation.
X_train_transformed = svd.fit_transform(X_train)

print(
    f"Transformed shape: {X_train_transformed.shape}",
    f"Explained variance ratio: {svd.explained_variance_ratio_.sum():.4f}",
    sep="\n",
)

X_val_transformed = svd.transform(X_val)
print(f"✓ Validation data transformed: {X_val_transformed.shape}")

TruncatedSVD Feature Selection for Sparse Data

Method 1: TruncatedSVD Feature Importance

Fitting TruncatedSVD with 1024 components...
Transformed shape: (55000, 1024)
Explained variance ratio: 0.8323
✓ Validation data transformed: (5000, 1024)


In [13]:
# Project the test TF-IDF matrix with the already-fitted SVD (transform only, no refit).
X_test_transformed = svd.transform(X_test)
print(f"✓ Test data transformed: {X_test_transformed.shape}")

✓ Test data transformed: (5000, 1024)


In [14]:
import pandas as pd

# SVD-reduced training features as a DataFrame (default integer column names).
X_train_df = pd.DataFrame(data=X_train_transformed)

# One column per label position: label_0, label_1, ...
train_label_columns = [f"label_{idx}" for idx in range(y_train.shape[1])]
y_train_df = pd.DataFrame(data=y_train, columns=train_label_columns)

# Write features and labels to separate CSV files, without the row index.
X_train_df.to_csv("X_train.csv", index=False)
y_train_df.to_csv("y_train.csv", index=False)


In [15]:
import pandas as pd

# SVD-reduced validation features as a DataFrame (default integer column names).
X_val_df = pd.DataFrame(X_val_transformed)

# Labels: one column per label position. The column count is taken from y_val itself
# (not y_train), so this cell depends only on the validation arrays, matching how the
# train and test export cells name their columns.
y_val_df = pd.DataFrame(y_val, columns=[f"label_{i}" for i in range(y_val.shape[1])])

# Save CSVs without the row index.
X_val_df.to_csv("X_val.csv", index=False)
y_val_df.to_csv("y_val.csv", index=False)


In [None]:
import pandas as pd

# SVD-reduced test features as a DataFrame (default integer column names).
X_test_df = pd.DataFrame(data=X_test_transformed)

# One column per label position: label_0, label_1, ...
test_label_columns = [f"label_{idx}" for idx in range(y_test.shape[1])]
y_test_df = pd.DataFrame(data=y_test, columns=test_label_columns)

# Write features and labels to separate CSV files, without the row index.
X_test_df.to_csv("X_test.csv", index=False)
y_test_df.to_csv("y_test.csv", index=False)



In [20]:
# Persist the fitted TF-IDF vectorizer and the fitted SVD to disk.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(tfidf_vectorizer, "tfidf.joblib")
# Last expression in the cell, so the notebook displays this call's return value.
joblib.dump(svd, "svd.joblib")

['svd.joblib']