In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_features(df, *, test_size=0.2, random_state=42,
                     max_features=5000, ngram_range=(1, 2)):
    """Split a labelled text dataset and vectorize it with TF-IDF.

    The text is read from ``df['content']`` and the targets from
    ``df['label']``. The data is split into train/test partitions,
    stratified on the label, and a ``TfidfVectorizer`` is fitted on the
    training partition only and then applied to both partitions.

    Args:
        df: Table-like object indexable by ``'content'`` (raw text
            documents) and ``'label'`` (class labels).
        test_size: Passed to ``train_test_split``; the share of samples
            held out for testing. Defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to 42.
        max_features: Passed to ``TfidfVectorizer``; upper bound on the
            vocabulary size. Defaults to 5000.
        ngram_range: ``(min_n, max_n)`` tuple passed to
            ``TfidfVectorizer``. The default ``(1, 2)`` uses unigrams and
            bi-grams.

    Returns:
        A 5-tuple ``(X_train_tfidf, X_test_tfidf, y_train, y_test,
        tfidf_vectorizer)``: the vectorized train and test features, the
        matching label splits, and the fitted vectorizer (so the same
        vocabulary can be reused on new text).

    Raises:
        KeyError: If ``df`` has no ``'content'`` or ``'label'`` entry.
    """
    # --- Data Preparation ---
    X = df['content']
    y = df['label']

    # --- Train/Test Split ---
    # stratify=y keeps the class proportions the same in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f"Total Training Samples: {X_train.shape[0]}, Total Testing Samples: {X_test.shape[0]}")

    # --- TF-IDF Vectorization ---
    # The progress message is built from the actual settings so it cannot
    # drift from what the vectorizer is really configured with.
    min_n, max_n = ngram_range
    print(
        f"\nApplying TF-IDF Vectorization "
        f"(Max Features {max_features}, N-gram {min_n}-{max_n})..."
    )
    tfidf_vectorizer = TfidfVectorizer(
        max_features=max_features, ngram_range=ngram_range
    )

    # Fit on the training split only; the test split is merely transformed
    # so that no vocabulary/IDF statistics leak from test data.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    print("Feature extraction complete.")

    # Return the vectorized data splits and the fitted vectorizer object.
    return X_train_tfidf, X_test_tfidf, y_train, y_test, tfidf_vectorizer