In [30]:
import pandas as pd
import numpy as np
import re
import ast
import umap
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD

import warnings

# NOTE(review): this silences every warning raised after this point, not just
# known-noisy ones; consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

In [31]:
# Load the labelled training rows.
df = pd.read_csv("Original Data/Train.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
# Each `trainer` cell is parsed as a Python literal and only its first element
# is kept (presumably a stringified list of trainers - confirm against the CSV).
df["trainer"] = df["trainer"].apply(lambda x: ast.literal_eval(x)[0])
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13536 entries, 0 to 13535
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       13536 non-null  object
 1   farmer_name              13536 non-null  object
 2   training_day             13536 non-null  object
 3   gender                   13536 non-null  object
 4   registration             13536 non-null  object
 5   age                      13536 non-null  object
 6   group_name               13536 non-null  object
 7   belong_to_cooperative    13536 non-null  int64 
 8   county                   13536 non-null  object
 9   subcounty                13536 non-null  object
 10  ward                     13536 non-null  object
 11  adopted_within_07_days   13536 non-null  int64 
 12  adopted_within_90_days   13536 non-null  int64 
 13  adopted_within_120_days  13536 non-null  int64 
 14  has_topic_trained_on     13536 non-nul

In [32]:
# Load the additional (prior) rows that are later stacked under Train.csv.
prior_df = pd.read_csv("Original Data/Prior.csv")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" that print(prior_df.info()) would add.
prior_df.info()
print(f"\nOriginal columns: {prior_df.columns.tolist()}")
print(prior_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44882 entries, 0 to 44881
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       44882 non-null  object
 1   farmer_name              44882 non-null  object
 2   training_day             44882 non-null  object
 3   gender                   44882 non-null  object
 4   registration             44882 non-null  object
 5   age                      44882 non-null  object
 6   group_name               44882 non-null  object
 7   belong_to_cooperative    44882 non-null  int64 
 8   county                   44882 non-null  object
 9   subcounty                44882 non-null  object
 10  ward                     44882 non-null  object
 11  adopted_within_07_days   44882 non-null  int64 
 12  adopted_within_90_days   44882 non-null  int64 
 13  adopted_within_120_days  44882 non-null  int64 
 14  has_topic_trained_on     44882 non-nul

In [33]:
# Stack Train and Prior rows into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([df, prior_df], ignore_index=True)
# NOTE(review): `df` is rebound here (and again by the preprocessing cell), so
# this cell is not safe to re-run on its own after later cells have executed.
df = combined_df.copy()
# DataFrame.info() prints its own report and returns None; no print() needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58418 entries, 0 to 58417
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       58418 non-null  object
 1   farmer_name              58418 non-null  object
 2   training_day             58418 non-null  object
 3   gender                   58418 non-null  object
 4   registration             58418 non-null  object
 5   age                      58418 non-null  object
 6   group_name               58418 non-null  object
 7   belong_to_cooperative    58418 non-null  int64 
 8   county                   58418 non-null  object
 9   subcounty                58418 non-null  object
 10  ward                     58418 non-null  object
 11  adopted_within_07_days   58418 non-null  int64 
 12  adopted_within_90_days   58418 non-null  int64 
 13  adopted_within_120_days  58418 non-null  int64 
 14  has_topic_trained_on     58418 non-nul

In [34]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import pandas as pd
import ast

def preprocess_data(df):
    """Turn the raw combined frame into a numeric modelling frame.

    Steps, in order:
      1. Parse ``topics_list`` into clean topic names and multi-hot encode
         them into ``topic_<name>`` columns.
      2. Derive year / month / day-of-month / day-of-week from ``training_day``.
      3. Label-encode every object-typed feature column.
      4. Keep only the feature and target columns.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``topics_list``, ``training_day``, ``ID``,
        ``farmer_name`` and the three ``adopted_within_*`` target columns.

    Returns
    -------
    tuple
        ``(df, FEATURES, topic_columns, mlb, label_encoders, TARGETS)`` where
        ``df`` holds ``FEATURES + TARGETS`` only, ``mlb`` is the fitted
        MultiLabelBinarizer and ``label_encoders`` maps column name to its
        fitted LabelEncoder.
    """
    def clean_and_flat_topics(topic_str):
        """Parse a stringified, possibly nested, topic list into sorted unique names."""
        # Anything that is not a str (NaN included) carries no topics.
        if not isinstance(topic_str, str):
            return []

        try:
            parsed = ast.literal_eval(topic_str)
        except (ValueError, SyntaxError):
            return []

        flat_topics = []

        def flatten(item):
            # Only lists are descended into; only str leaves are collected.
            if isinstance(item, list):
                for sub in item:
                    flatten(sub)
            elif isinstance(item, str):
                flat_topics.append(item)

        flatten(parsed)
        return sorted({t.lower().strip() for t in flat_topics if t})

    # Work on a copy: the column assignment below would otherwise add
    # `clean_topics` to the caller's frame as a hidden side effect.
    df = df.copy()

    # --- Topics ---
    df['clean_topics'] = df['topics_list'].apply(clean_and_flat_topics)
    mlb = MultiLabelBinarizer()
    topics_encoded = mlb.fit_transform(df['clean_topics'])
    topic_columns = [f'topic_{t}' for t in mlb.classes_]
    topics_df = pd.DataFrame(topics_encoded, columns=topic_columns, index=df.index)
    df = pd.concat([df, topics_df], axis=1)

    # --- Date features ---
    df['training_day'] = pd.to_datetime(df['training_day'], dayfirst=True)
    df['training_year'] = df['training_day'].dt.year
    df['training_month'] = df['training_day'].dt.month
    df['training_day_number'] = df['training_day'].dt.day
    df['training_dayofweek'] = df['training_day'].dt.dayofweek

    # --- Targets & Features ---
    TARGETS = [
        'adopted_within_07_days',
        'adopted_within_90_days',
        'adopted_within_120_days',
    ]

    exclude_cols = ['ID', 'farmer_name', 'training_day', 'topics_list', 'clean_topics'] + TARGETS
    FEATURES = [c for c in df.columns if c not in exclude_cols]

    # --- Categorical encoding ---
    categorical_cols = [c for c in df.select_dtypes(include=["object"]).columns if c in FEATURES]
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        # NOTE(review): astype(str) already turns NaN into the string 'nan', so
        # the fillna("NA") is a no-op. It is kept as-is because the inference
        # cell applies the same expression and the labels must stay identical.
        df[col] = df[col].astype(str).fillna("NA")
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # --- Keep only features + targets ---
    df = df[FEATURES + TARGETS].copy()

    print(f"Total features: {len(FEATURES)}")
    print(f"  - Topic features: {len([f for f in FEATURES if f.startswith('topic_')])}")
    print(f"  - Other features: {len([f for f in FEATURES if not f.startswith('topic_')])}")
    print(f"Categorical columns encoded: {len(categorical_cols)}")
    print(f"Final dataframe shape: {df.shape}")

    # --- RETURN EVERYTHING NEEDED OUTSIDE ---
    return df, FEATURES, topic_columns, mlb, label_encoders, TARGETS

# --- Usage ---
# Run preprocessing once on the combined frame, then unpack the fitted
# artefacts that the training and inference cells read by name.
preprocessed = preprocess_data(combined_df)
df, FEATURES, topic_columns, mlb, label_encoders, TARGETS = preprocessed
print(df.head())


Total features: 163
  - Topic features: 149
  - Other features: 14
Categorical columns encoded: 8
Final dataframe shape: (58418, 166)
   gender  registration  age  group_name  belong_to_cooperative  county  \
0       0             0    0         942                      0       3   
1       0             0    0         956                      1       0   
2       0             0    0         972                      0       0   
3       0             0    0         600                      0       0   
4       0             1    1         942                      0       3   

   subcounty  ward  has_topic_trained_on  trainer  ...  \
0         16    40                     0        6  ...   
1         17     1                     1        4  ...   
2         17     1                     1        4  ...   
3         17     1                     1        4  ...   
4         16    40                     0        6  ...   

   topic_weed management in maize and beans  \
0                  

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer

# Download required NLTK data (only the corpora that are actually missing)
def _nltk_corpus_available(name):
    """Return True if corpus `name` is installed, either unpacked or as a .zip."""
    # nltk.download() may leave a corpus zipped; probing only the unpacked
    # path then raises LookupError and triggers a download on every run.
    for resource in (f'corpora/{name}', f'corpora/{name}.zip'):
        try:
            nltk.data.find(resource)
            return True
        except LookupError:
            continue
    return False


# Each corpus is checked on its own, so a present wordnet no longer hides a
# missing omw-1.4.
for corpus_name in ('wordnet', 'omw-1.4'):
    if not _nltk_corpus_available(corpus_name):
        nltk.download(corpus_name)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# --- TF-IDF Configuration (NO SVD!) ---
# Candidate vocabulary sizes compared for every prediction horizon.
tfidf_max_features_options = [30, 50, 75, 100, 150, 200]

# Targets: display name of each horizon -> label column
target_mapping = {
    '7 Days': 'adopted_within_07_days',
    '90 Days': 'adopted_within_90_days',
    '120 Days': 'adopted_within_120_days'
}

# Filled by the training loop and read by the inference cell.
models = {}
tfidf_vectorizers = {}
best_configs = {}
chained_features = []

# --- Custom stopwords for agricultural domain ---
custom_stopwords = list(set(ENGLISH_STOP_WORDS).union({
    'how', 'to', 'from', 'with', 'your', 'for', 'the', 'and', 'in', 'of', 'a', 'an',
    'day', 'old', 'care', 'using', 'about', 'on', 'at', 'by', 'after', 'before',
    'week', 'weeks', 'maturity', 'products', 'product', 'use', 'uses', 'used',
    'new', 'best', 'good', 'better', 'right', 'proper', 'important', 'importance'
}))

# --- Step 1: Convert binary topic features to cleaned and lemmatized text ---
print("Converting topic features to text with preprocessing and lemmatization...")

def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of `text`.

    Each word is first lemmatized as a noun; if that leaves it unchanged, the
    verb lemma is used instead. The words are re-joined with single spaces.
    """
    def _lemma(token):
        as_noun = lemmatizer.lemmatize(token, pos='n')
        if as_noun != token:
            return as_noun
        # The noun lookup changed nothing, so fall back to the verb form.
        return lemmatizer.lemmatize(token, pos='v')

    return ' '.join(_lemma(token) for token in text.split())

def topics_to_text(row):
    """Build one lemmatized text string from the topic columns set to 1 in `row`.

    Reads the module-level `topic_columns` list. Returns the literal
    'no_topics' when none of those columns equals 1.
    """
    selected = [name.replace('topic_', '') for name in topic_columns if row[name] == 1]
    if len(selected) == 0:
        return 'no_topics'

    joined = ' '.join(selected).lower()
    # Parentheses are removed; hyphens and underscores become spaces.
    cleanup_table = str.maketrans({'(': None, ')': None, '-': ' ', '_': ' '})
    return lemmatize_text(joined.translate(cleanup_table))

# One text document per row, built from that row's active topic columns.
topic_text_series = df[topic_columns].apply(topics_to_text, axis=1)
df['topic_text'] = topic_text_series

print(f"\nSample topic texts (after cleaning and lemmatization):")
for position, sample in enumerate(df['topic_text'].head(5), start=1):
    # Long documents are cut to 100 characters for display only.
    shown = sample if len(sample) <= 100 else sample[:100] + '...'
    print(f"  {position}. {shown}")

print(f"\nTopic text statistics:")
topic_lengths = df['topic_text'].str.split().str.len()
shortest, longest, average = topic_lengths.min(), topic_lengths.max(), topic_lengths.mean()
print(f"  Words per farmer: min={shortest}, max={longest}, mean={average:.1f}")
no_topic_rows = (df['topic_text'] == 'no_topics').sum()
print(f"  Farmers with no topics: {no_topic_rows}")

# --- Get non-topic features for base model ---
base_features = [name for name in FEATURES if not name.startswith('topic_')]
print(f"\nBase features (non-topic): {len(base_features)}")
print(f"Topic features: {len(topic_columns)}")

# --- Step 2: Train models with different TF-IDF configurations (NO SVD) ---
# Working copy whose target columns are later overwritten with predictions.
df_predictions = df.copy()

# StratifiedKFold is only needed for the out-of-fold chaining below; importing
# it here keeps this block self-contained.
from sklearn.model_selection import StratifiedKFold

# Number of folds used to build the out-of-fold probabilities that are chained
# into the next horizon's feature set.
N_CHAIN_FOLDS = 5


def build_topic_tfidf(max_features):
    """Return an unfitted TF-IDF vectorizer for the lemmatized topic text."""
    return TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.85,
        sublinear_tf=True,
        stop_words=custom_stopwords,
        lowercase=True,
        token_pattern=r'\b[a-z]{3,}\b'
    )


def build_calibrated_forest():
    """Return an unfitted, sigmoid-calibrated random forest (3-fold calibration)."""
    forest = RandomForestClassifier(
        n_estimators=800,
        max_features="sqrt",
        min_samples_leaf=3,
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=-1
    )
    return CalibratedClassifierCV(estimator=forest, method="sigmoid", cv=3)


def out_of_fold_probabilities(base_matrix, topic_text, labels, max_features,
                              n_splits=N_CHAIN_FOLDS):
    """Return positive-class probabilities where no row is scored by a model that saw it.

    For each stratified fold, a fresh TF-IDF vectorizer and calibrated forest
    are fitted on the other folds and used to score the held-out fold.

    Parameters
    ----------
    base_matrix : array-like, shape (n_rows, n_base_features)
    topic_text : sequence of str, length n_rows
    labels : array-like of 0/1, length n_rows
    max_features : int
        TF-IDF vocabulary cap to use in every fold.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
    """
    base_matrix = np.asarray(base_matrix)
    text_values = np.asarray(topic_text, dtype=object)
    label_values = np.asarray(labels)
    oof = np.full(len(label_values), np.nan)

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fit_idx, holdout_idx in folds.split(base_matrix, label_values):
        fold_tfidf = build_topic_tfidf(max_features)
        fit_text = fold_tfidf.fit_transform(text_values[fit_idx]).toarray()
        holdout_text = fold_tfidf.transform(text_values[holdout_idx]).toarray()

        fold_model = build_calibrated_forest()
        fold_model.fit(np.hstack([base_matrix[fit_idx], fit_text]), label_values[fit_idx])
        oof[holdout_idx] = fold_model.predict_proba(
            np.hstack([base_matrix[holdout_idx], holdout_text])
        )[:, 1]

    return oof


for period, target in target_mapping.items():
    print(f"\n{'='*70} Training for {period} {'='*70}")

    # Include previous PREDICTED probabilities as features
    X_full = df_predictions[base_features + chained_features].copy()
    y = df[target]

    # Split train/test for evaluation
    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X_full, y, test_size=0.2, random_state=42, stratify=y
    )

    # Get topic text for train/test
    train_topic_text = df.loc[X_train_base.index, 'topic_text']
    test_topic_text = df.loc[X_test_base.index, 'topic_text']

    # --- Evaluate different TF-IDF configurations (NO SVD COMPRESSION) ---
    results = []
    trained_models = []

    for max_features in tfidf_max_features_options:
        print(f"\n--- TF-IDF max_features={max_features} (NO SVD compression) ---")

        # Fit TF-IDF on training data only
        tfidf = build_topic_tfidf(max_features)
        X_train_tfidf = tfidf.fit_transform(train_topic_text)
        X_test_tfidf = tfidf.transform(test_topic_text)

        print(f"  TF-IDF shape: {X_train_tfidf.shape}")
        print(f"  Sparsity: {(1 - X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1])):.2%}")
        # get_feature_names_out() is in vocabulary (alphabetical) order, so
        # this is a sample of the vocabulary rather than a ranking.
        print(f"  First features (alphabetical): {list(tfidf.get_feature_names_out())[:10]}")

        # Convert sparse matrix to dense
        X_train_tfidf_dense = X_train_tfidf.toarray()
        X_test_tfidf_dense = X_test_tfidf.toarray()

        # Combine base features + TF-IDF features
        X_train_final = np.hstack([X_train_base.values, X_train_tfidf_dense])
        X_test_final = np.hstack([X_test_base.values, X_test_tfidf_dense])

        print(f"  Final feature count: {X_train_final.shape[1]} (base: {X_train_base.shape[1]}, tfidf: {X_train_tfidf_dense.shape[1]})")

        # Train calibrated RandomForest
        calibrated_rf = build_calibrated_forest()

        print(f"  Training model...", end=' ')
        calibrated_rf.fit(X_train_final, y_train)

        # Evaluate
        y_pred_proba = calibrated_rf.predict_proba(X_test_final)[:, 1]
        test_auc = roc_auc_score(y_test, y_pred_proba)
        test_logloss = log_loss(y_test, y_pred_proba)

        print(f"AUC={test_auc:.4f}, LogLoss={test_logloss:.4f}")

        results.append({
            'max_features': max_features,
            'test_auc': test_auc,
            'test_logloss': test_logloss,
            'n_tfidf_features': X_train_tfidf_dense.shape[1]
        })

        trained_models.append({
            'max_features': max_features,
            'model': calibrated_rf,
            'tfidf': tfidf
        })

    # --- Display results and select best based on log loss ---
    results_df = pd.DataFrame(results)
    print("\n" + "="*80)
    print("TF-IDF Configuration Comparison (NO SVD):")
    print(results_df.sort_values('test_logloss').to_string(index=False))
    print("="*80)

    # Select best based on log loss
    # NOTE(review): the configuration is chosen on the same hold-out split that
    # is reported below, so the reported metrics are optimistically biased.
    best_idx = results_df['test_logloss'].idxmin()
    best_result = results_df.loc[best_idx]

    print(f"\n✓ Selected Configuration:")
    print(f"  TF-IDF max_features: {int(best_result['max_features'])}")
    print(f"  Actual TF-IDF features created: {int(best_result['n_tfidf_features'])}")
    print(f"  Test AUC: {best_result['test_auc']:.4f}")
    print(f"  Test Log Loss: {best_result['test_logloss']:.4f}")

    # Store the best model and transformers
    best_model_info = trained_models[best_idx]
    models[period] = best_model_info['model']
    tfidf_vectorizers[period] = best_model_info['tfidf']
    best_configs[period] = {
        'max_features': int(best_result['max_features'])
    }

    # --- Final evaluation with best model ---
    tfidf = best_model_info['tfidf']

    # Recreate the TF-IDF features for the best model
    X_train_tfidf_best = tfidf.transform(train_topic_text).toarray()
    X_test_tfidf_best = tfidf.transform(test_topic_text).toarray()

    # Recreate final features
    X_train_final_best = np.hstack([X_train_base.values, X_train_tfidf_best])
    X_test_final_best = np.hstack([X_test_base.values, X_test_tfidf_best])

    print(f"\n{'='*30} Final Test Results {'='*30}")
    y_pred = best_model_info['model'].predict(X_test_final_best)
    y_pred_proba = best_model_info['model'].predict_proba(X_test_final_best)[:, 1]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print(f"\nTest AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(f"Test Log Loss: {log_loss(y_test, y_pred_proba):.4f}")

    # Show top TF-IDF features by feature importance
    print(f"\nTop 20 most important TF-IDF features (by Random Forest importance):")

    # Use the recreated arrays to get correct dimensions
    actual_n_base = X_train_base.shape[1]
    actual_n_tfidf = X_train_tfidf_best.shape[1]

    # Importances come from the forest of the first calibration fold only.
    feature_importances = best_model_info['model'].calibrated_classifiers_[0].estimator.feature_importances_

    # Extract just the TF-IDF portion using actual array dimensions
    tfidf_importances = feature_importances[actual_n_base:actual_n_base + actual_n_tfidf]
    tfidf_feature_names = tfidf.get_feature_names_out()

    # Verify lengths match
    if len(tfidf_importances) != len(tfidf_feature_names):
        print(f"  WARNING: Mismatch - {len(tfidf_importances)} importances vs {len(tfidf_feature_names)} names")
        print(f"  Skipping feature importance display.")
    else:
        # Show top features
        n_features_to_show = min(20, len(tfidf_feature_names))
        top_indices = tfidf_importances.argsort()[-n_features_to_show:][::-1]

        for idx in top_indices:
            print(f"  - {tfidf_feature_names[idx]}: {tfidf_importances[idx]:.6f}")

    # --- Generate OUT-OF-FOLD predictions on the full dataset for the next period ---
    # Scoring the full dataset with the model stored above would give in-sample
    # probabilities for its own training rows. Those leak this horizon's labels
    # into the next horizon and do not resemble the out-of-sample probabilities
    # chained at inference time. Out-of-fold scoring avoids both problems.
    # NOTE(review): fold models still see this horizon's labels of rows that
    # land in the next horizon's hold-out split; nest the folds if the hold-out
    # metrics must be fully unbiased.
    print(f"\nGenerating out-of-fold predicted probabilities on full dataset for chaining...")

    full_predictions = out_of_fold_probabilities(
        X_full.values,
        df['topic_text'],
        y,
        best_configs[period]['max_features'],
    )

    df_predictions[target] = full_predictions
    chained_features.append(target)

    print(f"  Added '{target}' predictions to feature set")
    print(f"  Prediction stats: min={full_predictions.min():.4f}, max={full_predictions.max():.4f}, mean={full_predictions.mean():.4f}")

print("\n" + "="*80)
print("Training Complete! Stored:")
print(f"  models: {list(models.keys())}")
print(f"  tfidf_vectorizers: {list(tfidf_vectorizers.keys())}")
print(f"  best_configs: {best_configs}")
print(f"  chained_features: {chained_features}")
print("="*80)

[nltk_data] Downloading package wordnet to /home/nzioka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nzioka/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Converting topic features to text with preprocessing and lemmatization...

Sample topic texts (after cleaning and lemmatization):
  1. ndume app poultry feed
  2. poultry house
  3. asili fertilizer organic biosecurity in poultry farm calf feed dairy health management dairy nutriti...
  4. poultry house poultry product record keep in dairy
  5. ndume app poultry feed

Topic text statistics:
  Words per farmer: min=2, max=161, mean=6.8
  Farmers with no topics: 0

Creating engineered features from topics...

Engineered features created:
  Livestock topics found: 86
  Crop topics found: 21
  Business topics found: 43
  Health topics found: 40
  Feed topics found: 27

Total engineered features: 26

Sample engineered features:
   topic_count  topic_diversity  livestock_count  crop_count  business_count  \
0            2         0.013423                1           0               0   
1            1         0.006711                1           0               0   
2           29         0.19

In [36]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, log_loss

# target_mapping = {
#     "7 Days": "adopted_within_07_days",
#     "90 Days": "adopted_within_90_days",
#     "120 Days": "adopted_within_120_days"
# }

# models = {}

# # For each horizon
# for period, target in target_mapping.items():
#     print(f"\n{'='*20} Training XGBoost for {period} {'='*20}")

#     X = df[FEATURES].copy()
#     y = df[target]

#     # Add predictions from previous horizons as features
#     for prev_period, prev_target in list(target_mapping.items()):
#         if prev_period == period:
#             break
#         X[f"pred_{prev_target}"] = df[prev_target]  # Using actual labels for training

#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.2, stratify=y, random_state=42
#     )

#     xgb_model = XGBClassifier(
#         n_estimators=1000,
#         learning_rate=0.03,
#         max_depth=6,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         eval_metric="logloss",
#         use_label_encoder=False,
#         random_state=42,
#         n_jobs=-1
#     )

#     xgb_model.fit(
#         X_train, y_train,
#         eval_set=[(X_test, y_test)],
#         early_stopping_rounds=50,
#         verbose=False
#     )

#     models[period] = xgb_model

#     y_pred = xgb_model.predict(X_test)
#     y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

#     print("\nClassification Report:")
#     print(classification_report(y_test, y_pred))
#     print("Confusion Matrix:")
#     print(confusion_matrix(y_test, y_pred))
#     print(f"AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
#     print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")


In [None]:
import pandas as pd
import ast
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Initialize lemmatizer: a fresh WordNetLemmatizer bound to the module-level
# name that lemmatize_text (defined below in this cell) reads.
lemmatizer = WordNetLemmatizer()

# -------------------------
# Functions (MUST MATCH TRAINING)
# -------------------------
def clean_and_flat_topics(topic_str):
    """Parse a stringified, possibly nested, topic list into clean topic names.

    Returns a sorted list of unique, lower-cased, stripped strings. Non-string
    input (NaN included) and strings that are not valid Python literals give
    []. Only lists are descended into; non-string leaves are ignored.
    """
    # A str is never NaN, so the type check alone covers missing values.
    if not isinstance(topic_str, str):
        return []

    try:
        literal = ast.literal_eval(topic_str)
    except (ValueError, SyntaxError):
        return []

    def iter_strings(node):
        """Yield every str found in `node`, walking nested lists depth-first."""
        if isinstance(node, str):
            yield node
        elif isinstance(node, list):
            for child in node:
                yield from iter_strings(child)

    # Empty strings are dropped before normalisation, as in the training cell.
    unique_names = {name.lower().strip() for name in iter_strings(literal) if name}
    return sorted(unique_names)

def lemmatize_text(text):
    """Return `text` with each whitespace-separated word replaced by its lemma.

    The noun lemma is tried first; the verb lemma is used only when the noun
    lookup returns the word unchanged. Output words are joined by one space.
    """
    lemmas = []
    for token in text.split():
        candidate = lemmatizer.lemmatize(token, pos='n')
        # Unchanged by the noun lookup: try the verb form instead.
        lemmas.append(candidate if candidate != token else lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

def topics_to_text(row):
    """Turn the topic columns equal to 1 in `row` into one lemmatized string.

    Column names come from the module-level `topic_columns`. When no column
    is active the placeholder 'no_topics' is returned.
    """
    names = [column.replace('topic_', '') for column in topic_columns if row[column] == 1]
    if not names:
        return 'no_topics'

    document = ' '.join(names).lower()
    # Drop parentheses, then turn hyphens and underscores into spaces.
    for old, new in (('(', ''), (')', ''), ('-', ' '), ('_', ' ')):
        document = document.replace(old, new)
    return lemmatize_text(document)

# -------------------------
# Load test data
# -------------------------
test_df = pd.read_csv("Original Data/Test.csv")

# -------------------------
# Preprocess topics
# -------------------------
test_df['clean_topics'] = test_df['topics_list'].apply(clean_and_flat_topics)

# Reuse the binarizer fitted during training so the columns line up exactly.
topics_encoded = mlb.transform(test_df['clean_topics'])
topic_columns_list = [f'topic_{t}' for t in mlb.classes_]
topics_df = pd.DataFrame(topics_encoded, columns=topic_columns_list, index=test_df.index)
test_df = pd.concat([test_df, topics_df], axis=1)

test_df['topic_text'] = test_df[topic_columns].apply(topics_to_text, axis=1)

print(f"\nTest data topic text samples (lemmatized):")
for i in range(min(3, len(test_df))):
    text = test_df['topic_text'].iloc[i]
    preview = text[:100] + '...' if len(text) > 100 else text
    print(f"  {i+1}. {preview}")

# -------------------------
# Date features
# -------------------------
if 'training_day' in test_df.columns:
    test_df['training_day'] = pd.to_datetime(test_df['training_day'], dayfirst=True)
    test_df['training_year'] = test_df['training_day'].dt.year
    test_df['training_month'] = test_df['training_day'].dt.month
    test_df['training_day_number'] = test_df['training_day'].dt.day
    test_df['training_dayofweek'] = test_df['training_day'].dt.dayofweek

# -------------------------
# Encode categorical features safely
# -------------------------
def _first_listed_trainer(value):
    """Mirror the Train.csv `trainer` parsing: "['A', 'B']" -> 'A'.

    Values that are not a stringified non-empty list/tuple are returned as-is.
    """
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value
    if isinstance(parsed, (list, tuple)) and parsed:
        return str(parsed[0])
    return value


base_features = [f for f in FEATURES if not f.startswith('topic_')]

for col in base_features:
    if col not in test_df.columns:
        continue

    if col in label_encoders:
        le = label_encoders[col]
        # A set gives O(1) membership instead of scanning the numpy array per row.
        known_labels = set(le.classes_)

        # Same string conversion as in training (NaN becomes the string 'nan';
        # the fillna is a no-op kept only to mirror the training expression).
        values = test_df[col].astype(str).fillna("NA")

        if col == "trainer":
            # Train.csv trainers were reduced to their first list element
            # before encoding. Apply the same reduction here, but only to
            # values that are not already a known label.
            values = values.map(
                lambda s: s if s in known_labels else _first_listed_trainer(s)
            )

        # Categories never seen in training fall back to the encoder's first
        # class, as before, but the fallback is now reported.
        unseen_mask = ~values.isin(known_labels)
        if unseen_mask.any():
            print(f"  {col}: {int(unseen_mask.sum())} unseen value(s) mapped to fallback label {le.classes_[0]!r}")
        values = values.where(~unseen_mask, le.classes_[0])

        test_df[col] = le.transform(values)
    elif test_df[col].dtype == "object":
        # Text column with no fitted encoder: replace with a constant.
        test_df[col] = 0

# Features the test file does not provide at all are filled with 0.
for feat in base_features:
    if feat not in test_df.columns:
        test_df[feat] = 0

X_test_base = test_df[base_features].copy()

# -------------------------
# Prepare submission
# -------------------------
submission = pd.DataFrame()
submission["ID"] = test_df["ID"] if "ID" in test_df.columns else range(1, len(test_df) + 1)

# Both submission columns of a horizon receive the same probabilities.
submission_mapping = {
    "7 Days": ["Target_07_AUC", "Target_07_LogLoss"],
    "90 Days": ["Target_90_AUC", "Target_90_LogLoss"],
    "120 Days": ["Target_120_AUC", "Target_120_LogLoss"]
}

# -------------------------
# Generate chained predictions
# -------------------------
print("\nGenerating chained predictions with TF-IDF (NO SVD, using predicted probabilities)...")

# The order must match training: each horizon's probability column is appended
# to the base features before the next horizon is scored.
for period, target in [
    ("7 Days", "adopted_within_07_days"),
    ("90 Days", "adopted_within_90_days"),
    ("120 Days", "adopted_within_120_days")
]:
    print(f"\nPredicting for {period}...")

    model = models[period]
    tfidf = tfidf_vectorizers[period]
    config = best_configs[period]

    print(f"  Using TF-IDF (max_features={config['max_features']}, NO SVD compression)")

    # Transform test data
    X_test_tfidf = tfidf.transform(test_df['topic_text']).toarray()

    # Combine base features + TF-IDF features
    X_test_final = np.hstack([X_test_base.values, X_test_tfidf])

    # Predict probabilities
    probs = model.predict_proba(X_test_final)[:, 1]

    print(f"  Predicted probabilities: min={probs.min():.4f}, max={probs.max():.4f}, mean={probs.mean():.4f}")

    # Fill submission columns
    for col in submission_mapping[period]:
        submission[col] = probs

    # Append PREDICTED probability as feature for next horizon
    X_test_base[target] = probs

# -------------------------
# Save submission
# -------------------------
submission.to_csv("submission_tfidf_no_svd.csv", index=False)
print("\nDone! Submission saved to submission_tfidf_no_svd.csv")
print(submission.head())


Creating engineered features for test data...
Engineered features created for test data!

Generating chained predictions with TF-IDF + Engineered Features...

Predicting for 7 Days...
  Using TF-IDF (max_features=100) + 26 engineered features
  Predicted probabilities: min=0.0022, max=0.2060, mean=0.0047

Predicting for 90 Days...
  Using TF-IDF (max_features=100) + 26 engineered features
  Predicted probabilities: min=0.0046, max=0.2700, mean=0.0081

Predicting for 120 Days...
  Using TF-IDF (max_features=100) + 26 engineered features
  Predicted probabilities: min=0.0068, max=0.3273, mean=0.0098

Done! Submission saved to submission_tfidf_engineered.csv
          ID  Target_07_AUC  Target_07_LogLoss  Target_90_AUC  \
0  ID_LEG1GM       0.002818           0.002818       0.005615   
1  ID_1UKOKW       0.002413           0.002413       0.005454   
2  ID_U5H2YK       0.006509           0.006509       0.008666   
3  ID_55957A       0.005612           0.005612       0.007668   
4  ID_N1AC