In [7]:
import pandas as pd
import numpy as np  
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss,confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

import warnings

warnings.filterwarnings("ignore")

In [8]:
df = pd.read_csv("Original Data/Train.csv")


In [9]:
df["days_to_second_training"] = df["days_to_second_training"].fillna(-1)

In [10]:
def parse_topics(topic_str):
    if not isinstance(topic_str, str):
        return []
    try:
        raw_list = ast.literal_eval(topic_str)
        final_topics = []
        for item in raw_list:
            if ',' in item: # Handle "Topic A,Topic B" inside one string
                split_items = [x.strip() for x in item.split(',')]
                final_topics.extend(split_items)
            else:
                final_topics.append(item.strip())
        return final_topics
    except:
        return []

df['clean_topics'] = df['topics_list'].apply(parse_topics)
mlb = MultiLabelBinarizer()
topics_encoded = mlb.fit_transform(df['clean_topics'])
topics_df = pd.DataFrame(topics_encoded, columns=mlb.classes_, index=df.index)
df = pd.concat([df, topics_df], axis=1)
print(f"Created {len(mlb.classes_)} new topic features.")
print("Sample columns:", list(mlb.classes_)[:5])

Created 157 new topic features.
Sample columns: ['Advantages Of A.I /Disadvantages Of Natural Mating', 'Aflatoxin In Dairy Farming', 'Aflatoxin Mitigation In Dairy Herds', 'Aflatoxin Mitigation Through Good Agricultural Practices', 'Animal Management Practices']


In [11]:
# 1. Define specific features and targets
TARGETS = [    
    'adopted_within_07_days',
    'adopted_within_90_days',
    'adopted_within_120_days',
]

exclude_cols = ['ID', 'first_training_date', 'topics_list'] + TARGETS

FEATURES = [c for c in df.columns if c not in exclude_cols]

df = df[FEATURES + TARGETS].copy()

categorical_cols = [c for c in df.select_dtypes(include=["object"]).columns if c in FEATURES]

print("Categorical cols:", categorical_cols)

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str).fillna("NA")
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
print(df.columns.tolist())

Categorical cols: ['gender', 'age', 'registration', 'county', 'subcounty', 'ward', 'trainer', 'clean_topics']
['gender', 'age', 'registration', 'belong_to_cooperative', 'county', 'subcounty', 'ward', 'trainer', 'num_total_trainings', 'num_repeat_trainings', 'days_to_second_training', 'num_unique_trainers', 'has_second_training', 'clean_topics', 'Advantages Of A.I /Disadvantages Of Natural Mating', 'Aflatoxin In Dairy Farming', 'Aflatoxin Mitigation In Dairy Herds', 'Aflatoxin Mitigation Through Good Agricultural Practices', 'Animal Management Practices', 'Animal Nutrition With Pembe', 'Antimicrobial Resistance', 'Asili Fertilizer', 'Asili Fertilizer (Organic)', 'Benefits Of Sistema Biogas', 'Benfits Of Sistema Biogas', 'Benfits Of Sistema Biogas.', 'Biodeal Dairy', 'Biodeal Poultry', 'Biosecurity In Poultry Farming', 'Broiler Feed Formulation', 'Calf Feeding', 'Calf Rearing For October', 'Causes Of Infertility In Dairy', 'Clean Energy With Sistema Biogas', 'Control Of External Parasite

In [None]:
target_mapping = {
    '7 Days': 'adopted_within_07_days',
    '90 Days': 'adopted_within_90_days',
    '120 Days': 'adopted_within_120_days'
}

models = {}

# Hyperparameter grid (focused, not reckless)
param_grid = {
    "estimator__n_estimators": [300, 500],
    "estimator__min_samples_leaf": [3, 5, 10],
    "estimator__max_features": ["sqrt", 0.3, 0.5],
    "estimator__max_depth": [None, 20, 30],
    "estimator__bootstrap": [True],
    "estimator__max_samples": [0.7, 0.85, None]
}

for period, target in target_mapping.items():
    print(f"\n{'='*20} Evaluating for {period} {'='*20}")

    X = df[FEATURES]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    # Base RF
    rf = RandomForestClassifier(
        random_state=42,
        n_jobs=1
    )

    # Calibrated RF
    calibrated_rf = CalibratedClassifierCV(
        estimator=rf,
        method="sigmoid",
        cv=5
    )

    # Grid Search
    grid = GridSearchCV(
        estimator=calibrated_rf,
        param_grid=param_grid,
        scoring="neg_log_loss",
        cv=3,
        n_jobs=-1,
        verbose=0
    )

    grid.fit(X_train, y_train)

    # Best model
    best_model = grid.best_estimator_
    models[period] = best_model

    # Predictions
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print(f"Best Parameters: {grid.best_params_}")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC Score: {auc:.4f}")

    loss = log_loss(y_test, y_pred_proba)
    print(f"Log Loss: {loss:.4f}")




Best Parameters: {'estimator__bootstrap': True, 'estimator__max_depth': 20, 'estimator__max_features': 'sqrt', 'estimator__max_samples': None, 'estimator__min_samples_leaf': 3, 'estimator__n_estimators': 300}
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       987
           1       0.75      0.54      0.63       123

    accuracy                           0.93      1110
   macro avg       0.85      0.76      0.80      1110
weighted avg       0.92      0.93      0.92      1110

Confusion Matrix:
[[965  22]
 [ 56  67]]
AUC Score: 0.9660
Log Loss: 0.1638



In [None]:
def preprocess_data(df_test):
    df_test = df_test.copy()
    
    # 1. Fill missing values
    if 'days_to_second_training' in df_test.columns:
        df_test["days_to_second_training"] = df_test["days_to_second_training"].fillna(-1)
    
    # 2. Re-create topic features
    # We need to apply the SAME transformation as training
    # Assumes 'parse_topics' and 'mlb' from previous cells are available in scope
    if 'topics_list' in df_test.columns:
        df_test['clean_topics'] = df_test['topics_list'].apply(parse_topics)
        # Use transform, NOT fit_transform, to keep same columns
        topics_encoded = mlb.transform(df_test['clean_topics'])
        topics_df = pd.DataFrame(topics_encoded, columns=mlb.classes_, index=df_test.index)
        df_test = pd.concat([df_test, topics_df], axis=1)
    
    # 3. Encode categorical properties
    for col, le in label_encoders.items():
        if col in df_test.columns:
            df_test[col] = df_test[col].astype(str).fillna("NA")
            # Handle unseen categories
            df_test[col] = df_test[col].map(lambda s: s if s in le.classes_ else le.classes_[0])
            df_test[col] = le.transform(df_test[col])
            
    return df_test

test_df = pd.read_csv("Original Data/Test.csv")

test_df = preprocess_data(test_df)
X_test_sub = test_df[FEATURES]

# Read Sample Submission to get correct format
submission = pd.read_csv("Original Data/SampleSubmission.csv")

# Map our target names to the submission columns
# Structure: Target_07_AUC, Target_07_LogLoss, etc.
submission_mapping = {
    '7 Days': ['Target_07_AUC', 'Target_07_LogLoss'],
    '90 Days': ['Target_90_AUC', 'Target_90_LogLoss'],
    '120 Days': ['Target_120_AUC', 'Target_120_LogLoss']
}

print("Generating predictions...")
for period, cols in submission_mapping.items():
    if period in models:
        model = models[period]
        probs = model.predict_proba(X_test_sub)[:, 1]
        
        # Fill both AUC and LogLoss columns with the probability
        for col in cols:
            submission[col] = probs
            
print("Saving submission.csv...")
submission.to_csv('submission.csv', index=False)
print("Done!")
print(submission.head())



Generating predictions...
Saving submission.csv...
Done!
          ID  Target_07_AUC  Target_90_AUC  Target_120_AUC  Target_07_LogLoss  \
0  ID_OQZATP       0.019088       0.019712        0.022664           0.019088   
1  ID_2FAI5D       0.876818       0.892065        0.870636           0.876818   
2  ID_MYUWBW       0.670178       0.546260        0.610154           0.670178   
3  ID_EUB4FR       0.027110       0.040294        0.049385           0.027110   
4  ID_AG7R5S       0.908452       0.914366        0.908003           0.908452   

   Target_90_LogLoss  Target_120_LogLoss  
0           0.019712            0.022664  
1           0.892065            0.870636  
2           0.546260            0.610154  
3           0.040294            0.049385  
4           0.914366            0.908003  
