In [1]:
import pandas as pd
import numpy as np  
import ast
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss,confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

In [2]:
df = pd.read_csv("Original Data/Train.csv")

In [3]:
df["days_to_second_training"] = df["days_to_second_training"].fillna(-1)

In [4]:
def parse_topics(topic_str):
    if not isinstance(topic_str, str):
        return []
    try:
        raw_list = ast.literal_eval(topic_str)
        final_topics = []
        for item in raw_list:
            if ',' in item: # Handle "Topic A,Topic B" inside one string
                split_items = [x.strip() for x in item.split(',')]
                final_topics.extend(split_items)
            else:
                final_topics.append(item.strip())
        return final_topics
    except:
        return []

df['clean_topics'] = df['topics_list'].apply(parse_topics)
mlb = MultiLabelBinarizer()
topics_encoded = mlb.fit_transform(df['clean_topics'])
topics_df = pd.DataFrame(topics_encoded, columns=mlb.classes_, index=df.index)
df = pd.concat([df, topics_df], axis=1)
print(f"Created {len(mlb.classes_)} new topic features.")
print("Sample columns:", list(mlb.classes_)[:5])

Created 157 new topic features.
Sample columns: ['Advantages Of A.I /Disadvantages Of Natural Mating', 'Aflatoxin In Dairy Farming', 'Aflatoxin Mitigation In Dairy Herds', 'Aflatoxin Mitigation Through Good Agricultural Practices', 'Animal Management Practices']


In [5]:
# 1. Define specific features and targets
TARGETS = [    
    'adopted_within_07_days',
    'adopted_within_90_days',
    'adopted_within_120_days',
]

exclude_cols = ['ID', 'first_training_date', 'topics_list'] + TARGETS

FEATURES = [c for c in df.columns if c not in exclude_cols]

df = df[FEATURES + TARGETS].copy()

categorical_cols = [c for c in df.select_dtypes(include=["object"]).columns if c in FEATURES]

print("Categorical cols:", categorical_cols)

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str).fillna("NA")
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
print(df.columns.tolist())

Categorical cols: ['gender', 'age', 'registration', 'county', 'subcounty', 'ward', 'trainer', 'clean_topics']
['gender', 'age', 'registration', 'belong_to_cooperative', 'county', 'subcounty', 'ward', 'trainer', 'num_total_trainings', 'num_repeat_trainings', 'days_to_second_training', 'num_unique_trainers', 'has_second_training', 'clean_topics', 'Advantages Of A.I /Disadvantages Of Natural Mating', 'Aflatoxin In Dairy Farming', 'Aflatoxin Mitigation In Dairy Herds', 'Aflatoxin Mitigation Through Good Agricultural Practices', 'Animal Management Practices', 'Animal Nutrition With Pembe', 'Antimicrobial Resistance', 'Asili Fertilizer', 'Asili Fertilizer (Organic)', 'Benefits Of Sistema Biogas', 'Benfits Of Sistema Biogas', 'Benfits Of Sistema Biogas.', 'Biodeal Dairy', 'Biodeal Poultry', 'Biosecurity In Poultry Farming', 'Broiler Feed Formulation', 'Calf Feeding', 'Calf Rearing For October', 'Causes Of Infertility In Dairy', 'Clean Energy With Sistema Biogas', 'Control Of External Parasite

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = [c for c in df.select_dtypes(include=["object"]).columns if c in FEATURES]


In [6]:
df.head()

Unnamed: 0,gender,age,registration,belong_to_cooperative,county,subcounty,ward,trainer,num_total_trainings,num_repeat_trainings,...,Unga Dairy Feeds,Unga Feed For Layers,Weed Management,Weed Management In Crop,Weed Management In Maize And Beans,Why You Should Vaccinate Your Animals,Yara Maziwa Pro,adopted_within_07_days,adopted_within_90_days,adopted_within_120_days
0,0,1,1,0,1,2,22,2,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,1,2,22,2,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,6,24,1,4,39,38,...,0,0,0,0,0,1,0,1,1,1
3,0,0,0,0,6,24,1,4,301,300,...,0,0,0,0,0,1,1,1,1,1
4,0,0,1,0,6,24,1,4,19,18,...,0,0,0,0,0,1,0,0,0,0


In [7]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, log_loss

target_mapping = {
    '7 Days': 'adopted_within_07_days',
    '90 Days': 'adopted_within_90_days',
    '120 Days': 'adopted_within_120_days'
}

models = {}  # Dictionary to store trained models

# Define the hyperparameter grid
param_grid = {
    'estimator__max_depth': [3, 5, 7, 10, None],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 5, 10],
    'estimator__max_features': [None, 'sqrt', 'log2']
}

for period, target in target_mapping.items():
    print(f"\n{'='*20} Evaluating for {period} {'='*20}")
    
    X = df[FEATURES]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Base Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    
    # Wrap in a calibrated classifier
    calibrated_dt = CalibratedClassifierCV(estimator=dt, method='sigmoid', cv=5)
    
    # Grid search to tune hyperparameters
    grid = GridSearchCV(
        estimator=calibrated_dt,
        param_grid=param_grid,
        scoring='neg_log_loss',  # maximize probability quality
        cv=3,
        verbose=0,
        n_jobs=-1
    )
    
    grid.fit(X_train, y_train)
    
    print(f"Best Parameters: {grid.best_params_}")
    
    # ------------------------------
    # Train final model on full dataset using best params
    # ------------------------------
    final_dt = DecisionTreeClassifier(
        random_state=42,
        max_depth=grid.best_params_['estimator__max_depth'],
        min_samples_split=grid.best_params_['estimator__min_samples_split'],
        min_samples_leaf=grid.best_params_['estimator__min_samples_leaf'],
        max_features=grid.best_params_['estimator__max_features']
    )
    
    final_calibrated = CalibratedClassifierCV(estimator=final_dt, method='sigmoid', cv=5)
    final_calibrated.fit(X, y)  # train on full dataset
    
    models[period] = final_calibrated  # replace with full-data model
    
    # Predictions on the previous test split (optional)
    y_pred = final_calibrated.predict(X_test)
    y_pred_proba = final_calibrated.predict_proba(X_test)[:, 1]
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC Score: {auc:.4f}")
    
    loss = log_loss(y_test, y_pred_proba)
    print(f"Log Loss: {loss:.4f}")



Best Parameters: {'estimator__max_depth': None, 'estimator__max_features': None, 'estimator__min_samples_leaf': 10, 'estimator__min_samples_split': 2}
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       981
           1       0.89      0.66      0.76       129

    accuracy                           0.95      1110
   macro avg       0.92      0.82      0.86      1110
weighted avg       0.95      0.95      0.95      1110

Confusion Matrix:
[[970  11]
 [ 44  85]]
AUC Score: 0.9842
Log Loss: 0.1389

Best Parameters: {'estimator__max_depth': None, 'estimator__max_features': None, 'estimator__min_samples_leaf': 10, 'estimator__min_samples_split': 2}
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       963
           1       0.92      0.61      0.73       147

    accuracy                           0.94      1110
   macro avg       0.93      

In [None]:
def preprocess_data(df_test):
    df_test = df_test.copy()
    
    
    # 2. Re-create topic features
    # We need to apply the SAME transformation as training
    # Assumes 'parse_topics' and 'mlb' from previous cells are available in scope
    if 'topics_list' in df_test.columns:
        df_test['clean_topics'] = df_test['topics_list'].apply(parse_topics)
        # Use transform, NOT fit_transform, to keep same columns
        topics_encoded = mlb.transform(df_test['clean_topics'])
        topics_df = pd.DataFrame(topics_encoded, columns=mlb.classes_, index=df_test.index)
        df_test = pd.concat([df_test, topics_df], axis=1)
    
    # 3. Encode categorical properties
    for col, le in label_encoders.items():
        if col in df_test.columns:
            df_test[col] = df_test[col].astype(str).fillna("NA")
            # Handle unseen categories
            df_test[col] = df_test[col].map(lambda s: s if s in le.classes_ else le.classes_[0])
            df_test[col] = le.transform(df_test[col])
            
    return df_test

test_df = pd.read_csv("Original Data/Test.csv")

test_df = preprocess_data(test_df)
X_test_sub = test_df[FEATURES]

# Read Sample Submission to get correct format
submission = pd.read_csv("Original Data/SampleSubmission.csv")

# Map our target names to the submission columns
# Structure: Target_07_AUC, Target_07_LogLoss, etc.
submission_mapping = {
    '7 Days': ['Target_07_AUC', 'Target_07_LogLoss'],
    '90 Days': ['Target_90_AUC', 'Target_90_LogLoss'],
    '120 Days': ['Target_120_AUC', 'Target_120_LogLoss']
}

print("Generating predictions...")
for period, cols in submission_mapping.items():
    if period in models:
        model = models[period]
        probs = model.predict_proba(X_test_sub)[:, 1]
        
        # Fill both AUC and LogLoss columns with the probability
        for col in cols:
            submission[col] = probs
            
print("Saving submission.csv...")
submission.to_csv('submission.csv', index=False)
print("Done!")
print(submission.head())



Generating predictions...
Saving submission.csv...
Done!
          ID  Target_07_AUC  Target_90_AUC  Target_120_AUC  Target_07_LogLoss  \
0  ID_OQZATP       0.036362       0.040490        0.040986           0.036362   
1  ID_2FAI5D       0.512364       0.436408        0.433624           0.512364   
2  ID_MYUWBW       0.473245       0.621085        0.649654           0.473245   
3  ID_EUB4FR       0.079942       0.155769        0.214911           0.079942   
4  ID_AG7R5S       0.732104       0.640995        0.644515           0.732104   

   Target_90_LogLoss  Target_120_LogLoss  
0           0.040490            0.040986  
1           0.436408            0.433624  
2           0.621085            0.649654  
3           0.155769            0.214911  
4           0.640995            0.644515  
