In [3]:
!pip install pandas numpy scikit-learn tensorflow




In [4]:
import pandas as pd

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
meta = pd.read_csv("metaData.csv")
wa = pd.read_csv("whatsapp_activity.csv")
bot = pd.read_csv("teleco_call_back.csv")
human = pd.read_csv("call_placed.csv")
field = pd.read_csv("mobile_app_data.csv")


In [5]:
wa_feat = wa.groupby("lead_code").agg(
    wa_sent=("status","count"),
    wa_read=("read_at", lambda x: x.notna().sum())
).reset_index()

wa_feat["wa_read_ratio"] = wa_feat["wa_read"] / wa_feat["wa_sent"]


In [6]:
wa_feat = wa.groupby("lead_code").agg(
    wa_msgs_sent=("status", "count"),
    wa_delivered=("status", lambda x: (x=="DELIVERED").sum()),
    wa_read=("read_at", lambda x: x.notna().sum()),
    wa_replies=("response_message", lambda x: x.notna().sum())
).reset_index()

wa_feat["wa_delivered_ratio"] = wa_feat["wa_delivered"] / wa_feat["wa_msgs_sent"]
wa_feat["wa_read_ratio"] = wa_feat["wa_read"] / wa_feat["wa_msgs_sent"]


In [7]:
bot_feat = bot.groupby("lead_code").agg(
    bot_calls=("disposition", "count"),
    bot_answered=("disposition", lambda x: (x=="ANSWERED").sum()),
    avg_bot_duration=("duration", "mean")
).reset_index()

bot_feat["bot_answer_ratio"] = bot_feat["bot_answered"] / bot_feat["bot_calls"]


In [8]:
human_feat = human.groupby("lead_code").agg(
    human_calls=("disposition", "count"),
    human_answered=("disposition", lambda x: (x=="ANSWERED").sum()),
    avg_human_duration=("duration", "mean")
).reset_index()

human_feat["human_answer_ratio"] = human_feat["human_answered"] / human_feat["human_calls"]


In [9]:
field_feat = field.groupby("lead_code").agg(
    field_visits=("visit_date", "count"),
    met_customer=("result", lambda x: (x=="MET_CUSTOMER").sum()),
    door_locked=("result", lambda x: (x=="DOOR_LOCKED").sum()),
    shifted=("result", lambda x: (x=="SHIFTED").sum())
).reset_index()

field_feat["met_ratio"] = field_feat["met_customer"] / field_feat["field_visits"]


In [10]:
sms_feat = sms.groupby("lead_code").agg(
    sms_sent=("status", "count"),
    sms_delivered=("status", lambda x: (x=="DELIVERED").sum())
).reset_index()

sms_feat["sms_delivery_ratio"] = sms_feat["sms_delivered"] / sms_feat["sms_sent"]


In [11]:
features = meta.merge(wa_feat, on="lead_code", how="left") \
               .merge(bot_feat, on="lead_code", how="left") \
               .merge(human_feat, on="lead_code", how="left") \
               .merge(field_feat, on="lead_code", how="left") \
               .merge(sms_feat, on="lead_code", how="left")

features.fillna(0, inplace=True)


In [12]:
train_df = train.merge(features, on="lead_code", how="left")
test_df  = test.merge(features, on="lead_code", how="left")

train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)


In [13]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ["state", "dpd_bucket", "suggested_action"]

for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col]  = le.transform(test_df[col])


In [14]:
X = train_df.drop(columns=["id", "TARGET"])
y = train_df["TARGET"]

X_test = test_df.drop(columns=["id"])


In [16]:
X.dtypes


Unnamed: 0,0
lead_code,object
suggested_action,int64
total_due,int64
dpd_bucket,int64
state,int64
wa_msgs_sent,float64
wa_delivered,float64
wa_read,float64
wa_replies,float64
wa_delivered_ratio,float64


In [17]:
drop_cols = ["id", "TARGET", "lead_code"]

# also drop agent_id if present
if "agent_id" in train_df.columns:
    drop_cols.append("agent_id")

X = train_df.drop(columns=drop_cols)
y = train_df["TARGET"]

X_test = test_df.drop(columns=[c for c in drop_cols if c != "TARGET"])


In [18]:
X.select_dtypes(include="object").columns


Index([], dtype='object')

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)


In [20]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["AUC"]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
model.fit(
    X, y,
    validation_split=0.2,
    epochs=20,
    batch_size=256,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=3, restore_best_weights=True
        )
    ]
)


Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - AUC: 0.6671 - loss: 0.6881 - val_AUC: 0.8564 - val_loss: 0.6746
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8034 - loss: 0.6753 - val_AUC: 0.8764 - val_loss: 0.6716
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8231 - loss: 0.6732 - val_AUC: 0.8742 - val_loss: 0.6710
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8152 - loss: 0.6721 - val_AUC: 0.8756 - val_loss: 0.6704
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8209 - loss: 0.6711 - val_AUC: 0.8684 - val_loss: 0.6702
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8287 - loss: 0.6711 - val_AUC: 0.8707 - val_loss: 0.6698
Epoch 7/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - 

<keras.src.callbacks.history.History at 0x7d93c8f2df70>

In [22]:
preds = model.predict(X_test).flatten()
preds = preds.clip(0.001, 0.999)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [24]:
submission = pd.DataFrame({
    "id": test_df["id"],
    "TARGET": preds
})

submission.to_csv("submission.csv", index=False)
