<a href="https://colab.research.google.com/github/Ahmedabdelwaly/Grad/blob/main/SslV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pickle
import ast

In [2]:
# Load your raw dataset
from google.colab import files
uploaded = files.upload()

Saving DonorCharityDataset_Balanced80.csv to DonorCharityDataset_Balanced80.csv


In [3]:
# Load dataset
df = pd.read_csv("DonorCharityDataset_Balanced80.csv")

# Clean text and donation types
df["action_type"] = df["action_type"].str.strip().str.lower().replace("click", "view")
df["urgency_level"] = df["urgency_level"].str.strip().str.capitalize()

def clean_types(raw):
    try:
        types = ast.literal_eval(raw)
        return [t.strip().lower().replace("school suplies", "school supplies") for t in types]
    except:
        return []

df["charity_preferred_types"] = df["charity_preferred_types"].apply(clean_types)


In [4]:
donation_types = ['food', 'clothes', 'toys', 'furniture', 'electronics', 'school supplies']
type_to_index = {t: i for i, t in enumerate(donation_types)}

def multi_hot(types_list):
    vec = [0] * len(donation_types)
    for t in types_list:
        if t in type_to_index:
            vec[type_to_index[t]] = 1
    return vec

df["preferred_types_multi"] = df["charity_preferred_types"].apply(multi_hot)


In [14]:
donation_types = ['food', 'clothes', 'toys', 'furniture', 'electronics', 'school supplies']
type_to_index = {t: i for i, t in enumerate(donation_types)}

def multi_hot(types_list):
    vec = [0] * len(donation_types)
    for t in types_list:
        if t in type_to_index:
            vec[type_to_index[t]] = 1
    return vec

df["preferred_types_multi"] = df["charity_preferred_types"].apply(multi_hot)

In [15]:
# Load encoders if exist or create them
try:
    with open("action_encoder.pkl", "rb") as f:
        action_encoder = pickle.load(f)
    with open("urgency_encoder.pkl", "rb") as f:
        urgency_encoder = pickle.load(f)
except FileNotFoundError:
    action_encoder = LabelEncoder()
    urgency_encoder = LabelEncoder()
    df["action_encoded"] = action_encoder.fit_transform(df["action_type"])
    df["urgency_encoded"] = urgency_encoder.fit_transform(df["urgency_level"])
    with open("action_encoder.pkl", "wb") as f:
        pickle.dump(action_encoder, f)
    with open("urgency_encoder.pkl", "wb") as f:
        pickle.dump(urgency_encoder, f)
else:
    df["action_encoded"] = action_encoder.transform(df["action_type"])
    df["urgency_encoded"] = urgency_encoder.transform(df["urgency_level"])

In [None]:
import pickle

donation_types = ['food', 'clothes', 'toys', 'furniture', 'electronics', 'school supplies']

with open("donation_types.pkl", "wb") as f:
    pickle.dump(donation_types, f)

In [16]:
MAX_SEQ_LEN = 30

df = df.sort_values(by="donor_id")

donor_sessions = {}
for donor_id, group in df.groupby("donor_id"):
    group = group.tail(MAX_SEQ_LEN)
    donor_sessions[donor_id] = {
        "action_seq": group["action_encoded"].tolist(),
        "urgency_seq": group["urgency_encoded"].tolist(),
        "preferred_seq": group["preferred_types_multi"].tolist()
    }

# Padding helpers
def pad_seq(seq, pad_val=0):
    return [pad_val] * (MAX_SEQ_LEN - len(seq)) + seq if len(seq) < MAX_SEQ_LEN else seq[-MAX_SEQ_LEN:]

def pad_multihot(seq):
    zero = [0] * len(donation_types)
    return [zero] * (MAX_SEQ_LEN - len(seq)) + seq if len(seq) < MAX_SEQ_LEN else seq[-MAX_SEQ_LEN:]

In [17]:
class DonorCharityPairDataset(Dataset):
    def __init__(self, df, donor_sessions):
        self.samples = []
        for _, row in df.iterrows():
            donor_id = row["donor_id"]
            if donor_id not in donor_sessions: continue
            label = 1 if row["action_type"] == "donate" else 0
            self.samples.append({
                "donor_id": donor_id,
                "charity_vec": row["preferred_types_multi"],
                "label": label
            })
        self.donor_sessions = donor_sessions

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        session = self.donor_sessions[s["donor_id"]]
        return {
            "action_seq": torch.tensor(pad_seq(session["action_seq"]), dtype=torch.long),
            "urgency_seq": torch.tensor(pad_seq(session["urgency_seq"]), dtype=torch.long),
            "preferred_seq": torch.tensor(pad_multihot(session["preferred_seq"]), dtype=torch.float),
            "charity_vec": torch.tensor(s["charity_vec"], dtype=torch.float),
            "label": torch.tensor(s["label"], dtype=torch.float)
        }

dataset = DonorCharityPairDataset(df, donor_sessions)
loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [18]:
class SSLTransformer(nn.Module):
    def __init__(self, num_actions, num_urgencies, embedding_dim=128, max_seq_len=30, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        self.action_embedding = nn.Embedding(num_actions, embedding_dim)
        self.urgency_embedding = nn.Embedding(num_urgencies, embedding_dim)
        self.preferred_dense = nn.Linear(6, embedding_dim)
        self.position_embedding = nn.Embedding(max_seq_len, embedding_dim)

        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

    def forward(self, action_seq, urgency_seq, preferred_seq):
        B, T = action_seq.shape

        action_emb = self.action_embedding(action_seq)
        urgency_emb = self.urgency_embedding(urgency_seq)
        preferred_emb = self.preferred_dense(preferred_seq)
        pos_ids = torch.arange(T, device=action_seq.device).unsqueeze(0).expand(B, T)
        pos_emb = self.position_embedding(pos_ids)

        x = action_emb + urgency_emb + preferred_emb + pos_emb
        x = self.transformer_encoder(x)
        return x[:, -1, :]  # Last timestep = donor vector


In [19]:
class CharityEncoder(nn.Module):
    def __init__(self, input_dim=6, embedding_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, x):
        return self.net(x)


In [20]:
class MatchScorer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, donor_vecs, charity_vecs):
        return (donor_vecs * charity_vecs).sum(dim=1)  # (B,)


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_actions = len(action_encoder.classes_)
num_urgencies = len(urgency_encoder.classes_)

donor_encoder = SSLTransformer(num_actions, num_urgencies).to(device)
charity_encoder = CharityEncoder().to(device)
scorer = MatchScorer().to(device)

optimizer = torch.optim.Adam(list(donor_encoder.parameters()) +
                             list(charity_encoder.parameters()) +
                             list(scorer.parameters()), lr=0.001)

criterion = nn.BCEWithLogitsLoss()




In [22]:
EPOCHS = 10

for epoch in range(EPOCHS):
    donor_encoder.train()
    charity_encoder.train()
    scorer.train()

    total_loss = 0

    for batch in loader:
        for k in batch:
            batch[k] = batch[k].to(device)

        donor_vec = donor_encoder(batch["action_seq"], batch["urgency_seq"], batch["preferred_seq"])
        charity_vec = charity_encoder(batch["charity_vec"])
        scores = scorer(donor_vec, charity_vec)

        loss = criterion(scores, batch["label"])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{EPOCHS} — Loss: {avg_loss:.4f}")


Epoch 1/10 — Loss: 0.6946
Epoch 2/10 — Loss: 0.6344
Epoch 3/10 — Loss: 0.6280
Epoch 4/10 — Loss: 0.6224
Epoch 5/10 — Loss: 0.6197
Epoch 6/10 — Loss: 0.6171
Epoch 7/10 — Loss: 0.6106
Epoch 8/10 — Loss: 0.6112
Epoch 9/10 — Loss: 0.6084
Epoch 10/10 — Loss: 0.6062


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def evaluate_model(donor_encoder, charity_encoder, scorer, loader):
    donor_encoder.eval()
    charity_encoder.eval()
    scorer.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in loader:
            for k in batch:
                batch[k] = batch[k].to(device)

            donor_vec = donor_encoder(batch["action_seq"], batch["urgency_seq"], batch["preferred_seq"])
            charity_vec = charity_encoder(batch["charity_vec"])
            scores = scorer(donor_vec, charity_vec)

            preds = torch.sigmoid(scores).cpu().numpy()
            labels = batch["label"].cpu().numpy()

            y_pred.extend(preds)
            y_true.extend(labels)

    y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]

    acc = accuracy_score(y_true, y_pred_binary)
    prec = precision_score(y_true, y_pred_binary)
    rec = recall_score(y_true, y_pred_binary)
    auc = roc_auc_score(y_true, y_pred)

    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, AUC: {auc:.4f}")


In [28]:
evaluate_model(donor_encoder, charity_encoder, scorer, loader)


Accuracy: 0.6911, Precision: 0.6198, Recall: 0.3476, AUC: 0.6919


In [24]:
def recommend_real_charities(donor_id, session, donor_encoder, charity_encoder, live_charities, top_k=5):
    """
    donor_id: str
    session: dict → {"action_seq": [...], "urgency_seq": [...], "preferred_seq": [[...], [...]]}
    live_charities: list of dicts → [{"id": ..., "preferred_types": [...], "city": ...}]
    """

    action_seq = torch.tensor(pad_seq(session["action_seq"]), dtype=torch.long).unsqueeze(0).to(device)
    urgency_seq = torch.tensor(pad_seq(session["urgency_seq"]), dtype=torch.long).unsqueeze(0).to(device)
    preferred_seq = torch.tensor(pad_multihot(session["preferred_seq"]), dtype=torch.float).unsqueeze(0).to(device)

    with torch.no_grad():
        donor_vec = donor_encoder(action_seq, urgency_seq, preferred_seq)  # (1, 128)

        # Build charity batch
        charity_vecs = []
        metadata = []

        for charity in live_charities:
            mh_vec = multi_hot(charity["preferred_types"])  # already defined earlier
            charity_vecs.append(mh_vec)
            metadata.append({
                "id": charity["id"],
                "city": charity.get("city", "Unknown"),
                "types": charity["preferred_types"]
            })

        charity_vecs = torch.tensor(charity_vecs, dtype=torch.float).to(device)
        embedded = charity_encoder(charity_vecs)  # (N, 128)

        sim_scores = torch.matmul(donor_vec, embedded.T).squeeze()  # (N,)
        topk = torch.topk(sim_scores, k=top_k)

        results = []
        for idx in topk.indices.tolist():
            results.append({
                "charity_id": metadata[idx]["id"],
                "city": metadata[idx]["city"],
                "types": metadata[idx]["types"],
                "score": sim_scores[idx].item()
            })

    return results

In [None]:
import torch
import numpy as np
import pandas as pd

# Simulated 6 donation types
donation_types = ['food', 'clothes', 'toys', 'furniture', 'electronics', 'school supplies']
type_to_index = {t: i for i, t in enumerate(donation_types)}

def multi_hot(type_list):
    vec = [0] * len(donation_types)
    for t in type_list:
        if t in type_to_index:
            vec[type_to_index[t]] = 1
    return vec

# Pad sequences
MAX_LEN = 30
def pad(seq, val=0): return [val]*(MAX_LEN-len(seq)) + seq if len(seq) < MAX_LEN else seq[-MAX_LEN:]
def pad_mh(seq): return [[0]*6]*(MAX_LEN-len(seq)) + seq if len(seq) < MAX_LEN else seq[-MAX_LEN:]

# Simulated donor session history
donor_sessions = {
    "D1": {
        "action_seq": [2, 2, 0],  # donate, donate, view
        "urgency_seq": [1, 2, 1],
        "preferred_seq": [multi_hot(["food", "clothes"]), multi_hot(["food", "clothes"]), multi_hot(["clothes"])]
    },
    "D2": {
        "action_seq": [2, 0, 0],
        "urgency_seq": [1, 1, 0],
        "preferred_seq": [multi_hot(["food"]), multi_hot(["clothes"]), multi_hot(["food", "clothes"])]
    },
    "D3": {
        "action_seq": [1, 1, 0],
        "urgency_seq": [0, 0, 1],
        "preferred_seq": [multi_hot(["electronics"]), multi_hot(["furniture"]), multi_hot(["school supplies"])]
    }
}

# Real charities
charities = [
    {"id": f"C{i+1}", "preferred_types": t}
    for i, t in enumerate([
        ["food", "clothes"], ["electronics"], ["furniture"], ["toys"], ["school supplies"],
        ["food"], ["clothes"], ["furniture", "electronics"], ["toys", "school supplies"], ["food", "toys"]
    ])
]

# Create multi-hot vectors
charity_vectors = torch.tensor([multi_hot(c["preferred_types"]) for c in charities], dtype=torch.float)

# Dummy encoders: Simulate learned embeddings
torch.manual_seed(0)
donor_encoder = torch.nn.Linear(128, 128)
charity_encoder = torch.nn.Linear(6, 128)

# Compute similarity score
def score(donor_vec, charity_vecs):
    return torch.matmul(donor_vec, charity_vecs.T)

# Encode donor history
def encode_donor(donor_id):
    s = donor_sessions[donor_id]
    a = torch.tensor(pad(s["action_seq"])).unsqueeze(0)
    u = torch.tensor(pad(s["urgency_seq"])).unsqueeze(0)
    p = torch.tensor(pad_mh(s["preferred_seq"]), dtype=torch.float).unsqueeze(0)
    return donor_encoder(p.mean(dim=1))  # using mean as placeholder encoder input

# Recommend top 5 charities
top5_results = {}
for donor_id in donor_sessions:
    donor_vec = encode_donor(donor_id)
    embedded_charities = charity_encoder(charity_vectors)
    sim_scores = score(donor_vec, embedded_charities).squeeze()
    top_indices = torch.topk(sim_scores, k=5).indices.tolist()
    top5 = [(charities[i]["id"], charities[i]["preferred_types"], round(sim_scores[i].item(), 4)) for i in top_indices]
    top5_results[donor_id] = top5

# Display
rows = []
for donor_id, recs in top5_results.items():
    for rank, (cid, types, score) in enumerate(recs, 1):
        rows.append({
            "Donor": donor_id,
            "Rank": rank,
            "Charity ID": cid,
            "Preferred Types": ", ".join(types),
            "Score": score
        })

df = pd.DataFrame(rows)
print(df)
