In [1]:
import sklearn

In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, MultiLabelBinarizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
print(torch.__version__)

2.6.0


In [5]:
user = pd.read_csv('data/users.csv')
print("Columns in the CSV file:", user.columns.tolist())

Columns in the CSV file: ['user_id', 'name', 'gender', 'preferred_modality', 'preferred_gender', 'preferred_language', 'preferred_days', 'preferred_mode', 'preferred_specialties', 'preferred_therapist_id']


In [6]:
therapist = pd.read_csv('data/therapists.csv')
print("Columns in the CSV file:", therapist.columns.tolist())

Columns in the CSV file: ['id', 'therapist_name', 'gender', 'modality', 'language', 'available_days', 'mode', 'experience_years', 'specialties']


In [7]:
class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        self.mlb.fit(X)
        return self

    def transform(self, X):
        return self.mlb.transform(X)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [13]:
# Load the data
users_df = pd.read_csv('data/users.csv')
therapists_df = pd.read_csv('data/therapists.csv')

In [16]:
# Create feature vectors for user-therapist pairs
def create_features(user, therapist):
    features = []
    
    # Feature 1: Match on modality
    features.append(1 if user['preferred_modality'] == therapist['modality'] else 0)
    
    # Feature 2: Match on gender
    features.append(1 if user['preferred_gender'] == therapist['gender'] else 0)
    
    # Feature 3: Match on language
    features.append(1 if user['preferred_language'] == therapist['language'] else 0)
    
    # Feature 4: Match on mode
    features.append(1 if user['preferred_mode'] == therapist['mode'] else 0)
    
    # Feature 5: Overlap in days (at least one day in common)
    user_days = set(user['preferred_days'].split(',')) if pd.notna(user['preferred_days']) else set()
    therapist_days = set(therapist['available_days'].split(',')) if pd.notna(therapist['available_days']) else set()
    features.append(1 if user_days.intersection(therapist_days) else 0)
    
    # Feature 6: Overlap in specialties (count of overlapping specialties)
    user_specialties = set(user['preferred_specialties'].split(',')) if pd.notna(user['preferred_specialties']) else set()
    therapist_specialties = set(therapist['specialties'].split(',')) if pd.notna(therapist['specialties']) else set()
    features.append(len(user_specialties.intersection(therapist_specialties)))
    
    # Feature 7: Therapist experience years
    features.append(therapist['experience_years'] if pd.notna(therapist['experience_years']) else 0)
    
    return np.array(features)

# Create training data using preferred_therapist_id as the label
X_train = []
y_train = []

for _, user in users_df.iterrows():
    preferred_therapist_id = user['preferred_therapist_id']
    if pd.isna(preferred_therapist_id):
        continue  # Skip users with no preferred therapist
    
    for _, therapist in therapists_df.iterrows():
        features = create_features(user, therapist)
        X_train.append(features)
        # Label: 1 if this therapist is the preferred one, 0 otherwise
        y_train.append(1 if therapist['id'] == preferred_therapist_id else 0)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train logistic regression
logreg = LogisticRegression(random_state=42, class_weight='balanced')  # Handle imbalanced data
logreg.fit(X_train_scaled, y_train)

# Print the learned weights
feature_names = [
    "modality_match", "gender_match", "language_match", "mode_match",
    "days_overlap", "specialties_overlap", "experience_years"
]
weights = logreg.coef_[0]
print("Learned weights for each feature:")
for name, weight in zip(feature_names, weights):
    print(f"{name}: {weight:.4f}")
print(f"Intercept: {logreg.intercept_[0]:.4f}")

# Save the scaler and model
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(logreg, 'logreg_model.pkl')

print("Logistic regression model trained and saved.")

Learned weights for each feature:
modality_match: 0.7118
gender_match: 0.4513
language_match: -0.0339
mode_match: -0.1996
days_overlap: 0.4463
specialties_overlap: 0.6322
experience_years: 0.1413
Intercept: -1.1499
Logistic regression model trained and saved.


In [None]:

# Convert logistic regression to PyTorch for RL updates
class LogisticRegressionTorch(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionTorch, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# Initialize PyTorch model with weights from sklearn
input_dim = X_train.shape[1]
model = LogisticRegressionTorch(input_dim)

# Copy weights from sklearn model
with torch.no_grad():
    model.linear.weight.data = torch.tensor(logreg.coef_, dtype=torch.float32)
    model.linear.bias.data = torch.tensor(logreg.intercept_, dtype=torch.float32)

# Define optimizer for RL updates
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Simulate user feedback (replace with real feedback in practice)
def simulate_feedback(user_id, therapist_id):
    # Placeholder: simulate feedback (+1 for good match, -1 for bad)
    # In practice, replace with real user feedback (e.g., ratings)
    return np.random.choice([1, -1])

# Step 6: RL training loop (policy gradient) - Fixed version
def rl_update(model, scaler, users_df, therapists_df, num_episodes=100):
    model.train()  # Set model to training mode to ensure gradients are tracked
    for episode in range(num_episodes):
        total_reward = 0
        log_probs = []
        rewards = []
        
        for _, user in users_df.sample(frac=0.1).iterrows():
            scores = []
            therapist_ids = []
            
            # Compute scores for all therapists
            for _, therapist in therapists_df.iterrows():
                features = create_features(user, therapist)
                features_scaled = scaler.transform([features])[0]
                features_tensor = torch.tensor(features_scaled, dtype=torch.float32, requires_grad=True)
                prob = model(features_tensor)  # Ensure prob is part of the computation graph
                scores.append(prob)  # Keep as tensor, don't convert to item yet
                therapist_ids.append(therapist['id'])
            
            # Stack scores into a tensor
            scores_tensor = torch.stack(scores).squeeze()  # Shape: [num_therapists]
            
            # Apply softmax to get probabilities
            probs = torch.softmax(scores_tensor, dim=0)  # Shape: [num_therapists]
            
            # Sample an action (therapist) using Categorical distribution
            dist = torch.distributions.Categorical(probs=probs)
            action = dist.sample()  # Sample an action index
            log_prob = dist.log_prob(action)  # Compute log probability of the action
            
            # Get reward (simulated feedback)
            therapist_id = therapist_ids[action.item()]
            reward = simulate_feedback(user['user_id'], therapist_id)
            
            log_probs.append(log_prob)
            rewards.append(reward)
            total_reward += reward
        
        # Compute policy loss
        policy_loss = []
        for log_prob, reward in zip(log_probs, rewards):
            # Ensure reward is a scalar float or tensor, not a list or array
            reward_tensor = torch.tensor(float(reward), requires_grad=False)
            policy_loss.append(-log_prob * reward_tensor)
        
        # Stack and compute mean loss
        policy_loss = torch.stack(policy_loss).mean()
        
        # Backpropagate and update model
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
        print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# Run RL updates
rl_update(model, scaler, users_df, therapists_df)

# Save the updated model
torch.save(model.state_dict(), 'logreg_rl_model.pth')
print("Updated model saved.")

Episode 1, Total Reward: -4
Episode 2, Total Reward: -2
Episode 3, Total Reward: 2
Episode 4, Total Reward: -6
Episode 5, Total Reward: -2
Episode 6, Total Reward: 4
Episode 7, Total Reward: 4
Episode 8, Total Reward: 0
Episode 9, Total Reward: 0
Episode 10, Total Reward: 2
Episode 11, Total Reward: -2
Episode 12, Total Reward: -4
Episode 13, Total Reward: -8
Episode 14, Total Reward: 4
Episode 15, Total Reward: 2
Episode 16, Total Reward: 2
Episode 17, Total Reward: 0
Episode 18, Total Reward: -4
Episode 19, Total Reward: -4
Episode 20, Total Reward: 4
Episode 21, Total Reward: 0
Episode 22, Total Reward: 0
Episode 23, Total Reward: 2
Episode 24, Total Reward: -2
Episode 25, Total Reward: 0
Episode 26, Total Reward: 0
Episode 27, Total Reward: 0
Episode 28, Total Reward: 2
Episode 29, Total Reward: 2
Episode 30, Total Reward: 0
Episode 31, Total Reward: 2
Episode 32, Total Reward: -4
Episode 33, Total Reward: 6
Episode 34, Total Reward: 4
Episode 35, Total Reward: 2
Episode 36, Total 

In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import joblib

# Step 1: Define the LogisticRegressionTorch class (same as during training)
class LogisticRegressionTorch(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionTorch, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# Step 2: Load the scaler and model
scaler = joblib.load('scaler.pkl')
input_dim = 7  # Match the input_dim from training (number of features)
model = LogisticRegressionTorch(input_dim)
model.load_state_dict(torch.load('logreg_rl_model.pth'))
model.eval()

# Step 3: Load therapist data
try:
    therapists_df = pd.read_csv('therapist.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure 'therapists.csv' is in the correct directory.")
    exit(1)

# Step 4: Define the create_features function (same as during training)
def create_features(user, therapist):
    features = []
    features.append(1 if user['preferred_modality'] == therapist['modality'] else 0)
    features.append(1 if user['preferred_gender'] == therapist['gender'] else 0)
    features.append(1 if user['preferred_language'] == therapist['language'] else 0)
    features.append(1 if user['preferred_mode'] == therapist['mode'] else 0)
    user_days = set(user['preferred_days'].split(',')) if pd.notna(user['preferred_days']) else set()
    therapist_days = set(therapist['available_days'].split(',')) if pd.notna(therapist['available_days']) else set()
    features.append(1 if user_days.intersection(therapist_days) else 0)
    user_specialties = set(user['preferred_specialties'].split(',')) if pd.notna(user['preferred_specialties']) else set()
    therapist_specialties = set(therapist['specialties'].split(',')) if pd.notna(therapist['specialties']) else set()
    features.append(len(user_specialties.intersection(therapist_specialties)))
    features.append(therapist['experience_years'] if pd.notna(therapist['experience_years']) else 0)
    return np.array(features)

# Step 5: Define a function to suggest top 5 therapists for a user
def suggest_top_5_therapists(user_data, therapists_df, scaler, model):
    scores = []
    therapist_ids = []
    
    # Compute scores for all therapists
    for _, therapist in therapists_df.iterrows():
        features = create_features(user_data, therapist)
        features_scaled = scaler.transform([features])[0]
        features_tensor = torch.tensor(features_scaled, dtype=torch.float32)
        with torch.no_grad():
            prob = model(features_tensor).item()
        scores.append(prob)
        therapist_ids.append(therapist['id'])
    
    # Get top 5 therapists
    top_indices = np.argsort(scores)[-5:][::-1]  # Sort in descending order and take top 5
    top_therapists = [
        {
            "therapist_id": therapist_ids[i],
            "therapist_name": therapists_df.loc[therapists_df['id'] == therapist_ids[i], 'therapist_name'].iloc[0],
            "score": scores[i]
        }
        for i in top_indices
    ]
    
    return top_therapists

# Step 6: Example usage
# Option 1: Provide user data manually as a dictionary
user_data_manual = pd.Series({
    "preferred_modality": "CBT",
    "preferred_gender": "female",
    "preferred_language": "English",
    "preferred_days": "Monday,Wednesday",
    "preferred_mode": "in-person",
    "preferred_specialties": "anxiety,depression",
    "age": 30  # Not used in features but included for completeness
})

top_5 = suggest_top_5_therapists(user_data_manual, therapists_df, scaler, model)
print("\nTop 5 therapists for manually provided user:")
for therapist in top_5:
    print(f"Therapist ID: {therapist['therapist_id']}, Name: {therapist['therapist_name']}, Score: {therapist['score']:.4f}")

# Option 2: Pick a user from users.csv for demonstration
try:
    users_df = pd.read_csv('data/users.csv')
    if not users_df.empty:
        # Pick the first user for demonstration (you can change the index or user_id)
        sample_user = users_df.iloc[0]
        print(f"\nSuggesting top 5 therapists for user: {sample_user['name']} (User ID: {sample_user['user_id']})")
        top_5 = suggest_top_5_therapists(sample_user, therapists_df, scaler, model)
        for therapist in top_5:
            print(f"Therapist ID: {therapist['therapist_id']}, Name: {therapist['therapist_name']}, Score: {therapist['score']:.4f}")
    else:
        print("No users found in user.csv.")
except FileNotFoundError as e:
    print(f"Error: {e}. Could not load user.csv for demonstration.")

Error: [Errno 2] No such file or directory: 'therapist.csv'. Please ensure 'therapists.csv' is in the correct directory.

Top 5 therapists for manually provided user:
Therapist ID: 200, Name: Dr. Betelhem Alemu, Score: 0.1045
Therapist ID: 262, Name: Dr. Fikadu Alemu, Score: 0.1045
Therapist ID: 195, Name: Dr. Arsema Tsegaye, Score: 0.1045
Therapist ID: 21, Name: Dr. Bereket Ke/bede, Score: 0.1045
Therapist ID: 96, Name: Dr. Worku Abdullahi, Score: 0.1045

Suggesting top 5 therapists for user: Chala Teshome (User ID: 5)
Therapist ID: 154, Name: Dr. Saba Zewdu, Score: 0.9815
Therapist ID: 238, Name: Dr. Gashaw Tadesse, Score: 0.9773
Therapist ID: 370, Name: Dr. Gashaw Desta, Score: 0.9599
Therapist ID: 206, Name: Dr. Gebeyaw Alemu, Score: 0.9520
Therapist ID: 143, Name: Dr. Kebede Belay, Score: 0.9484
