In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
data = pd.read_csv('2021_placements.csv')

In [3]:
data

Unnamed: 0,Hospital,Posts,Applications
0,Addington Hospital,72,78
1,Boitumelo Hospital,16,53
2,Bongani Hospital,18,43
3,Charlotte Maxeke Hospital,42,58
4,Chris Hani Baragwanath Hospital,90,123
...,...,...,...
60,Tshilidzine Hospital,16,31
61,Tygerberg Hospital,72,119
62,Victoria Hospital,18,71
63,Witbank Hospital,30,424


In [4]:
data['probability'] = data.Applications/data.Applications.sum()

In [5]:
data

Unnamed: 0,Hospital,Posts,Applications,probability
0,Addington Hospital,72,78,0.006100
1,Boitumelo Hospital,16,53,0.004145
2,Bongani Hospital,18,43,0.003363
3,Charlotte Maxeke Hospital,42,58,0.004536
4,Chris Hani Baragwanath Hospital,90,123,0.009619
...,...,...,...,...
60,Tshilidzine Hospital,16,31,0.002424
61,Tygerberg Hospital,72,119,0.009306
62,Victoria Hospital,18,71,0.005553
63,Witbank Hospital,30,424,0.033159


In [6]:
total_emails = data.Posts.sum()
emails = []

for i in range(total_emails):
    emails.append(f'user{i}@example.com')
    

In [7]:
placements = []

for i in data.index:
    posts = data.Posts[i]
    hospital = data.Hospital[i]
    placements.extend([hospital] * posts)

In [8]:
df_data ={"email": emails,
          'current_placement': placements}

current_placements_df = pd.DataFrame(df_data)
current_placements_df

Unnamed: 0,email,current_placement
0,user0@example.com,Addington Hospital
1,user1@example.com,Addington Hospital
2,user2@example.com,Addington Hospital
3,user3@example.com,Addington Hospital
4,user4@example.com,Addington Hospital
...,...,...
2076,user2076@example.com,Worcester Hospital
2077,user2077@example.com,Worcester Hospital
2078,user2078@example.com,Worcester Hospital
2079,user2079@example.com,Worcester Hospital


In [11]:
def select_random_hospitals(df, current, k=3):
    # Ensure the DataFrame has the expected columns
    if 'Hospital' not in df.columns or 'probability' not in df.columns:
        raise ValueError("DataFrame must contain 'Hospital' and 'probability' columns.")

    # Normalize the probabilities to sum up to 1
    df['probability'] = df['probability'] / df['probability'].sum()

    # Use random.choices to select k hospitals based on probabilities
    selected_hospitals = random.choices(df['Hospital'], df['probability'], k=k)

    # Check for duplicates or if current is in selected_hospitals
    if len(set(selected_hospitals)) != k or current in selected_hospitals:
        return select_random_hospitals(df, current, k)  # Return the result of the recursive call

    return selected_hospitals

In [12]:
for index, row in current_placements_df.iterrows():
    current =  current_placements_df.at[index, 'current_placement']
    
    choices = select_random_hospitals(data.iloc[:, [0, 3]], current)
    
    # Check if choices has at least three elements
    if len(choices) >= 3:
        current_placements_df.at[index, 'first_choice'] = choices[0]
        current_placements_df.at[index, 'second_choice'] = choices[1]
        current_placements_df.at[index, 'third_choice'] = choices[2]
    else:
        # Handle the case where select_random_hospitals returns fewer than three choices
        print(f"Error: Insufficient choices for row {index}.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['probability'] = df['probability'] / df['probability'].sum()


In [13]:
current_placements_df.head(37)

Unnamed: 0,email,current_placement,first_choice,second_choice,third_choice
0,user0@example.com,Addington Hospital,Dihlabeng Hospital,Livingstone Hospital,Klerksdorp-Tshepong Tertiary Hospital
1,user1@example.com,Addington Hospital,Witbank Hospital,Frere Hospital,New Somerset Hospital
2,user2@example.com,Addington Hospital,Robert Mangaliso Sobukwe Hospital,Paarl Hospital,Dihlabeng Hospital
3,user3@example.com,Addington Hospital,Leratong Hospital,"Ngwelezana Hospital, Empangeni",General Justice Gizenga Mpanza Hospital
4,user4@example.com,Addington Hospital,Witbank Hospital,Klerksdorp-Tshepong Tertiary Hospital,Worcester Hospital
5,user5@example.com,Addington Hospital,Frere Hospital,Sebokeng Hospital,Potchefstroom Hospital
6,user6@example.com,Addington Hospital,General Justice Gizenga Mpanza Hospital,Kalafong Hospital,Karl Bremer Hospital
7,user7@example.com,Addington Hospital,"Ngwelezana Hospital, Empangeni",Lebowakgomo Hospital,Frere Hospital
8,user8@example.com,Addington Hospital,Klerksdorp-Tshepong Tertiary Hospital,Livingstone Hospital,Paarl Hospital
9,user9@example.com,Addington Hospital,Cofimvaba Hospital,Frere Hospital,Sebokeng Hospital


In [14]:

def find_duplicate_indices(df):
    # Initialize an empty list to store the indices of rows with duplicates
    duplicate_indices = []

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        # Convert the row values in the specified columns to a set to identify duplicates
        choices = set(row[['current_placement', 'first_choice', 'second_choice', 'third_choice']])
        
        # Check if the length of the set is less than the number of choices
        # This indicates that there are duplicate values in the choices
        if len(choices) < 4:
            duplicate_indices.append(index)
    
    return duplicate_indices


In [15]:
# Find and print the indices of rows with duplicate values
duplicate_indices = find_duplicate_indices(current_placements_df)
print(len(duplicate_indices))

0


In [18]:
current_placements_df.to_csv("simulated_placements.csv", index=False)