In [1]:
import pandas as pd
import model
import openpyxl

In [2]:
ls

Recommendation Form (Yanıtlar) - Form Yanıtları 1.csv
[34m__pycache__[m[m/
data-mentors.csv
data-pairings.csv
data-students.csv
main.ipynb
model.py
reference scores.ipynb


In [3]:
# Read data

data_mentor = pd.read_csv('data-mentors.csv', encoding='utf-8')
data_student = pd.read_csv('data-students.csv', encoding='utf-8')
data_pairing = pd.read_csv('data-pairings.csv', encoding = 'utf-8')

Required changes in the data files:

1. Both in data_student and data_mentor; Simplify the column names to only subject names by removing the questions 
These are the 6th-36th indexed columns in data_student, 5-35 in data_mentor

2. In data_mentor; quantify mentor's position. Master:1, PhD Student:2, Postdoctoral Researcher:3

In [4]:
# 1. Remove the questions in the column names in data_student and data_mentor

for i in range(5,35):
    data_student.rename(columns = {data_student.columns[i+1]: data_student.columns[i+1][80:-1]},inplace=True)
    data_mentor.rename(columns = {data_mentor.columns[i]: data_mentor.columns[i][66:-1]},inplace=True)

In [5]:
# 2. Introduce a new column 'Position_numerical' that quantifies mentor's positions

def quantify_position(pos):
    if pos == 'Master':
        return 1
    if pos == 'PhD student':
        return 2
    if pos == 'Postdoctoral researcher':
        return 3

data_mentor.insert(loc = 4, column = 'Position_numeric', value = data_mentor['Position'].apply(quantify_position))

In [6]:
# See the first few rows of data_mentor

data_mentor.head()

Unnamed: 0,Name,Gender,Affiliation,Position,Position_numeric,Areas of interest:,Set Theory/Model Theory,Category Theory,Real Analysis/Functional Analysis,Complex Analysis,...,Statistics,Computer Algebra,Data Science,Cryptography/Coding theory,Mathematical Modeling,Classical Mechanics,Complex Systems/Dynamical Systems,Quantum Theory,Relativity and Quantum Relativistic Theories,Statistical Mechanics
0,Mentor1,Male,The University of Alabama,Postdoctoral researcher,3,PDEs,,,Second Choice,,...,,,Fourth Choice,,,,,,,
1,Mentor2,Male,Middle East Technical University METU,Master,1,Algebraic and Differential Geometry,,,,,...,,,,,,,,,,
2,Mentor3,Male,Illinois Institute of Technology,PhD student,2,"Extremal Graph Theory, Optimization",,,,,...,,,,,First Choice,,,,,
3,Mentor4,Male,Higher School of Economics National Research U...,Master,1,"Plane Arrangements, Clifford Algebras, Algebra...",,,,Third Choice,...,,,,,,,,,,
4,Mentor5,Male,İstanbul teknik üniversitesi,PhD student,2,Data science/coding theory,,,,,...,,Second Choice,First Choice,,,,,,,


In [27]:
# Remove first years students

# data_student.drop(data_student[data_student['Class'] == '1'].index, inplace = True)



Required sets for the model

1. List of all universities
2. List of subjects
3. A dictionary for universites - key: university name, value: list of students from that university
4. Two dictionaries for interest areas (one for students, one for mentors) - key: student name, value: a list of his/her ordered choices
5. A dictionary for the score of assigning mentee i to mentor j - key: pairs (student, mentee), value: an integer score of matching a mentor with a mentee


Note: If a mentee (or mentor) has only one interest, say only choice 1, then leave his/her choices 2 and 3 empty, and assign a score to him/her by using only choice 1. Since some mentors have a fourth choice, all mentees will get zero for their fourth choices.

In [7]:
# The required set (1)

student_uni = data_student['University'].unique()    

# The required set (2)

subjects = [col for col in data_student.columns[6:36]]  

# The required set (3)

# data_student['University'].value_counts()

uni_students = data_student.groupby('University')['Name'].apply(list).to_dict()

In [8]:
# The required set (4)

choice_mentee = {}  
choice_mentor = {}
wrong_mentee=[]
wrong_mentor = []

for index,row in data_student.iterrows():
    choice_mentee[row['Name']] = {'First Choice':0, 'Second Choice':0, 'Third Choice':0}
    for subject in subjects:
        if pd.isnull(row[subject]) == False:
            interest_list = str(row[subject]).split(',')
          #  print(interest_list)

            for word in interest_list:
                if word in ['First Choice', 'Second Choice', 'Third Choice']:
                    choice_mentee[row['Name']][word] = subject
                else:   
                    wrong_mentee.append(word)
                   # print(row)


for index,row in data_mentor.iterrows():
    choice_mentor[row['Name']] = {'First Choice':0, 'Second Choice':0, 'Third Choice':0, 'Fourth Choice':0}
    for subject in subjects:
        if pd.isnull(row[subject]) == False:
            interest_list = str(row[subject]).split(',')

            for word in interest_list:
                if word in ['First Choice', 'Second Choice', 'Third Choice', 'Fourth Choice']:
                    choice_mentor[row['Name']][word] = subject
                else:   
                    wrong_mentor.append(word)
                   # print(row)
                 

In [9]:
# The required set (5)

weights = {'First Choice':8, 'Second Choice':6, 'Third Choice':4, 'Fourth Choice':2}

# Define score function

def score(student, mentor):
    score = 0
    for choice1 in choice_mentee[student].keys():
            for choice2 in choice_mentor[mentor].keys():
                if choice_mentee[student][choice1] == choice_mentor[mentor][choice2]:
                    score += weights[choice1]*weights[choice2]
    return score

# Define the scores dictionary

scores = {(mentee, mentor):0 for mentor in choice_mentor.keys() for mentee in choice_mentee.keys()}

for student in choice_mentee.keys():
    for mentor in choice_mentor.keys():
        scores[(student,mentor)] = score(student,mentor)

In [10]:
max_student = 1
uni_capacity = 10
time = 300

obj_value, solution, df = model.matching(choice_mentee, choice_mentor, uni_students, scores, max_student, uni_capacity, time)
print(obj_value)
df

3900.0


Unnamed: 0,Mentor,Student,Score
0,Mentor1,Student18,88
1,Mentor2,Student69,48
2,Mentor3,Student42,48
3,Mentor4,Student26,64
4,Mentor5,Student38,112
5,Mentor6,Student9,96
6,Mentor7,Student35,88
7,Mentor8,Student85,64
8,Mentor9,Student49,104
9,Mentor10,Student54,100


In [11]:
# Score of the 2022 pairing

score_2022 = 0

for index, row in data_pairing.iterrows():
    score_2022 += score(row['Student'],row['Mentor'])
    
score_2022

2564