In [18]:
import joblib
import pandas as pd
from sklearn.preprocessing import (
    LabelBinarizer, 
    LabelEncoder,
    OneHotEncoder,
)
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Load the raw applicants table and discard the "Unnamed: 0" column in one
# chained expression, so no frame is mutated in place.
df = pd.read_csv("ТИУ Студенты 2019-2024.csv").drop(columns=["Unnamed: 0"])
df.head(n=5)

Unnamed: 0,Пол,Иностранное гражданство,Служба в армии,Ср. балл док-та об образовании,Сумма баллов,Сумма баллов за индивидуальные достижения,Направление подготовки,Русский язык,Обществознание,мат,Физика,Химия,История,Информатика и информационно–коммуникационные технологии (ИКТ)
0,Ж,Россия,0,4.667,210,10,27.03.01 Стандартизация и метрология,87,0,56,57,0,0,0
1,М,Киргизия,0,3.7,190,0,08.03.01 Строительство,72,0,60,58,0,0,0
2,М,Россия,0,5.0,214,8,08.03.01 Строительство,85,0,70,51,0,0,0
3,М,Россия,0,4.286,208,10,13.03.01 Теплоэнергетика и теплотехника,76,0,70,52,0,0,0
4,М,Узбекистан,0,4.826,189,0,08.03.01 Строительство,68,0,60,61,0,0,0


In [20]:
# Translate the raw gender codes into English labels.
#
# An explicit mapping replaces the previous catch-all `else "female"`, which
# silently labelled missing or unexpected values as "female". The identity
# entries ("male" -> "male", "female" -> "female") keep the cell idempotent:
# re-running it after the column has been converted leaves the data unchanged
# instead of turning every row into "female".
gender_labels = {"М": "male", "Ж": "female", "male": "male", "female": "female"}
gender_mapped = df["Пол"].map(gender_labels)

# Values absent from the mapping (including NaN) come back as NaN from .map();
# fail loudly rather than carry mislabelled rows forward.
if gender_mapped.isna().any():
    unexpected_gender = df.loc[gender_mapped.isna(), "Пол"].unique().tolist()
    raise ValueError(f"Unexpected values in column 'Пол': {unexpected_gender}")

df["Пол"] = gender_mapped
df["Пол"].value_counts()

Пол
male      7143
female    3503
Name: count, dtype: int64

In [21]:
# Translate the 0/1 military-service flag into "no"/"yes" labels.
#
# The identity entries keep the cell idempotent: previously a second run
# compared the already-converted strings with 1 and rewrote every "yes" to "no".
# Any value outside the mapping (including NaN) is reported instead of being
# silently treated as "no".
service_labels = {1: "yes", 0: "no", "yes": "yes", "no": "no"}
service_mapped = df["Служба в армии"].map(service_labels)

if service_mapped.isna().any():
    unexpected_service = df.loc[service_mapped.isna(), "Служба в армии"].unique().tolist()
    raise ValueError(
        f"Unexpected values in column 'Служба в армии': {unexpected_service}"
    )

df["Служба в армии"] = service_mapped
df["Служба в армии"].value_counts()

Служба в армии
no     10566
yes       80
Name: count, dtype: int64

In [22]:
# Preview the first three rows after relabelling the two categorical columns.
df.head(n=3)

Unnamed: 0,Пол,Иностранное гражданство,Служба в армии,Ср. балл док-та об образовании,Сумма баллов,Сумма баллов за индивидуальные достижения,Направление подготовки,Русский язык,Обществознание,мат,Физика,Химия,История,Информатика и информационно–коммуникационные технологии (ИКТ)
0,female,Россия,no,4.667,210,10,27.03.01 Стандартизация и метрология,87,0,56,57,0,0,0
1,male,Киргизия,no,3.7,190,0,08.03.01 Строительство,72,0,60,58,0,0,0
2,male,Россия,no,5.0,214,8,08.03.01 Строительство,85,0,70,51,0,0,0


In [23]:
# Russian source header -> English snake_case column name.
# Built from (old, new) pairs; the resulting dict is passed to DataFrame.rename.
columns = dict(
    [
        ("Пол", "gender"),
        ("Иностранное гражданство", "foreign_citizenship"),
        ("Служба в армии", "military_service"),
        ("Ср. балл док-та об образовании", "gpa"),
        ("Сумма баллов", "points"),
        ("Сумма баллов за индивидуальные достижения", "bonus_points"),
        ("Русский язык", "russian"),
        ("Обществознание", "social_science"),
        ("мат", "math"),
        ("Физика", "physics"),
        ("Химия", "chemistry"),
        ("История", "history"),
        ("Информатика и информационно–коммуникационные технологии (ИКТ)", "informatics"),
        ("Направление подготовки", "direction"),
    ]
)

# rename() returns a new frame; rebind `df` to it and preview the result.
df = df.rename(columns=columns)
df.head(n=3)

Unnamed: 0,gender,foreign_citizenship,military_service,gpa,points,bonus_points,direction,russian,social_science,math,physics,chemistry,history,informatics
0,female,Россия,no,4.667,210,10,27.03.01 Стандартизация и метрология,87,0,56,57,0,0,0
1,male,Киргизия,no,3.7,190,0,08.03.01 Строительство,72,0,60,58,0,0,0
2,male,Россия,no,5.0,214,8,08.03.01 Строительство,85,0,70,51,0,0,0


In [24]:
# Fit a binarizer on the gender labels; fit() returns the estimator itself,
# so construction and fitting are chained and the fitted object is displayed.
gender_lb = LabelBinarizer().fit(df["gender"])
gender_lb

In [25]:
# Replace the textual gender labels with the codes produced by the fitted
# binarizer, then show how many rows fall into each code.
gender_codes = gender_lb.transform(df["gender"])
df["gender"] = gender_codes
df["gender"].value_counts()

gender
1    7143
0    3503
Name: count, dtype: int64

In [26]:
# Persist the fitted gender binarizer so the same encoding can be reloaded later.
joblib.dump(value=gender_lb, filename="gender_binarizer.joblib")

['gender_binarizer.joblib']

In [27]:
# Fit the binarizer on the "yes"/"no" military-service labels and encode the
# column in a single fit_transform call, then show the resulting distribution.
military_service_lb = LabelBinarizer()
military_service_codes = military_service_lb.fit_transform(df["military_service"])
df["military_service"] = military_service_codes
df["military_service"].value_counts()

military_service
0    10566
1       80
Name: count, dtype: int64

In [28]:
# Persist the fitted military-service binarizer.
# NOTE(review): the file name is spelled "milirary" (sic). It is kept as-is
# because code that loads this artifact may depend on the exact name; rename
# both sides together if the typo is to be fixed.
joblib.dump(
    value=military_service_lb,
    filename="milirary_service_binarizer.joblib",
)

['milirary_service_binarizer.joblib']

In [29]:
# Encode the citizenship (country name) column as integer codes.
le = LabelEncoder()
le.fit(df["foreign_citizenship"])
df["foreign_citizenship"] = le.transform(df["foreign_citizenship"])

# Persist the fitted encoder alongside the other preprocessing artifacts
# (gender / military-service binarizers, scaler). Without it the
# country -> code mapping used for the stored vectors cannot be reproduced
# when a new applicant has to be encoded outside this kernel.
joblib.dump(le, "foreign_citizenship_encoder.joblib")

# NOTE(review): LabelEncoder assigns arbitrary integer codes to a nominal
# feature; these codes are later standardised and used in similarity
# comparisons, which treats the codes as ordered — consider one-hot encoding.
df["foreign_citizenship"].value_counts()

foreign_citizenship
6     9773
3      613
9      190
7       44
4       17
0        3
2        2
10       1
1        1
5        1
8        1
Name: count, dtype: int64

In [30]:
# Preview the first five rows now that all categorical columns are numeric.
df.head(n=5)

Unnamed: 0,gender,foreign_citizenship,military_service,gpa,points,bonus_points,direction,russian,social_science,math,physics,chemistry,history,informatics
0,0,6,0,4.667,210,10,27.03.01 Стандартизация и метрология,87,0,56,57,0,0,0
1,1,4,0,3.7,190,0,08.03.01 Строительство,72,0,60,58,0,0,0
2,1,6,0,5.0,214,8,08.03.01 Строительство,85,0,70,51,0,0,0
3,1,6,0,4.286,208,10,13.03.01 Теплоэнергетика и теплотехника,76,0,70,52,0,0,0
4,1,9,0,4.826,189,0,08.03.01 Строительство,68,0,60,61,0,0,0


In [31]:
# Feature frame: every column except the "direction" label.
applicant_df = df.drop(columns=["direction"])
applicant_df.head(n=3)

Unnamed: 0,gender,foreign_citizenship,military_service,gpa,points,bonus_points,russian,social_science,math,physics,chemistry,history,informatics
0,0,6,0,4.667,210,10,87,0,56,57,0,0,0
1,1,4,0,3.7,190,0,72,0,60,58,0,0,0
2,1,6,0,5.0,214,8,85,0,70,51,0,0,0


In [32]:
# Correlation of applicant 0's raw feature row with applicants 900 and 6.
reference_row = applicant_df.loc[0]
(
    reference_row.corr(applicant_df.loc[900]),
    reference_row.corr(applicant_df.loc[6]),
)

(0.9938412930669286, 0.9942428304273809)

In [33]:
# Save the encoded frame (including "direction") to disk.
# index=True is the to_csv default, spelled out here: the row index is written
# as an extra unnamed first column.
df.to_csv("Applicant.csv", index=True)

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Standardise every feature column: fit the scaler on the applicant features,
# then transform the same frame into a NumPy array.
scaler = StandardScaler()
scaler.fit(applicant_df)
scaled = scaler.transform(applicant_df)
scaled

array([[-1.42797386,  0.14507053, -0.08701411, ..., -0.11416697,
        -0.07572252, -0.25749855],
       [ 0.70029293, -2.25869727, -0.08701411, ..., -0.11416697,
        -0.07572252, -0.25749855],
       [ 0.70029293,  0.14507053, -0.08701411, ..., -0.11416697,
        -0.07572252, -0.25749855],
       ...,
       [-1.42797386,  0.14507053, -0.08701411, ..., -0.11416697,
        -0.07572252, -0.25749855],
       [ 0.70029293,  3.75072222, -0.08701411, ..., -0.11416697,
        -0.07572252, -0.25749855],
       [ 0.70029293, -3.46058117, -0.08701411, ..., -0.11416697,
        -0.07572252, -0.25749855]])

In [19]:
# Cosine similarity between the standardised vectors of applicants 0 and 6.
# Slicing with [i:i + 1] keeps each operand 2-D with a single row, as
# cosine_similarity expects.
cosine_similarity(scaled[0:1], scaled[6:7])

array([[-0.21104513]])

In [20]:
# Persist the fitted scaler so new applicants can be standardised identically.
joblib.dump(value=scaler, filename="standard_scaler.joblib")

['standard_scaler.joblib']

In [21]:
import chromadb

In [22]:
import uuid

# Store the standardised applicant vectors in the persistent Chroma collection,
# each tagged with the applicant's study direction.
#
# NOTE(review): the store path is a machine-specific absolute path; consider
# moving it to a configurable constant.
client = chromadb.PersistentClient(path=r"C:\Users\andre\TyuiuDirectionsRecSys\chroma")

# get_collection() raises if "applicants" does not exist; the collection is
# expected to have been created beforehand.
collection = client.get_collection("applicants")

directions = df["direction"].tolist()
# zip() would silently truncate on a length mismatch; make it an explicit error.
if len(directions) != len(scaled):
    raise ValueError(
        f"Row count mismatch: {len(scaled)} vectors vs {len(directions)} directions"
    )

# Send rows in batches instead of one request per applicant.
BATCH_SIZE = 1000
for start in range(0, len(directions), BATCH_SIZE):
    stop = min(start + BATCH_SIZE, len(directions))
    collection.upsert(
        # Deterministic ids derived from the row position: together with
        # upsert, re-running this cell overwrites the same records instead of
        # inserting a duplicate copy of every vector under fresh random ids.
        ids=[
            str(uuid.uuid5(uuid.NAMESPACE_OID, f"applicant-{row}"))
            for row in range(start, stop)
        ],
        embeddings=scaled[start:stop].tolist(),
        metadatas=[{"direction": direction} for direction in directions[start:stop]],
    )

: 