In [1]:
import copy

import pandas
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
filename = "survey.csv"
df = pandas.read_csv(filename, sep=";")

In [3]:
__target_columns: list[str] = [
    "gender",
    "age",
    "heatlfy",
    "smoke",
    "stress",
    "sleep_well",
    "chronus",
    "wake_up",
    "sleep_time",
    "coffe_near",
    "gourmet",
    "office",
    "home_seater",
    "ill",
]

In [7]:
target_df = df[__target_columns]

In [8]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# Prepare enums to number
target_df.loc[:, 'gender'] = target_df['gender'].replace({'Женщина': 0, 'Мужчина': 1})
target_df.loc[:, 'chronus'] = target_df['chronus'].replace({'Сова': 0, 'Жаворонок': 1})
target_df.loc[:, 'wake_up'] = target_df['wake_up'].apply(lambda x: int(x.replace(":", "")))

binary_columns = [
    "smoke", "sleep_well", "coffe_near", "gourmet", "office", "home_seater", "ill"
]
for c in binary_columns:
    target_df.loc[:, c] = target_df[c].replace({'Да': 1, 'Нет': 0})
target_df.loc[:, 'chronus'] = target_df['chronus'].replace({'Сова': 0, 'Жаворонок': 1})
filtered_df = target_df

In [9]:
filtered_df = filtered_df.astype(float)
filtered_df

Unnamed: 0,gender,age,heatlfy,smoke,stress,sleep_well,chronus,wake_up,sleep_time,coffe_near,gourmet,office,home_seater,ill
0,0.0,22.0,65.0,1.0,55.0,1.0,1.0,800.0,8.0,1.0,1.0,1.0,0.0,0.0
1,1.0,21.0,85.0,0.0,70.0,1.0,1.0,600.0,8.0,1.0,0.0,0.0,0.0,0.0
2,1.0,22.0,50.0,1.0,70.0,1.0,0.0,900.0,7.0,1.0,1.0,0.0,1.0,0.0
3,1.0,22.0,80.0,1.0,60.0,1.0,0.0,900.0,6.0,1.0,1.0,0.0,0.0,1.0
4,0.0,23.0,50.0,0.0,90.0,0.0,0.0,1400.0,6.0,0.0,0.0,0.0,1.0,1.0
5,1.0,21.0,99.0,0.0,1.0,1.0,1.0,630.0,8.0,0.0,1.0,0.0,0.0,0.0
6,1.0,22.0,50.0,0.0,70.0,1.0,0.0,1000.0,6.0,0.0,0.0,0.0,0.0,0.0
7,1.0,22.0,80.0,0.0,45.0,1.0,0.0,700.0,7.0,1.0,0.0,1.0,0.0,1.0
8,1.0,22.0,50.0,0.0,70.0,0.0,0.0,0.0,8.0,1.0,1.0,0.0,1.0,0.0
9,1.0,21.0,70.0,1.0,100.0,0.0,0.0,30.0,6.0,0.0,0.0,0.0,1.0,0.0


In [10]:
filtered_df.all

<bound method DataFrame.all of     gender   age  heatlfy  smoke  stress  sleep_well  chronus  wake_up  \
0      0.0  22.0     65.0    1.0    55.0         1.0      1.0    800.0   
1      1.0  21.0     85.0    0.0    70.0         1.0      1.0    600.0   
2      1.0  22.0     50.0    1.0    70.0         1.0      0.0    900.0   
3      1.0  22.0     80.0    1.0    60.0         1.0      0.0    900.0   
4      0.0  23.0     50.0    0.0    90.0         0.0      0.0   1400.0   
5      1.0  21.0     99.0    0.0     1.0         1.0      1.0    630.0   
6      1.0  22.0     50.0    0.0    70.0         1.0      0.0   1000.0   
7      1.0  22.0     80.0    0.0    45.0         1.0      0.0    700.0   
8      1.0  22.0     50.0    0.0    70.0         0.0      0.0      0.0   
9      1.0  21.0     70.0    1.0   100.0         0.0      0.0     30.0   
10     1.0  21.0     99.0    0.0    70.0         1.0      0.0    930.0   
11     1.0  22.0     80.0    1.0    10.0         1.0      1.0    600.0   
12     

In [11]:
scaler = preprocessing.Normalizer()
scaled_x = scaler.fit_transform(filtered_df.values)
scaled_df = pandas.DataFrame(scaled_x)
scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.027334,0.080759,0.001242,0.068335,0.001242,0.001242,0.993958,0.00994,0.001242,0.001242,0.001242,0.0,0.0
1,0.001638,0.034402,0.139244,0.0,0.114672,0.001638,0.001638,0.982902,0.013105,0.001638,0.0,0.0,0.0,0.0
2,0.001106,0.024326,0.055285,0.001106,0.077399,0.001106,0.0,0.995135,0.00774,0.001106,0.001106,0.0,0.001106,0.0
3,0.001104,0.024287,0.088317,0.001104,0.066238,0.001104,0.0,0.993565,0.006624,0.001104,0.001104,0.0,0.0,0.001104
4,0.0,0.016382,0.035613,0.0,0.064103,0.0,0.0,0.997163,0.004274,0.0,0.0,0.0,0.000712,0.000712
5,0.001567,0.032909,0.155141,0.0,0.001567,0.001567,0.001567,0.987258,0.012537,0.0,0.001567,0.0,0.0,0.0
6,0.000996,0.021913,0.049803,0.0,0.069724,0.000996,0.0,0.996062,0.005976,0.0,0.0,0.0,0.0,0.0
7,0.001416,0.031145,0.113255,0.0,0.063706,0.001416,0.0,0.990978,0.00991,0.001416,0.0,0.001416,0.0,0.001416
8,0.011214,0.246709,0.560702,0.0,0.784982,0.0,0.0,0.0,0.089712,0.011214,0.011214,0.0,0.011214,0.0
9,0.007837,0.164586,0.548619,0.007837,0.783741,0.0,0.0,0.235122,0.047024,0.0,0.0,0.0,0.007837,0.0


In [12]:
user_vector = [0.001638,0.034402,0.139244,0.0,0.114672,0.001638,0.001638,0.982902,0.013105,0.001638,0.0,0.0,0.0,0.0]

In [13]:
diffs = cosine_similarity(scaled_x, [user_vector])

In [14]:
result_df = pandas.DataFrame(diffs, columns=['value'])

In [15]:
result_df

Unnamed: 0,value
0,0.997121
1,1.0
2,0.995637
3,0.997398
4,0.993043
5,0.993464
6,0.994797
7,0.998317
8,0.177789
9,0.403658


In [16]:
joined_result_df = copy.copy(df[["drink"]])
joined_result_df[['similarity_rate']] = result_df

In [17]:
sorted_list = joined_result_df.sort_values(by="similarity_rate", ascending=False)
sorted_list

Unnamed: 0,drink,similarity_rate
1,Чай,1.0
23,Чай,0.999895
17,Кофе,0.999699
24,Кофе,0.999024
16,Кофе,0.99886
10,Кофе,0.998509
7,Чай,0.998317
21,Чай,0.998312
20,Чай,0.997922
26,Чай,0.997717


In [18]:
head_neighbours = sorted_list.head(5)
head_neighbours

Unnamed: 0,drink,similarity_rate
1,Чай,1.0
23,Чай,0.999895
17,Кофе,0.999699
24,Кофе,0.999024
16,Кофе,0.99886


In [20]:
target_values = head_neighbours['drink'].values
target_values

array(['Чай', 'Чай', 'Кофе', 'Кофе', 'Кофе'], dtype=object)

In [21]:
m = {}
for val in target_values:
    m[val] = m.get(val, 0) + 1

m

{'Чай': 2, 'Кофе': 3}

In [22]:
predicted_answer = max(m, key=m.get)
predicted_answer

'Кофе'

In [23]:
df

Unnamed: 0,num,gender,drink,age,heatlfy,smoke,eye_color,stress,sleep_well,chronus,wake_up,sleep_time,coffe_near,gourmet,office,home_seater,ill,hand,zodiak
0,1,Женщина,Кофе,22,65,Да,Серо-зеленый,55,Да,Жаворонок,8:00,8.0,Да,Да,Да,Нет,Нет,Правой,Козерог
1,2,Мужчина,Чай,21,85,Нет,Зеленый,70,Да,Жаворонок,6:00,8.0,Да,Нет,Нет,Нет,Нет,Левой,Стрелец
2,3,Мужчина,Кофе,22,50,Да,Голубой,70,Да,Сова,9:00,7.0,Да,Да,Нет,Да,Нет,Правой,Рак
3,4,Мужчина,Кофе,22,80,Да,Карий,60,Да,Сова,9:00,6.0,Да,Да,Нет,Нет,Да,Правой,Скорпион
4,5,Женщина,Кофе,23,50,Нет,Голубой,90,Нет,Сова,14:00,6.0,Нет,Нет,Нет,Да,Да,Левой,Скорпион
5,6,Мужчина,Чай,21,99,Нет,Зелёный,1,Да,Жаворонок,6:30,8.0,Нет,Да,Нет,Нет,Нет,Правой,Овен
6,7,Мужчина,Чай,22,50,Нет,Голубой,70,Да,Сова,10:00,6.0,Нет,Нет,Нет,Нет,Нет,Правой,Рыбы
7,8,Мужчина,Чай,22,80,Нет,Голубой,45,Да,Сова,7:00,7.0,Да,Нет,Да,Нет,Да,Правой,Водолей
8,9,Мужчина,Кофе,22,50,Нет,Коричневый,70,Нет,Сова,0:00,8.0,Да,Да,Нет,Да,Нет,Левой,Дева
9,10,Мужчина,Чай,21,70,Да,Зеленый,100,Нет,Сова,0:30,6.0,Нет,Нет,Нет,Да,Нет,Правой,Водолей


In [24]:
tf = copy.copy(df)
tf

Unnamed: 0,num,gender,drink,age,heatlfy,smoke,eye_color,stress,sleep_well,chronus,wake_up,sleep_time,coffe_near,gourmet,office,home_seater,ill,hand,zodiak
0,1,Женщина,Кофе,22,65,Да,Серо-зеленый,55,Да,Жаворонок,8:00,8.0,Да,Да,Да,Нет,Нет,Правой,Козерог
1,2,Мужчина,Чай,21,85,Нет,Зеленый,70,Да,Жаворонок,6:00,8.0,Да,Нет,Нет,Нет,Нет,Левой,Стрелец
2,3,Мужчина,Кофе,22,50,Да,Голубой,70,Да,Сова,9:00,7.0,Да,Да,Нет,Да,Нет,Правой,Рак
3,4,Мужчина,Кофе,22,80,Да,Карий,60,Да,Сова,9:00,6.0,Да,Да,Нет,Нет,Да,Правой,Скорпион
4,5,Женщина,Кофе,23,50,Нет,Голубой,90,Нет,Сова,14:00,6.0,Нет,Нет,Нет,Да,Да,Левой,Скорпион
5,6,Мужчина,Чай,21,99,Нет,Зелёный,1,Да,Жаворонок,6:30,8.0,Нет,Да,Нет,Нет,Нет,Правой,Овен
6,7,Мужчина,Чай,22,50,Нет,Голубой,70,Да,Сова,10:00,6.0,Нет,Нет,Нет,Нет,Нет,Правой,Рыбы
7,8,Мужчина,Чай,22,80,Нет,Голубой,45,Да,Сова,7:00,7.0,Да,Нет,Да,Нет,Да,Правой,Водолей
8,9,Мужчина,Кофе,22,50,Нет,Коричневый,70,Нет,Сова,0:00,8.0,Да,Да,Нет,Да,Нет,Левой,Дева
9,10,Мужчина,Чай,21,70,Да,Зеленый,100,Нет,Сова,0:30,6.0,Нет,Нет,Нет,Да,Нет,Правой,Водолей


In [25]:
joined_result_df = pandas.DataFrame(
    {
        "drink": df["drink"].values.tolist(),
        "similarity_rate": diffs.tolist()
    }
)
joined_result_df

Unnamed: 0,drink,similarity_rate
0,Кофе,[0.997121182440482]
1,Чай,[0.9999999999996797]
2,Кофе,[0.995637260916448]
3,Кофе,[0.9973976676372884]
4,Кофе,[0.9930427900973475]
5,Чай,[0.9934639478951232]
6,Чай,[0.9947969373050491]
7,Чай,[0.9983170577961934]
8,Кофе,[0.17778943817905465]
9,Чай,[0.40365839719928215]


In [26]:
head_neighbours.value_counts()

drink  similarity_rate
Кофе   0.998860           1
       0.999024           1
       0.999699           1
Чай    0.999895           1
       1.000000           1
Name: count, dtype: int64

In [27]:
head_neighbours

Unnamed: 0,drink,similarity_rate
1,Чай,1.0
23,Чай,0.999895
17,Кофе,0.999699
24,Кофе,0.999024
16,Кофе,0.99886


In [28]:
from collections import Counter

In [29]:
c = Counter(target_values)


In [30]:
dict(c)

{'Чай': 2, 'Кофе': 3}