### Собираем информацию с ВК

In [None]:
import time
import pandas as pd
from tqdm import tqdm
from utils.vk_data_collector.vk_utils import get_vk_session, get_user_info, get_groups_items
from config import ACCESS_TOKENS, VK_API_VERSION
import random

#### Собираем информацию о пользователях

In [None]:
# Load the questionnaire export, drop the completion timestamp, rename the
# numbered columns '1'..'6' to the letters R, I, A, S, E, C and 'id' to
# 'user_id', then keep a single row per user.
df = (
    pd.read_csv('data/data_Holland.csv', sep=';')
    .drop(columns=['completion_date'])
    .rename(columns={'1':'R', '2':'I', '3':'A', '4':'S', '5':'E', '6':'C', 'id':'user_id'})
    .drop_duplicates(subset=['user_id'])
)

# One VK session, created from the first configured access token, is reused
# for all user-info requests.
vk = get_vk_session(ACCESS_TOKENS[0])

In [None]:
# Keys copied from each user-info record into same-named DataFrame columns.
user_fields = ('sex', 'is_closed', 'deactivated')

for idx, record in tqdm(df.iterrows(), total=len(df)):
    user_id = int(record['user_id'])
    try:
        info = get_user_info(user_id, vk)
        for field in user_fields:
            df.at[idx, field] = info[field]
        # Short pause between successful requests.
        time.sleep(0.1)
    except Exception as err:
        # Best effort: report the failure and continue with the next user;
        # the columns for this row are simply left unfilled.
        print(f"Error processing user {user_id}: {err}")

# Persist the enriched table for the group-collection step.
df.to_csv('data/RIASEC_NEW.csv', index=False, encoding='utf-8')

  0%|          | 10/3265 [00:03<19:35,  2.77it/s]


#### Собираем информацию о группах

In [None]:
# Reload the user table and keep only rows whose is_closed flag equals False
# (rows where the flag is missing do not compare equal and are dropped too).
df = pd.read_csv('data/RIASEC_NEW.csv')
df = df[df['is_closed'] == False]
df_with_groups = pd.DataFrame(columns=['user_id', 'count', 'items'])

# Only the first 10 remaining users are processed here. The subset is named
# so that the progress-bar total matches what is actually iterated
# (previously total=len(df) made the bar stop at 10 of the full length).
users_to_fetch = df[:10]

for i, row in tqdm(users_to_fetch.iterrows(), total=len(users_to_fetch)):
    user_id = int(row['user_id'])
    try:
        # A token is picked at random from the configured list for each request.
        items = get_groups_items(user_id, random.choice(ACCESS_TOKENS))
        df_with_groups.at[i, 'user_id'] = user_id
        df_with_groups.at[i, 'count'] = len(items)
        df_with_groups.at[i, 'items'] = items
        time.sleep(0.5)
    except Exception as e:
        # Best effort: report the failure and continue with the next user.
        print(f"Error fetching groups for user {user_id}: {e}")

df_with_groups.to_csv('data/groups_new.csv', index=False)

  0%|          | 10/2017 [00:07<26:42,  1.25it/s]


### Очищаем данные

In [None]:
import pandas as pd
from utils.vk_data_cleaner.group_processing import process_group_data

In [2]:
# Load the per-user group lists saved by the collection step
# (comma-separated, which is the read_csv default).
df = pd.read_csv('data/groups_new.csv')

Удаляем закрытые аккаунты и пользователей, у которых меньше 10 групп.

In [3]:
# Keep only users with a known group count of at least 10 and renumber the
# rows from 0. Non-inplace calls are used so that no operation mutates a
# filtered slice of the original frame (which can raise
# SettingWithCopyWarning); the resulting rows are the same as before.
df = df.dropna(subset=['count'])
df = df[df['count'] >= 10].reset_index(drop=True)
# Last expression of the cell: shows the number of remaining users.
len(df)

1822

In [None]:
# Run the project helper process_group_data (imported in this section from
# utils.vk_data_cleaner.group_processing) on the filtered frame.
# NOTE(review): the exact cleaning rules live in that helper and are not visible here.
cleaned_df = process_group_data(df)

100%|██████████| 10/10 [00:00<00:00, 37.83it/s]


In [None]:
# Save the cleaned frame (without the index column) for the next stage.
cleaned_df.to_csv('data/groups_cleaned_less_10_groups.csv', index=False)

### Укрупнение activity

In [21]:
import pandas as pd
from utils.activity_mapper.activity_utils import load_topics
from utils.vk_data_cleaner.consolidation_activities import consolidation_group_data

In [15]:
# Load the cleaned groups written by the previous section.
df = pd.read_csv('data/groups_cleaned_less_10_groups.csv')
# Load the topics from the Excel workbook via the project helper load_topics.
# NOTE(review): the structure of the returned object is defined by that helper — confirm.
combined_tems = load_topics('data/Topics.xlsx')

In [None]:
# Pass the frame and the loaded topics to consolidation_group_data; the
# returned frame replaces df. The cells below read an 'enlarged_activity'
# key from each group in df['items'], presumably added by this call — confirm.
df = consolidation_group_data(df, combined_tems)

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:02<00:00,  5.28it/s]


In [18]:
# Collect the distinct 'enlarged_activity' values over every group of every
# user. Iterating the column values directly (instead of df.at[i, ...] over
# range(len(df))) does not depend on the frame having a default 0..n-1 index.
# Assumes each 'items' cell is an iterable of dict-like groups — TODO confirm.
activities = {
    group['enlarged_activity']
    for groups in df['items']
    for group in groups
}

In [20]:
# Report how many distinct enlarged_activity values were collected.
print(f'Уникальных enlarged_activity: {len(activities)}')

Уникальных enlarged_activity: 42


In [None]:
# Save the frame with enlarged activities (without the index column).
df.to_csv('data/groups_cleaned_with_enlarged_activity_less_10_groups.csv', index=False)

### Препроцессинг

In [1]:
import pandas as pd
import ast
from tqdm import tqdm
from collections import Counter
from utils.preprocessing.group_processing import process_group_data, combine_descriptions

In [2]:
# Reload the frame saved by the activity-enlargement section. The 'items'
# column is parsed with ast.literal_eval in the next cell.
df = pd.read_csv('data/groups_cleaned_with_enlarged_activity_less_10_groups.csv')

In [3]:
# Flat list (duplicates kept) of the 'enlarged_activity' of every group of
# every user; each 'items' cell is parsed from its textual form first.
activities = []
for raw_items in tqdm(df['items']):
    parsed_groups = ast.literal_eval(raw_items)
    activities.extend(entry['enlarged_activity'] for entry in parsed_groups)

100%|██████████| 1822/1822 [00:31<00:00, 57.76it/s]


In [4]:
# Show the ten most frequent enlarged_activity values with their counts.
Counter(activities).most_common(10)

[('Развлечения', 64573),
 ('Увлечения и хобби', 50471),
 ('Красота, здоровье', 30634),
 ('Персона', 21100),
 ('Образование', 19057),
 ('Города, страны', 16275),
 ('Объединения, группы людей', 12104),
 ('Рестораны', 12049),
 ('Отношения, семья', 10847),
 ('Компьютер, интернет', 10314)]

In [None]:
# Apply the two preprocessing helpers in sequence. process_group_data here is
# the one imported in this section from utils.preprocessing.group_processing,
# which shadows the same-named helper imported earlier from vk_data_cleaner.
df = combine_descriptions(process_group_data(df))

100%|██████████| 10/10 [00:05<00:00,  1.84it/s]


In [None]:
# Save the preprocessed frame (without the index column); the embedding
# generators below take this file as their input path.
df.to_csv('data/groups_for_llm_with_enlarged_activity_less_10_groups.csv', index=False)

### Получение эмбеддингов

In [61]:
from utils.embeddings.generate.generate_rubert import generate_rubert_embeddings
from utils.embeddings.generate.generate_gigachat import generate_gigachat_embeddings
from utils.embeddings.generate.generate_yandex import generate_yandex_embeddings

In [None]:
# All three generators take the same input CSV and write to their own output CSV.
llm_input_csv = "data/groups_for_llm_with_enlarged_activity_less_10_groups.csv"

embedding_jobs = (
    (generate_rubert_embeddings, "data/embeddings/user_id_embeddings_bert.csv"),
    (generate_gigachat_embeddings, "data/embeddings/user_id_embeddings_sber.csv"),
    (generate_yandex_embeddings, "data/embeddings/user_id_embeddings_yandex.csv"),
)

# Run the generators one after another, in the order listed above.
for generate_embeddings, embeddings_csv in embedding_jobs:
    generate_embeddings(llm_input_csv, embeddings_csv)

  0%|          | 9/1822 [00:43<2:48:33,  5.58s/it]

### Уменьшение размерности

In [4]:
from utils.dimensionality.reduce_pca import reduce_embeddings_pca
from utils.dimensionality.reduce_autoencoder import reduce_embeddings_autoencoder

In [None]:

# Both reducers read the same embeddings folder and write to their own folder.
embeddings_dir = "data/embeddings/"

reduction_jobs = (
    (reduce_embeddings_pca, "data/output/embeddings_pca/"),
    (reduce_embeddings_autoencoder, "data/output/embeddings_autoencoded/"),
)

# PCA runs first, then the autoencoder.
for reduce_embeddings, reduced_dir in reduction_jobs:
    reduce_embeddings(folder_path=embeddings_dir, output_path=reduced_dir)


### Предсказание 

In [None]:
import os
from utils.models.hybrid_nn import train_and_predict as hybrid_nn_predict
from utils.models.catboost_model import train_and_predict as catboost_predict
from utils.models.xgboost_model import train_and_predict as xgboost_predict
from utils.models.random_forest_model import train_and_predict as rf_predict
from utils.models.utils import archive_output

In [None]:
# Folders holding the reduced embeddings. They must equal the output_path
# values the dimensionality-reduction step passes to reduce_embeddings_pca and
# reduce_embeddings_autoencoder; the former "data/output/pca_emb/" and
# "data/output/autoencoder_emb/" are not written by any step of this notebook.
folder_paths = ["data/output/embeddings_pca/", "data/output/embeddings_autoencoded/"]
# Where predictions are written, one folder per reduction method (paired
# positionally with folder_paths).
output_dirs = ["output/pca_emb/", "output/autoencoder_emb/"]

# Models are run in this order for every embeddings file.
predictors = (hybrid_nn_predict, catboost_predict, xgboost_predict, rf_predict)

for folder_path, output_dir in zip(folder_paths, output_dirs):
    os.makedirs(output_dir, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

    for file in files:
        file_path = os.path.join(folder_path, file)
        output_path = os.path.join(output_dir, file)

        print(f"\nProcessing {file_path}")

        # NOTE(review): every model receives the same output_path; presumably
        # each train_and_predict derives its own file name from it — confirm.
        for predict in predictors:
            predict(file_path, output_path)

# Archive each output folder; the archive name is the folder path with '/'
# replaced by '_' and the trailing '_' removed (e.g. "output_pca_emb").
for output_dir in output_dirs:
    archive_name = output_dir.replace("/", "_").rstrip("_")
    archive_output(output_dir, archive_name)

### Сравнение 

In [1]:
from utils.compare.evaluate_results import evaluate_models
from utils.compare.visualize_results import plot_rmse_comparison, plot_matching_comparison

In [None]:
def _collect_metric(results, metric_name):
    """Gather one metric from the evaluation results.

    `results` is traversed as model -> embedding method -> file key -> metrics.
    Returns {model: {file_key: [value, ...]}}, where each list holds the
    metric's value once per embedding method that contains that file key,
    in iteration order.
    """
    summary = {}
    for model, emb_methods in results.items():
        per_file = summary[model] = {}
        for files in emb_methods.values():
            for file_key, metrics in files.items():
                per_file.setdefault(file_key, []).append(metrics[metric_name])
    return summary


results = evaluate_models()

# Same grouping for both metrics, then one comparison plot for each.
rmse_summary = _collect_metric(results, 'RMSE_total')
number_of_matching_summary = _collect_metric(results, 'number_of_matching_types')

plot_rmse_comparison(rmse_summary)
plot_matching_comparison(number_of_matching_summary)