In [25]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path

In [26]:
# CSV files that make up the dataset, listed in the order they are loaded.
# Built from the bare table names so the shared ".csv" suffix is written once.
files = [
    f"{stem}.csv"
    for stem in (
        "train_labels",
        "train",
        "test",
        "test_users",
        "referer_vectors",
        "geo_info",
    )
]


In [27]:
import os

# Directory that contains the CSV files to load.
# Written as a raw string: a plain literal here contains the unrecognised
# escape sequences \J, \M, \P and \d, which Python only tolerates with a
# warning. The resulting path value is unchanged.
upload_dir = r"C:\Job\MLE\Predicting-the-gender-VK\data"

In [28]:
# Read every file named in `files` from `upload_dir` into `dataframes`,
# keyed by the file name without its extension (e.g. "train.csv" -> "train").
# Loading is best-effort: a file that cannot be read is reported and skipped,
# so its key is simply absent from `dataframes`.
dataframes = {}

for file_name in files:
    file_path = os.path.join(upload_dir, file_name)
    # Drop only the trailing extension; str.replace(".csv", "") would also
    # remove ".csv" if it appeared elsewhere in the name.
    table_name = os.path.splitext(file_name)[0]
    try:
        df = pd.read_csv(file_path, sep=';')
    except FileNotFoundError:
        print(f"Ошибка: Файл {file_name} не найден по пути {file_path}")
    except Exception as e:
        # Any other read/parse failure: report it and continue with the next file.
        print(f"Ошибка при загрузке файла {file_name}: {e}")
    else:
        # Only reached when read_csv succeeded.
        dataframes[table_name] = df
        print(f"Файл {file_name} загружен. Размер: {df.shape}")

Файл train_labels.csv загружен. Размер: (500000, 2)
Файл train.csv загружен. Размер: (750000, 5)
Файл test.csv загружен. Размер: (150000, 5)
Файл test_users.csv загружен. Размер: (85000, 1)
Файл referer_vectors.csv загружен. Размер: (200000, 11)
Файл geo_info.csv загружен. Размер: (5533, 4)


In [29]:
# `plt` must be the pyplot module; `import matplotlib as plt` would bind the
# top-level package, which does not expose the pyplot plotting functions.
import matplotlib.pyplot as plt
# train_labels: preview, schema and class balance of the training labels.
print("\n[train_labels.csv] Метки для обучения")
df_labels = dataframes.get("train_labels")
# .get() yields None when the file was not loaded; skip the summary in that case.
if df_labels is not None:
    print(df_labels.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_labels.info()
    print("Распределение целевой переменной (target):")
    # normalize=True reports class shares instead of raw counts.
    print(df_labels['target'].value_counts(normalize=True))



[train_labels.csv] Метки для обучения
                            user_id  target
0  fb858e8e0a2bec074450eaf94b627fd3       0
1  46a5f128fd569c764a92c2eaa788095e       0
2  5a74e9ac53ffb21a20cce117c0ad77ba       0
3  af735816ca19115431ae3d89518c8c91       0
4  364f0ae0a3f29a685c4fb5bae6033b9a       0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  500000 non-null  object
 1   target   500000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.6+ MB
None
Распределение целевой переменной (target):
target
0    0.52304
1    0.47696
Name: proportion, dtype: float64


In [30]:
# train: preview, schema and per-column cardinality of the training data.
print("\n[train.csv] Данные для обучения")
df_train = dataframes.get("train")
# .get() yields None when the file was not loaded; skip the summary in that case.
if df_train is not None:
    print(df_train.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_train.info()
    print("Количество уникальных значений в столбцах:")
    print(df_train.nunique())


[train.csv] Данные для обучения
   request_ts                           user_id                   referer  \
0  1701011363  fb858e8e0a2bec074450eaf94b627fd3          https://9b48ee5/   
1  1700986581  46a5f128fd569c764a92c2eaa788095e          https://9b48ee5/   
2  1701011071  5a74e9ac53ffb21a20cce117c0ad77ba  https://9634fd0/1409e548   
3  1700992803  af735816ca19115431ae3d89518c8c91          https://9b48ee5/   
4  1701021666  364f0ae0a3f29a685c4fb5bae6033b9a          https://9b48ee5/   

   geo_id                                         user_agent  
0    4799  {'browser': 'Chrome Mobile', 'browser_version'...  
1    8257  {'browser': 'Chrome Mobile', 'browser_version'...  
2    3150  {'browser': 'Yandex Browser', 'browser_version...  
3    2740  {'browser': 'Chrome Mobile', 'browser_version'...  
4    4863  {'browser': 'Yandex Browser', 'browser_version...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 5 columns):
 #   Column     

In [31]:
# test: preview, schema and per-column cardinality of the test data.
print("\n[test.csv] Данные для тестирования")
df_test = dataframes.get("test")
# .get() yields None when the file was not loaded; skip the summary in that case.
if df_test is not None:
    print(df_test.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_test.info()
    print("Количество уникальных значений в столбцах:")
    print(df_test.nunique())


[test.csv] Данные для тестирования
   request_ts                           user_id                   referer  \
0  1700993094  c2802dadd33d8ae09bb366bdd41212ea          https://9b48ee5/   
1  1701005579  e5b1988db74527ec092f28b0bbfdaac9          https://9b48ee5/   
2  1700969752  6ef1eedbdb72554e53e69782066065c5  https://72879b4/12411b9e   
3  1700991608  7e057293ecae62985a327b7af51858ea          https://9b48ee5/   
4  1701019815  a27bd7ce8828497823fa8d5d05e7bbf7          https://9b48ee5/   

   geo_id                                         user_agent  
0    8816  {'browser': 'Chrome Mobile', 'browser_version'...  
1    3663  {'browser': 'Chrome', 'browser_version': '116....  
2    2336  {'browser': 'Chrome', 'browser_version': '114....  
3    9652  {'browser': 'Chrome Mobile', 'browser_version'...  
4    3871  {'browser': 'Chrome Mobile', 'browser_version'...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 5 columns):
 #   Column  

In [32]:
# referer_vectors: preview, schema and number of distinct referer values.
print("\n[referer_vectors.csv] Векторы URL-адресов")
df_vectors = dataframes.get("referer_vectors")
# .get() yields None when the file was not loaded; skip the summary in that case.
if df_vectors is not None:
    print(df_vectors.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_vectors.info()
    print(f"Количество уникальных referer: {df_vectors['referer'].nunique()}")


[referer_vectors.csv] Векторы URL-адресов
   component0  component1  component2  component3  component4  component5  \
0       16708       -3741       11395       -1597       -3212        6269   
1       11731        4045       22213       -1184       -8992        9381   
2       10551        2947       12282        -470       16222        4472   
3       12816       20498      -10110        7731        -569       12035   
4        3710       11096       11333       14673        8030        1852   

   component6  component7  component8  component9                   referer  
0        5610      -15351       13779       14102  https://a6899a4/15652e67  
1       -3496       -3120        -899       16817          https://9b48ee5/  
2       -3316        9606        4197       18948  https://7a4c700/161af7e3  
3        3014        6398       11439        -271  https://9653126/159bc361  
4       10554       11625        4306       13210  https://72879b4/125c29e6  
<class 'pandas.core.frame.

In [33]:
# geo_info: preview, schema and per-column cardinality of the geo lookup table.
print("\n[geo_info.csv] Географическая информация")
df_geo = dataframes.get("geo_info")
# .get() yields None when the file was not loaded; skip the summary in that case.
if df_geo is not None:
    print(df_geo.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_geo.info()
    print("Количество уникальных значений в столбцах:")
    print(df_geo.nunique())


[geo_info.csv] Географическая информация
   geo_id country_id region_id timezone_id
0    6447     c31b4e    470e75      f6155e
1    8730     a0a6e9       NaN      d816ca
2    7769     e878d4       NaN      ec4385
3    7330     c31b4e    23f9c2      f6155e
4     600     c31b4e    6dbc37      e56e80
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5533 entries, 0 to 5532
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   geo_id       5533 non-null   int64 
 1   country_id   5533 non-null   object
 2   region_id    3826 non-null   object
 3   timezone_id  5533 non-null   object
dtypes: int64(1), object(3)
memory usage: 173.0+ KB
None
Количество уникальных значений в столбцах:
geo_id         5533
country_id      203
region_id       277
timezone_id     314
dtype: int64


In [34]:
# test_users: preview, schema and row count of the users to predict for.
print("\n[test_users.csv] Пользователи для предсказания")
df_test_users = dataframes.get("test_users")
# .get() yields None when the file was not loaded; skip the summary in that case.
if df_test_users is not None:
    print(df_test_users.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line).
    df_test_users.info()
    print(f"Количество пользователей для предсказания: {df_test_users.shape[0]}")


[test_users.csv] Пользователи для предсказания
                            user_id
0  c2802dadd33d8ae09bb366bdd41212ea
1  e5b1988db74527ec092f28b0bbfdaac9
2  6ef1eedbdb72554e53e69782066065c5
3  7e057293ecae62985a327b7af51858ea
4  a27bd7ce8828497823fa8d5d05e7bbf7
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85000 entries, 0 to 84999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  85000 non-null  object
dtypes: object(1)
memory usage: 664.2+ KB
None
Количество пользователей для предсказания: 85000


In [35]:
# Display: render each loaded table in turn, in the same order as before.
for frame in (df_labels, df_train, df_geo, df_test, df_test_users, df_vectors):
    display(frame)

Unnamed: 0,user_id,target
0,fb858e8e0a2bec074450eaf94b627fd3,0
1,46a5f128fd569c764a92c2eaa788095e,0
2,5a74e9ac53ffb21a20cce117c0ad77ba,0
3,af735816ca19115431ae3d89518c8c91,0
4,364f0ae0a3f29a685c4fb5bae6033b9a,0
...,...,...
499995,11516096d9f97463424523e6154d5255,1
499996,d078ecb7c7e9eaf65189fb3d1bed7871,0
499997,3543d64627ead3a519e3199834e2a148,0
499998,37df5ff1d739f61d442b164db6281e46,0


Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'..."
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'..."
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version..."
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'..."
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version..."
...,...,...,...,...,...
749995,1701009372,74a5eebf811cd93a77cd5fd8efe24724,https://8807153/,4593,"{'browser': 'Opera', 'browser_version': '104.0..."
749996,1701000377,e892fb57ef63a2a3262f879e0943d0c4,https://6a81948/14217aec,4235,"{'browser': 'Yandex Browser', 'browser_version..."
749997,1701015280,71931d84e7fc468bcb9f1b6f94ef14e1,https://bc69dee/15f0df79,5836,"{'browser': 'Chrome Mobile', 'browser_version'..."
749998,1701018106,27546a0fe0cf63b2759ec1879b46a8e6,https://72879b4/131bd916,1198,"{'browser': 'Chrome', 'browser_version': '118...."


Unnamed: 0,geo_id,country_id,region_id,timezone_id
0,6447,c31b4e,470e75,f6155e
1,8730,a0a6e9,,d816ca
2,7769,e878d4,,ec4385
3,7330,c31b4e,23f9c2,f6155e
4,600,c31b4e,6dbc37,e56e80
...,...,...,...,...
5528,4673,c31b4e,71cac2,e56e80
5529,8464,eba88b,7eb431,cde24c
5530,6960,eba88b,6e7b31,b7096b
5531,871,c31b4e,895e40,109a080


Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,"{'browser': 'Chrome Mobile', 'browser_version'..."
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,"{'browser': 'Chrome', 'browser_version': '116...."
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,"{'browser': 'Chrome', 'browser_version': '114...."
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,"{'browser': 'Chrome Mobile', 'browser_version'..."
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,"{'browser': 'Chrome Mobile', 'browser_version'..."
...,...,...,...,...,...
149995,1700968705,3c86b41e45cc2b7029d969e4e045fc8d,https://8624dd2/,4106,"{'browser': 'Chrome', 'browser_version': '109...."
149996,1700989358,8daa3e87a079f857f559062fbf2c02e1,https://72879b4/13cb1eab,4863,"{'browser': 'Yandex Browser', 'browser_version..."
149997,1701025826,8bd30a0a06100a0c782046ec151fcf03,https://6a81948/,1724,"{'browser': 'YandexSearch', 'browser_version':..."
149998,1700979975,0bba77ee4a9a5f03c7dac143b77a7a0d,https://650870a/142422ab,6436,"{'browser': 'Chrome', 'browser_version': '87.0..."


Unnamed: 0,user_id
0,c2802dadd33d8ae09bb366bdd41212ea
1,e5b1988db74527ec092f28b0bbfdaac9
2,6ef1eedbdb72554e53e69782066065c5
3,7e057293ecae62985a327b7af51858ea
4,a27bd7ce8828497823fa8d5d05e7bbf7
...,...
84995,7f18ead960fd4762767a40e58c0f2237
84996,336f6e34fdaa6726c4881fe4f9576bce
84997,e8e2fe7f5a37fd89df87062da82aa891
84998,676e7b7340fbed94ee733109d09e4688


Unnamed: 0,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,referer
0,16708,-3741,11395,-1597,-3212,6269,5610,-15351,13779,14102,https://a6899a4/15652e67
1,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,https://9b48ee5/
2,10551,2947,12282,-470,16222,4472,-3316,9606,4197,18948,https://7a4c700/161af7e3
3,12816,20498,-10110,7731,-569,12035,3014,6398,11439,-271,https://9653126/159bc361
4,3710,11096,11333,14673,8030,1852,10554,11625,4306,13210,https://72879b4/125c29e6
...,...,...,...,...,...,...,...,...,...,...,...
199995,10449,-7968,10330,4252,8831,4058,-3683,7552,2472,24698,https://8699526/13cc03f9
199996,6861,-2246,8407,9086,10454,10448,-3699,9946,11660,18680,https://6a81948/12b326c4
199997,5746,-3774,22352,2209,6019,2715,-3414,-1253,6770,20793,https://991f545/1315010f
199998,8825,12584,-2406,11134,5405,3762,16805,8156,10331,11653,https://72879b4/16e9503b
