# Загрузка данных

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
        
import warnings
warnings.filterwarnings('ignore')

/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/all_events.csv
/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/train_events.csv
/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/train_targets.csv
/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/video_info_v2.csv
/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/session_stats.csv


In [2]:
# Define data types for memory optimization
dtype_events = {
    'region': 'category',
    'ua_device_type': 'category',
    'ua_client_type': 'category',
    'ua_os': 'category',
    'ua_client_name': 'category',
    'total_watchtime': 'int32',
    'rutube_video_id': 'category',
    'viewer_uid': 'int32'
}

dtype_video_info = {
    'rutube_video_id': 'category',
    'title': 'object',
    'category': 'category',
    'duration': 'int32',
    'author_id': 'int32'
}

dtype_targets = {
    'viewer_uid': 'int32',
    'gender': 'category',
    'age': 'int8'
}

# Load train_events.csv
train_events = pd.read_csv('/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/train_events.csv', dtype=dtype_events, parse_dates=['event_timestamp'])
print(f"Train Events Shape: {train_events.shape}")

# Load video_info.csv
video_info = pd.read_csv('/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/video_info_v2.csv', dtype=dtype_video_info)
print(f"Video Info Shape: {video_info.shape}")

# Load train_targets.csv
train_targets = pd.read_csv('/kaggle/input/dadadadadadadadadadadaddadadadadadadadadadadada/train_targets.csv', dtype=dtype_targets)
print(f"Train Targets Shape: {train_targets.shape}")

Train Events Shape: (1759616, 9)
Video Info Shape: (481480, 5)
Train Targets Shape: (180012, 4)


# Перевод времени

In [3]:
region_timezones = {
    'Chelyabinsk': 'Asia/Yekaterinburg',
    'Bashkortostan Republic': 'Asia/Yekaterinburg',
    'St.-Petersburg': 'Europe/Moscow',
    'Moscow': 'Europe/Moscow',
    'Moscow Oblast': 'Europe/Moscow',
    'Tatarstan Republic': 'Europe/Moscow',
    'Novosibirsk Oblast': 'Asia/Novosibirsk',
    'Omsk Oblast': 'Asia/Omsk',
    'Chuvashia': 'Europe/Moscow',
    'Krasnoyarsk Krai': 'Asia/Krasnoyarsk',
    'Kamchatka': 'Asia/Kamchatka',
    'Nizhny Novgorod Oblast': 'Europe/Moscow',
    'Krasnodar Krai': 'Europe/Moscow',
    'Volgograd Oblast': 'Europe/Moscow',
    'Kaliningrad Oblast': 'Europe/Kaliningrad',
    'Kuzbass': 'Asia/Novosibirsk',
    'Stavropol Kray': 'Europe/Moscow',
    'Samara Oblast': 'Europe/Samara',
    'Amur Oblast': 'Asia/Yakutsk',
    'Sverdlovsk Oblast': 'Asia/Yekaterinburg',
    'Yamalo-Nenets': 'Asia/Yekaterinburg',
    'Orenburg Oblast': 'Asia/Yekaterinburg',
    'Khanty-Mansia': 'Asia/Yekaterinburg',
    'Kaluga Oblast': 'Europe/Moscow',
    'Tomsk Oblast': 'Asia/Novosibirsk',
    'Novgorod Oblast': 'Europe/Moscow',
    'Arkhangelskaya': 'Europe/Moscow',
    'North Ossetia–Alania': 'Europe/Moscow',
    'Kursk Oblast': 'Europe/Moscow',
    "Leningradskaya Oblast'": 'Europe/Moscow',
    'Krasnoyarskiy': 'Asia/Krasnoyarsk',
    'Ivanovo Oblast': 'Europe/Moscow',
    'Altay Kray': 'Asia/Barnaul',
    'Kurgan Oblast': 'Asia/Yekaterinburg',
    'Kostroma Oblast': 'Europe/Moscow',
    'Bryansk Oblast': 'Europe/Moscow',
    'Dagestan': 'Europe/Moscow',
    'Lipetsk Oblast': 'Europe/Moscow',
    'Vladimir Oblast': 'Europe/Moscow',
    'Kirov Oblast': 'Europe/Moscow',
    'Khabarovsk': 'Asia/Khabarovsk',
    'Tambov Oblast': 'Europe/Moscow',
    'Chukotka': 'Asia/Anadyr',
    'Voronezh Oblast': 'Europe/Moscow',
    'Sverdlovsk': 'Asia/Yekaterinburg',
    'Tula Oblast': 'Europe/Moscow',
    'Krasnodarskiy': 'Europe/Moscow',
    'Irkutsk Oblast': 'Asia/Irkutsk',
    'Saratov Oblast': 'Europe/Samara',
    'Khakasiya Republic': 'Asia/Krasnoyarsk',
    'Penza': 'Europe/Moscow',
    'Perm Krai': 'Asia/Yekaterinburg',
    'Oryol oblast': 'Europe/Moscow',
    'Vladimir': 'Europe/Moscow',
    'Smolensk Oblast': 'Europe/Moscow',
    'Penza Oblast': 'Europe/Moscow',
    'Mordoviya Republic': 'Europe/Moscow',
    'Tyumen’ Oblast': 'Asia/Yekaterinburg',
    'Sakha': 'Asia/Yakutsk',
    'Primorye': 'Asia/Vladivostok',
    'Zabaykalskiy (Transbaikal) Kray': 'Asia/Chita',
    'Vologda Oblast': 'Europe/Moscow',
    'Yaroslavl Oblast': 'Europe/Moscow',
    'Crimea': 'Europe/Moscow',
    'Rostov': 'Europe/Moscow',
    'Ryazan Oblast': 'Europe/Moscow',
    'Perm': 'Asia/Yekaterinburg',
    'Chechnya': 'Europe/Moscow',
    'Udmurtiya Republic': 'Asia/Yekaterinburg',
    'Tver Oblast': 'Europe/Moscow',
    'Buryatiya Republic': 'Asia/Ulan-Ude',
    'Belgorod Oblast': 'Europe/Moscow',
    'Kaluga': 'Europe/Moscow',
    'Astrakhan Oblast': 'Europe/Astrakhan',
    'Karelia': 'Europe/Moscow',
    'Murmansk': 'Europe/Moscow',
    'Adygeya Republic': 'Europe/Moscow',
    'Kemerovo Oblast': 'Asia/Novosibirsk',
    'Mariy-El Republic': 'Europe/Moscow',
    'Kursk': 'Europe/Moscow',
    'Saratovskaya Oblast': 'Europe/Samara',
    'Sakhalin Oblast': 'Asia/Vladivostok',
    'Ivanovo': 'Europe/Moscow',
    'Tyumen Oblast': 'Asia/Yekaterinburg',
    'Stavropol’ Kray': 'Europe/Moscow',
    'Voronezj': 'Europe/Moscow',
    'Karachayevo-Cherkesiya Republic': 'Europe/Moscow',
    'Kabardino-Balkariya Republic': 'Europe/Moscow',
    'Ulyanovsk': 'Europe/Moscow',
    'North Ossetia': 'Europe/Moscow',
    'Komi': 'Europe/Moscow',
    'Smolensk': 'Europe/Moscow',
    'Tver’ Oblast': 'Europe/Moscow',
    'Sebastopol City': 'Europe/Moscow',
    'Pskov Oblast': 'Europe/Moscow',
    'Tula': 'Europe/Moscow',
    'Orel Oblast': 'Europe/Moscow',
    'Jaroslavl': 'Europe/Moscow',
    'Tambov': 'Europe/Moscow',
    'Kalmykiya Republic': 'Europe/Moscow',
    'Primorskiy (Maritime) Kray': 'Asia/Vladivostok',
    'Altai': 'Asia/Barnaul',
    'Magadan Oblast': 'Asia/Magadan',
    'Vologda': 'Europe/Moscow',
    'Tyva Republic': 'Asia/Kyzyl',
    'Nenets': 'Europe/Moscow',
    'Smolenskaya Oblast’': 'Europe/Moscow',
    'Jewish Autonomous Oblast': 'Asia/Yakutsk',
    'Astrakhan': 'Europe/Astrakhan',
    'Ingushetiya Republic': 'Europe/Moscow',
    'Kirov': 'Europe/Moscow',
    'Transbaikal Territory': 'Asia/Chita',
    'Omsk': 'Asia/Omsk',
    'Kaliningrad': 'Europe/Kaliningrad',
    'Stavropol Krai': 'Europe/Moscow',
    'Arkhangelsk Oblast': 'Europe/Moscow',
}


from pytz import timezone

def convert_to_local_time(row):
    region = row['region']
    timestamp = row['event_timestamp']
    try:
        # Получаем часовой пояс региона, по умолчанию 'Europe/Moscow'
        tz = timezone(region_timezones.get(region, 'Europe/Moscow'))
    except:
        tz = timezone('Europe/Moscow')
    
    # Проверяем, является ли временная метка timezone-aware
    if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
        # Если временная метка naive, локализуем её к московскому времени
        timestamp = timestamp.tz_localize('Europe/Moscow')
    
    # Преобразуем время в локальный часовой пояс региона
    return timestamp.astimezone(tz)


train_events['local_event_timestamp'] = train_events.apply(convert_to_local_time, axis=1)

# Объединение таблиц

In [4]:
merged_data = pd.merge(train_events, train_targets, on='viewer_uid', how='left')
merged_data = pd.merge(merged_data, video_info, on='rutube_video_id', how='left')

In [5]:
merged_data.drop(columns=['event_timestamp'], inplace=True)

In [6]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1759616 entries, 0 to 1759615
Data columns (total 16 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   region                 category
 1   ua_device_type         category
 2   ua_client_type         category
 3   ua_os                  category
 4   ua_client_name         category
 5   total_watchtime        int32   
 6   rutube_video_id        object  
 7   viewer_uid             int32   
 8   local_event_timestamp  object  
 9   age                    int8    
 10  sex                    object  
 11  age_class              int64   
 12  title                  object  
 13  category               category
 14  duration               int32   
 15  author_id              int32   
dtypes: category(6), int32(4), int64(1), int8(1), object(4)
memory usage: 105.7+ MB


# Пропуски

In [7]:
merged_data.isna().sum()

region                        0
ua_device_type                0
ua_client_type                0
ua_os                    117671
ua_client_name                0
total_watchtime               0
rutube_video_id               0
viewer_uid                    0
local_event_timestamp         0
age                           0
sex                           0
age_class                     0
title                         0
category                      0
duration                      0
author_id                     0
dtype: int64

In [8]:
merged_data[merged_data['ua_os'].isna()].sample(10)

merged_data.loc[merged_data['ua_os'].isna(), 'ua_os'] = 'Android'


print(merged_data['ua_os'].isna().sum())  # Должно быть 0

0


# Работа с категориальными признаками

## ОС

In [9]:
linux_distributions = ['GNU/Linux', 'Ubuntu', 'Debian', 'SUSE', 'CentOS', 'Fedora', 'NetBSD', 'OpenBSD']

merged_data['ua_os'] = merged_data['ua_os'].replace(linux_distributions, 'Linux')

merged_data['ua_os'].value_counts()

ua_os
Android                 1248707
Windows                  398698
Mac                       55179
iOS                       45672
Linux                      9306
iPadOS                     1189
Windows Phone               259
android tv                  191
Fire OS                      70
MeeGo                        67
BlackBerry OS                64
Windows CE                   57
BlackBerry Tablet OS         54
HarmonyOS                    36
KaiOS                        19
Symbian                      16
MocorDroid                   13
Windows RT                   10
Symbian OS Series 60          5
Chrome OS                     3
wear os                       1
Name: count, dtype: int64

In [10]:
to_replace = ['Fire OS', 'MeeGo', 'BlackBerry OS', 'Windows CE', 'BlackBerry Tablet OS', 'HarmonyOS', 
              'KaiOS', 'Symbian', 'MocorDroid', 'Windows RT', 'Symbian OS Series 60', 'Chrome OS', 'wear os', 'android tv', 'Windows Phone', 'iPadOS']

merged_data['ua_os'] = merged_data['ua_os'].replace(to_replace, 'Other')

In [11]:
merged_data['ua_os'].value_counts()

ua_os
Android    1248707
Windows     398698
Mac          55179
iOS          45672
Linux         9306
Other         2054
Name: count, dtype: int64

### ua_client_name

In [12]:
# Считаем количество появлений каждого ua_client_name
client_name_counts = merged_data['ua_client_name'].value_counts()

# Заменяем значения, которые встречаются меньше 5000 раз, на 'Other'
merged_data['ua_client_name'] = merged_data['ua_client_name'].apply(lambda x: x if client_name_counts[x] >= 5000 else 'Other')

# Проверяем результат
merged_data['ua_client_name'].value_counts()

ua_client_name
Rutube               1138818
Yandex Browser        230202
Chrome                161821
Chrome Mobile          62040
Mobile Safari          32955
Safari                 28641
Microsoft Edge         25502
Firefox Mobile         23985
Opera                  21373
Firefox                16887
Other                  11503
Chrome Mobile iOS       5889
Name: count, dtype: int64

In [13]:
merged_data['ua_client_name'].value_counts()

ua_client_name
Rutube               1138818
Yandex Browser        230202
Chrome                161821
Chrome Mobile          62040
Mobile Safari          32955
Safari                 28641
Microsoft Edge         25502
Firefox Mobile         23985
Opera                  21373
Firefox                16887
Other                  11503
Chrome Mobile iOS       5889
Name: count, dtype: int64

# Encoding

- region (label)
- ua_device_type (one-hot)
- ua_client_type (one-hot)
- ua_os (one-hot)
- ua_client_name (label)
- category (label)

In [14]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1759616 entries, 0 to 1759615
Data columns (total 16 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   region                 category
 1   ua_device_type         category
 2   ua_client_type         category
 3   ua_os                  category
 4   ua_client_name         object  
 5   total_watchtime        int32   
 6   rutube_video_id        object  
 7   viewer_uid             int32   
 8   local_event_timestamp  object  
 9   age                    int8    
 10  sex                    object  
 11  age_class              int64   
 12  title                  object  
 13  category               category
 14  duration               int32   
 15  author_id              int32   
dtypes: category(5), int32(4), int64(1), int8(1), object(5)
memory usage: 117.5+ MB


In [15]:
merged_data['ua_device_type'].value_counts()

ua_device_type
smartphone    1192721
desktop        463165
tablet         103730
Name: count, dtype: int64

In [16]:
merged_data['ua_client_type'].value_counts()

ua_client_type
mobile app    1140259
browser        619338
av                 19
Name: count, dtype: int64

In [17]:
merged_data['ua_os'].value_counts()

ua_os
Android    1248707
Windows     398698
Mac          55179
iOS          45672
Linux         9306
Other         2054
Name: count, dtype: int64

## One-hot

In [18]:
# One-hot кодирование признаков
merged_data = pd.get_dummies(merged_data, columns=['ua_device_type', 'ua_client_type', 'ua_os'])

# Если вы хотите посмотреть результат
merged_data.head()

Unnamed: 0,region,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,local_event_timestamp,age,sex,age_class,title,...,ua_device_type_tablet,ua_client_type_browser,ua_client_type_mobile app,ua_client_type_av,ua_os_Android,ua_os_Other,ua_os_Linux,ua_os_Mac,ua_os_Windows,ua_os_iOS
0,Chelyabinsk,Yandex Browser,1883,video_133074,10067243,2024-06-01 08:40:58+05:00,20,female,0,Папа с особенностями. Мужское / Женское. Выпус...,...,False,True,False,False,False,False,False,False,True,False
1,Bashkortostan Republic,Rutube,512,video_362960,10245341,2024-06-01 21:33:24+05:00,40,female,2,Comedy Club: Мальдивы | Андрей Бебуришвили,...,False,False,True,False,True,False,False,False,False,False
2,St.-Petersburg,Chrome,5647,video_96775,10894333,2024-06-01 21:30:43+03:00,23,male,1,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",...,False,True,False,False,False,False,False,False,True,False
3,Moscow,Rutube,1521,video_161610,10029092,2024-06-01 23:03:42+03:00,41,male,3,Сергей Орлов-снял дом!!!,...,False,False,True,False,True,False,False,False,False,False
4,Moscow,Rutube,71,video_116245,10452976,2024-06-01 22:48:09+03:00,38,female,2,Ищем сокровища в Полевском | Уральская Флоренц...,...,False,False,True,False,True,False,False,False,False,False


In [19]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1759616 entries, 0 to 1759615
Data columns (total 25 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   region                     category
 1   ua_client_name             object  
 2   total_watchtime            int32   
 3   rutube_video_id            object  
 4   viewer_uid                 int32   
 5   local_event_timestamp      object  
 6   age                        int8    
 7   sex                        object  
 8   age_class                  int64   
 9   title                      object  
 10  category                   category
 11  duration                   int32   
 12  author_id                  int32   
 13  ua_device_type_desktop     bool    
 14  ua_device_type_smartphone  bool    
 15  ua_device_type_tablet      bool    
 16  ua_client_type_browser     bool    
 17  ua_client_type_mobile app  bool    
 18  ua_client_type_av          bool    
 19  ua_os_Android        

## Label Encoder

In [24]:
encoder = LabelEncoder()
encoder.fit(merged_data['ua_client_name'])
np.save('ua_client_name.npy', encoder.classes_)

encoder = LabelEncoder()
encoder.fit(merged_data['category'])
np.save('category.npy', encoder.classes_)

In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Создаем объекты LabelEncoder для каждой колонки
category_encoder = LabelEncoder()
ua_client_name_encoder = LabelEncoder()

# Загрузка классов из .npy файлов с использованием allow_pickle=True
category_encoder.classes_ = np.load('/kaggle/working/category.npy', allow_pickle=True)
ua_client_name_encoder.classes_ = np.load('/kaggle/working/ua_client_name.npy', allow_pickle=True)

# Применение энкодеров к соответствующим признакам
merged_data['category'] = category_encoder.transform(merged_data['category'])
merged_data['ua_client_name'] = ua_client_name_encoder.transform(merged_data['ua_client_name'])

In [27]:
merged_data.head()

Unnamed: 0,region,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,local_event_timestamp,age,sex,age_class,title,...,ua_device_type_tablet,ua_client_type_browser,ua_client_type_mobile app,ua_client_type_av,ua_os_Android,ua_os_Other,ua_os_Linux,ua_os_Mac,ua_os_Windows,ua_os_iOS
0,Chelyabinsk,11,1883,video_133074,10067243,2024-06-01 08:40:58+05:00,20,female,0,Папа с особенностями. Мужское / Женское. Выпус...,...,False,True,False,False,False,False,False,False,True,False
1,Bashkortostan Republic,9,512,video_362960,10245341,2024-06-01 21:33:24+05:00,40,female,2,Comedy Club: Мальдивы | Андрей Бебуришвили,...,False,False,True,False,True,False,False,False,False,False
2,St.-Petersburg,0,5647,video_96775,10894333,2024-06-01 21:30:43+03:00,23,male,1,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",...,False,True,False,False,False,False,False,False,True,False
3,Moscow,9,1521,video_161610,10029092,2024-06-01 23:03:42+03:00,41,male,3,Сергей Орлов-снял дом!!!,...,False,False,True,False,True,False,False,False,False,False
4,Moscow,9,71,video_116245,10452976,2024-06-01 22:48:09+03:00,38,female,2,Ищем сокровища в Полевском | Уральская Флоренц...,...,False,False,True,False,True,False,False,False,False,False


In [28]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1759616 entries, 0 to 1759615
Data columns (total 25 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   region                     category
 1   ua_client_name             int64   
 2   total_watchtime            int32   
 3   rutube_video_id            object  
 4   viewer_uid                 int32   
 5   local_event_timestamp      object  
 6   age                        int8    
 7   sex                        object  
 8   age_class                  int64   
 9   title                      object  
 10  category                   int64   
 11  duration                   int32   
 12  author_id                  int32   
 13  ua_device_type_desktop     bool    
 14  ua_device_type_smartphone  bool    
 15  ua_device_type_tablet      bool    
 16  ua_client_type_browser     bool    
 17  ua_client_type_mobile app  bool    
 18  ua_client_type_av          bool    
 19  ua_os_Android        

## TimeZones

In [None]:
unique_regions = merged_data['region'].unique().tolist()
print(unique_regions)

In [29]:
region_timezone = {
    'Chelyabinsk': 5,
    'Bashkortostan Republic': 5,
    'St.-Petersburg': 3,
    'Moscow': 3,
    'Rostov': 3,
    'Moscow Oblast': 3,
    'Kursk Oblast': 3,
    'Kemerovo Oblast': 7,
    'Arkhangelskaya': 3,
    'Tomsk Oblast': 7,
    'Novosibirsk Oblast': 7,
    'Sverdlovsk Oblast': 5,
    "Leningradskaya Oblast'": 3,
    'Krasnodar Krai': 3,
    'Tatarstan Republic': 3,
    'Belgorod Oblast': 3,
    'Kuzbass': 7,
    'Udmurtiya Republic': 4,
    'Chuvashia': 3,
    'Ryazan Oblast': 3,
    'Perm Krai': 5,
    'Sakha': 9,
    'Orenburg Oblast': 5,
    'Primorye': 10,
    'Zabaykalskiy (Transbaikal) Kray': 9,
    'Bryansk Oblast': 3,
    'Tver Oblast': 3,
    'Stavropol Kray': 3,
    'Khabarovsk': 10,
    'Penza Oblast': 3,
    'Mariy-El Republic': 3,
    'Smolensk Oblast': 3,
    'Tambov Oblast': 3,
    'Novgorod Oblast': 3,
    'Khakasiya Republic': 7,
    'Ulyanovsk': 4,
    'Volgograd Oblast': 3,
    'Irkutsk Oblast': 8,
    'Komi': 3,
    'Nizhny Novgorod Oblast': 3,
    'Krasnoyarsk Krai': 7,
    'Kurgan Oblast': 5,
    'Kirov Oblast': 3,
    'Omsk Oblast': 6,
    'Vladimir Oblast': 3,
    'Yaroslavl Oblast': 3,
    'Saratov Oblast': 3,
    'Khanty-Mansia': 5,
    'Tula Oblast': 3,
    'Amur Oblast': 9,
    'Altay Kray': 7,
    'Buryatiya Republic': 8,
    'Dagestan': 3,
    'Kaluga Oblast': 3,
    'Kaliningrad Oblast': 2,
    'Murmansk': 3,
    'Samara Oblast': 4,
    'Stavropol’ Kray': 3,
    'Voronezh Oblast': 3,
    'Kursk': 3,
    'Sverdlovsk': 5,
    'Karelia': 3,
    'Lipetsk Oblast': 3,
    'Adygeya Republic': 3,
    'Ivanovo Oblast': 3,
    'Oryol oblast': 3,
    'Tula': 3,
    'Kamchatka': 12,
    'Tyumen Oblast': 5,
    'Krasnodarskiy': 3,
    'Krasnoyarskiy': 7,
    'Pskov Oblast': 3,
    'Crimea': 3,
    'Chechnya': 3,
    'Saratovskaya Oblast': 3,
    'Kalmykiya Republic': 3,
    'North Ossetia–Alania': 3,
    'Vologda Oblast': 3,
    'Karachayevo-Cherkesiya Republic': 3,
    'Voronezj': 3,
    'Chukotka': 12,
    'Mordoviya Republic': 3,
    'Kostroma Oblast': 3,
    'Yamalo-Nenets': 5,
    'Magadan Oblast': 11,
    'Altai': 7,
    'Vladimir': 3,
    'Ivanovo': 3,
    'Astrakhan Oblast': 4,
    'Penza': 3,
    'Kabardino-Balkariya Republic': 3,
    'Jaroslavl': 3,
    'Sakhalin Oblast': 11,
    'Sebastopol City': 3,
    'Tyumen’ Oblast': 5,
    'Kirov': 3,
    'Orel Oblast': 3,
    'Omsk': 6,
    'Smolenskaya Oblast’': 3,
    'Nenets': 3,
    'Tver’ Oblast': 3,
    'Jewish Autonomous Oblast': 9,
    'Ingushetiya Republic': 3,
    'Kaluga': 3,
    'Kaliningrad': 2,
    'North Ossetia': 3,
    'Perm': 5,
    'Smolensk': 3,
    'Primorskiy (Maritime) Kray': 10,
    'Vologda': 3,
    'Stavropol Krai': 3,
    'Astrakhan': 4,
    'Transbaikal Territory': 9,
    'Tambov': 3,
    'Tyva Republic': 7,
    'Arkhangelsk Oblast': 3
}

In [30]:
merged_data['timezone UTC+'] = merged_data['region'].map(region_timezone)
merged_data[['region', 'timezone UTC+']].head()

Unnamed: 0,region,timezone UTC+
0,Chelyabinsk,5
1,Bashkortostan Republic,5
2,St.-Petersburg,3
3,Moscow,3
4,Moscow,3


In [31]:
merged_data = merged_data.drop(['region'], axis=1)

In [32]:
merged_data.sample(10)

Unnamed: 0,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,local_event_timestamp,age,sex,age_class,title,category,...,ua_client_type_browser,ua_client_type_mobile app,ua_client_type_av,ua_os_Android,ua_os_Other,ua_os_Linux,ua_os_Mac,ua_os_Windows,ua_os_iOS,timezone UTC+
779135,9,2019,video_199495,10102374,2024-06-14 13:33:03+03:00,25,female,1,"Верчусь, как могу. Мужское / Женское. Выпуск о...",33,...,False,True,False,True,False,False,False,False,False,3
690483,7,1729,video_113565,10402161,2024-06-12 16:54:30+03:00,35,male,2,Оливейра - Грозин. Хайбула - Дамил. Мирзаев - ...,31,...,True,False,False,False,False,False,False,True,False,3
317787,9,432,video_258123,10174854,2024-06-06 21:51:36+03:00,51,male,3,"""Остановить фашизм!"" Е.Спицын, Д.Новиков, Д.Па...",11,...,False,True,False,True,False,False,False,False,False,3
95583,9,2113,video_474991,10066894,2024-06-02 21:56:28+05:00,27,female,1,"Шоу Воли, 54 выпуск",33,...,False,True,False,True,False,False,False,False,False,4
1163688,9,1690,video_461531,10798105,2024-06-20 17:04:34+04:00,34,male,2,"Люцифер 3 сезон, 18 серия",30,...,False,True,False,True,False,False,False,False,False,3
59283,11,1021,video_96775,10020904,2024-06-02 23:29:00+05:00,24,female,1,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",33,...,True,False,False,False,False,False,False,True,False,5
646214,11,101,video_455025,10226616,2024-06-11 00:41:43+03:00,44,male,3,8 подруг Оушена | Ocean's Eight (2018),36,...,True,False,False,False,False,False,False,True,False,3
1514653,9,400,video_378391,10029115,2024-06-26 16:04:43+03:00,35,female,2,Свидание под водой | Богиня свиданий | ПРЕМЬЕР...,33,...,False,True,False,True,False,False,False,False,False,3
530081,9,1441,video_320262,10049986,2024-06-10 06:37:24+03:00,33,female,2,Цена ребенка. Мужское / Женское. Выпуск от 06....,33,...,False,True,False,True,False,False,False,False,False,3
1669309,9,1141,video_128965,10028412,2024-06-29 09:32:23+03:00,34,female,2,"Новая Битва экстрасенсов, 24 сезон, 7 выпуск",33,...,False,True,False,True,False,False,False,False,False,3
