In [61]:
import pandas as pd

### Загружаем данные после аналитики:
* удаляем столбцы at_least_one, at_least_two, at_least_three и age_group (признак age уже есть)
* кодируем город (столбец city_group) методом one-hot encoding

In [None]:
# Load the per-user features prepared by the analysis step.
# The path uses forward slashes so the cell runs on Windows, Linux and macOS alike
# (a backslash is a path separator on Windows only).
# .iloc[:, 1:-3] is a positional slice: it skips the first column (presumably a saved
# index column -- TODO confirm) and the last three columns, which the original note
# names as at_least_one, at_least_two, at_least_three.
user_features_df = pd.read_csv('../analysis/user_features.csv').iloc[:, 1:-3]
user_features_df['user_id'] = user_features_df['user_id'].astype(int)

# One-hot encode city_group into city_<value> indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version
# (pandas >= 2.0 would otherwise produce True/False).
city_groups = pd.get_dummies(user_features_df['city_group'], prefix='city', dtype='uint8')
user_features_df = pd.concat([user_features_df, city_groups], axis=1)
# NOTE(review): the raw city_group column is not dropped here although it has just been
# encoded -- confirm whether downstream code expects it to remain.
user_features_df = user_features_df.drop(['total_ads', 'city_id', 'age_group'], axis=1)

user_features_df

Unnamed: 0,user_id,avg_cpm,median_cpm,max_cpm,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,...,city_17,city_19,city_201,city_21,city_25,city_3,city_60,city_7,city_8,city_other
0,1,111.247848,90.000,361.52,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,321.588571,263.760,531.22,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,4,45.086429,30.245,255.36,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5,91.055556,42.500,390.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,6,226.118857,234.000,496.95,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24855,27764,179.386098,170.000,400.92,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
24856,27765,130.001333,110.640,455.00,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
24857,27766,138.974231,120.000,342.00,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
24858,27767,143.880000,143.880,143.88,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Загружаем данные с номерами площадок и id пользователей:
* посчитаем сколько показов на каждого клиента приходится на каждой площадке
* перекодируем признак номера площадки для рекламы по OneHotEncoding
* соединим таблицы с признаками пользователя и количеством реклам, которые он видел

In [72]:
# Load the tab-separated validation set.
# Forward slashes keep the relative path valid on every OS, not just Windows.
validate_df = pd.read_csv('../src/validate.tsv', sep='\t')
validate_df.head()

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids
0,220.0,1058,1153,717,1906,"12,44,46,50,58,71,93,122,134,143,176,184,187,1..."
1,312.0,1295,1301,318,1380,"29,81,98,102,165,167,195,205,218,231,242,263,3..."
2,70.0,1229,1249,12391521,888,"12,23,25,29,45,85,92,124,156,190,272,334,456,5..."
3,240.0,1295,1377,114,440,"44,122,187,209,242,255,312,345,382,465,513,524..."
4,262.0,752,990,1378,1476,"15,24,30,43,50,53,96,105,159,168,181,190,196,2..."


In [None]:
def _split_ids(value):
    """Split one comma-separated id cell into a list of clean string tokens.

    Missing cells (None/NaN) and empty strings yield an empty list instead of
    raising or producing '' tokens. Numeric cells are accepted too, because a
    column that holds only single ids may be parsed as numbers, not strings.
    """
    if pd.isna(value):
        return []
    # An integer-valued float such as 7.0 must become '7', not '7.0'.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    tokens = (token.strip() for token in str(value).split(','))
    return [token for token in tokens if token]


# Build (user_id, publisher) combinations for one row
def create_combinations(row):
    """Return every (user_id, publisher) pair for a single ad row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'user_ids' and 'publishers', each a comma-separated list of ids.

    Returns:
        List of (user_id, publisher) tuples of strings, ordered by user id
        first and publisher second. The list is empty when either cell is
        missing or blank.
    """
    user_ids = _split_ids(row['user_ids'])
    publishers = _split_ids(row['publishers'])
    return [(user_id, publisher) for user_id in user_ids for publisher in publishers]
# Apply the pairing function row by row: one list of (user_id, publisher) tuples per ad
combinations = validate_df.apply(create_combinations, axis=1)
# Flatten to one tuple per entry; an empty list explodes to NaN and is removed
combinations_exploded = combinations.explode().dropna()
# Split each tuple into two named columns
result_df = pd.DataFrame(
    list(combinations_exploded),
    columns=['user_id', 'publisher'],
)
# Count how many times each (user_id, publisher) pair occurs
result = (
    result_df
    .groupby(['user_id', 'publisher'])
    .size()
    .rename('count')
    .reset_index()
)

print(result)

       user_id publisher  count
0            0         1     28
1            0        10      7
2            0        11      6
3            0        12      5
4            0        13      8
...        ...       ...    ...
554532    9999         4      4
554533    9999         5      6
554534    9999         7     16
554535    9999         8      2
554536    9999         9      2

[554537 rows x 3 columns]


In [65]:
# One-hot encode the publisher column into publisher_<n> indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version
# (pandas >= 2.0 would otherwise produce True/False, which also changes the exported CSV).
result_one_hot = pd.get_dummies(result['publisher'], prefix='publisher', dtype='uint8')
# Put user_id and the count target in front of the indicator columns
result_one_hot = pd.concat([result[['user_id', 'count']], result_one_hot], axis=1)
# user_id comes from splitting a string, so cast it to int before it is used as a merge key
result_one_hot['user_id'] = result_one_hot['user_id'].astype(int)
result_one_hot.head()

Unnamed: 0,user_id,count,publisher_1,publisher_10,publisher_11,publisher_12,publisher_13,publisher_14,publisher_15,publisher_16,...,publisher_2,publisher_20,publisher_21,publisher_3,publisher_4,publisher_5,publisher_6,publisher_7,publisher_8,publisher_9
0,0,28,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,6,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,5,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,8,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
# Inner-join the per-publisher counts with the user features on user_id,
# then discard the join key so only the target and the features remain.
merged_data = pd.merge(
    result_one_hot,
    user_features_df,
    how='inner',
    on='user_id',
).drop(columns=['user_id'])
merged_data.head()

Unnamed: 0,count,publisher_1,publisher_10,publisher_11,publisher_12,publisher_13,publisher_14,publisher_15,publisher_16,publisher_17,...,city_17,city_19,city_201,city_21,city_25,city_3,city_60,city_7,city_8,city_other
0,23,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,6,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [74]:
# Write the training table to CSV without the index column
merged_data.to_csv(path_or_buf='data_to_train.csv', index=False)

### Теперь count — это таргет, который мы хотим предсказать: количество рекламных объявлений, которые видел конкретный пользователь на определённой площадке publisher_n, где n — номер площадки