In [1]:
import pandas as pd
import numpy as np
import ast
import os
from tqdm import tqdm
from google.colab import drive
# Mount Google Drive in the Colab runtime; the CSV files read and written by
# this notebook live under /content/drive/MyDrive/deadlock_data/.
drive.mount('/content/drive/')

Функции для извлечения признаков из списка игроков и словарей объектов (objectives).

In [13]:
def get_objectives_0(objectives):
    """Return a new dict with ``_0`` appended to every key of ``objectives``.

    Values are carried over unchanged and the input mapping is not modified.
    """
    return {f'{name}_0': state for name, state in objectives.items()}

def get_objectives_1(objectives):
    """Return a new dict with ``_1`` appended to every key of ``objectives``.

    Values are carried over unchanged and the input mapping is not modified.
    """
    return {f'{name}_1': state for name, state in objectives.items()}


def get_players(players):
    """Flatten an iterable of per-player dicts into a single flat dict.

    Every key is suffixed with the player's 1-based position in ``players``,
    so ``account_id`` of the first player becomes ``account_id_1``, of the
    second ``account_id_2``, and so on.
    """
    return {
        f'{field}_{slot}': value
        for slot, player in enumerate(players, start=1)
        for field, value in player.items()
    }

def preprocess_df(df):
    """Flatten one chunk of raw match snapshots into a fixed set of feature columns.

    Rows containing any NaN are discarded. The ``players``,
    ``objectives_team0`` and ``objectives_team1`` columns are expected to hold
    Python-literal strings (a list of dicts and two dicts respectively); they
    are parsed with ``ast.literal_eval`` and expanded into one column per
    player field / objective.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk. It is NOT modified; all work happens on a filtered copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the match-level columns, ``account_id_1..12``,
        ``hero_id_1..12`` and the per-team objective flags (``*_0`` / ``*_1``),
        indexed like the surviving rows of ``df``.

    Raises
    ------
    KeyError
        If any expected feature column is missing after expansion (e.g. a row
        set with fewer than 12 players, or an empty chunk).
    """
    # Objective names shared by both teams; the team suffix is appended below,
    # so the two halves of the feature list cannot drift apart.
    objective_names = [
        'core',
        'tier1_lane1', 'tier1_lane2', 'tier1_lane3', 'tier1_lane4',
        'tier2_lane1', 'tier2_lane2', 'tier2_lane3', 'tier2_lane4',
        'titan', 'titan_shield_generator_1', 'titan_shield_generator_2',
        'barrack_boss_lane1', 'barrack_boss_lane2',
        'barrack_boss_lane3', 'barrack_boss_lane4',
    ]
    features = (
        ['match_id', 'winning_team', 'net_worth_team_0', 'net_worth_team_1',
         'match_score', 'region_mode']
        + [f'account_id_{i}' for i in range(1, 13)]
        + [f'hero_id_{i}' for i in range(1, 13)]
        + [f'{name}_{team}' for team in (0, 1) for name in objective_names]
    )

    # Non-inplace dropna: the caller's frame must not change as a side effect.
    df = df.dropna()

    # Parse the stringified literals into real Python objects. The parsed
    # values are kept in locals because none of the source columns are part
    # of the returned feature set.
    players = df['players'].apply(ast.literal_eval)
    objectives_0 = df['objectives_team0'].apply(ast.literal_eval)
    objectives_1 = df['objectives_team1'].apply(ast.literal_eval)

    # Build each expanded frame in one shot from a list of dicts.
    # `.apply(pd.Series)` would construct a Series object per row, which is
    # dramatically slower on chunks of this size. Passing index=df.index keeps
    # the rows aligned for the axis=1 concat below.
    players_df = pd.DataFrame([get_players(p) for p in players], index=df.index)
    objectives_0_df = pd.DataFrame(
        [get_objectives_0(o) for o in objectives_0], index=df.index
    )
    objectives_1_df = pd.DataFrame(
        [get_objectives_1(o) for o in objectives_1], index=df.index
    )

    df = pd.concat([df, players_df, objectives_0_df, objectives_1_df], axis=1)
    return df[features]

Считаем датасет и избавимся от дубликатов.

In [4]:
# Load the raw scraped snapshots and discard rows that are exact duplicates.
df = (
    pd.read_csv('/content/drive/MyDrive/deadlock_data/combined_matches.csv')
    .drop_duplicates()
)
df

Unnamed: 0,match_id,start_time,scraped_at,winning_team,players,lobby_id,net_worth_team_0,net_worth_team_1,duration_s,spectators,...,match_mode,game_mode,match_score,region_mode,objectives_mask_team0,objectives_mask_team1,compat_version,ranked_badge_level,objectives_team0,objectives_team1
0,25516143,2024-10-29T20:27:32Z,2024-10-29T20:28:51Z,0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,825,825,0,0,...,Unranked,Normal,1937,Russia,65535,65535,5315.0,0.0,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
1,25516143,2024-10-29T20:27:32Z,2024-10-29T20:29:12Z,0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,3076,2141,0,0,...,Unranked,Normal,1937,Russia,65535,65535,5315.0,0.0,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
2,25516143,2024-10-29T20:27:32Z,2024-10-29T20:29:55Z,0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,3834,4771,0,0,...,Unranked,Normal,1937,Russia,65535,65535,5315.0,0.0,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
3,25516143,2024-10-29T20:27:32Z,2024-10-29T20:30:16Z,0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,6096,6300,0,0,...,Unranked,Normal,1937,Russia,65535,65535,5315.0,0.0,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
4,25516143,2024-10-29T20:27:32Z,2024-10-29T20:30:57Z,0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,7268,8443,0,0,...,Unranked,Normal,1937,Russia,65535,65535,5315.0,0.0,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5936408,25079589,2024-10-27T08:38:15Z,2024-10-27T09:04:15Z,0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,155106,143722,0,0,...,Ranked,Normal,1842,Oceania,65377,65089,5311.0,96.0,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."
5936409,25079589,2024-10-27T08:38:15Z,2024-10-27T09:04:45Z,0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,157687,146935,0,0,...,Ranked,Normal,1842,Oceania,65377,65089,5311.0,96.0,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."
5936410,25079589,2024-10-27T08:38:15Z,2024-10-27T09:05:06Z,0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,161092,147691,0,0,...,Ranked,Normal,1842,Oceania,65377,65089,5311.0,96.0,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."
5936411,25079589,2024-10-27T08:38:15Z,2024-10-27T09:05:27Z,0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,169598,152655,0,0,...,Ranked,Normal,1842,Oceania,65377,65025,5311.0,96.0,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."


match_results.csv — CSV-файл с match_id и результатом матча.

С помощью этого файла заполняем пустую колонку `winning_team` в нашем датасете.

Избавляемся от колонок с NaN-значениями (`compat_version`, `ranked_badge_level`) и сохраняем датасет.

In [5]:
# Build a {key -> result} lookup from match_results.csv: the 'Unnamed: 0'
# column supplies the keys (looked up by match_id below) and column '0' the
# values. Only that one column is converted to a dict.
match_results = (
    pd.read_csv('/content/drive/MyDrive/deadlock_data/match_results.csv')
    .set_index('Unnamed: 0')['0']
    .to_dict()
)
# match_ids absent from the lookup become NaN in winning_team.
df['winning_team'] = df['match_id'].map(match_results)
# errors='ignore' keeps this cell re-runnable: on a second execution the two
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['compat_version', 'ranked_badge_level'], errors='ignore')
# Written to the runtime's working directory; the chunked reader below picks
# it up from there.
df.to_csv('clean_data.csv', index=False)
df

Unnamed: 0,match_id,start_time,scraped_at,winning_team,players,lobby_id,net_worth_team_0,net_worth_team_1,duration_s,spectators,open_spectator_slots,match_mode,game_mode,match_score,region_mode,objectives_mask_team0,objectives_mask_team1,objectives_team0,objectives_team1
0,25516143,2024-10-29T20:27:32Z,2024-10-29T20:28:51Z,0.0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,825,825,0,0,10,Unranked,Normal,1937,Russia,65535,65535,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
1,25516143,2024-10-29T20:27:32Z,2024-10-29T20:29:12Z,0.0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,3076,2141,0,0,10,Unranked,Normal,1937,Russia,65535,65535,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
2,25516143,2024-10-29T20:27:32Z,2024-10-29T20:29:55Z,0.0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,3834,4771,0,0,10,Unranked,Normal,1937,Russia,65535,65535,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
3,25516143,2024-10-29T20:27:32Z,2024-10-29T20:30:16Z,0.0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,6096,6300,0,0,10,Unranked,Normal,1937,Russia,65535,65535,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
4,25516143,2024-10-29T20:27:32Z,2024-10-29T20:30:57Z,0.0,"[{'account_id': 970142860, 'team': 0, 'abandon...",173143636656005517,7268,8443,0,0,10,Unranked,Normal,1937,Russia,65535,65535,"{'core': True, 'tier1_lane1': True, 'tier1_lan...","{'core': True, 'tier1_lane1': True, 'tier1_lan..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5936408,25079589,2024-10-27T08:38:15Z,2024-10-27T09:04:15Z,0.0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,155106,143722,0,0,10,Ranked,Normal,1842,Oceania,65377,65089,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."
5936409,25079589,2024-10-27T08:38:15Z,2024-10-27T09:04:45Z,0.0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,157687,146935,0,0,10,Ranked,Normal,1842,Oceania,65377,65089,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."
5936410,25079589,2024-10-27T08:38:15Z,2024-10-27T09:05:06Z,0.0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,161092,147691,0,0,10,Ranked,Normal,1842,Oceania,65377,65089,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."
5936411,25079589,2024-10-27T08:38:15Z,2024-10-27T09:05:27Z,0.0,"[{'account_id': 129478213, 'team': 0, 'abandon...",101082389698061514,169598,152655,0,0,10,Ranked,Normal,1842,Oceania,65377,65025,"{'core': True, 'tier1_lane1': False, 'tier1_la...","{'core': True, 'tier1_lane1': False, 'tier1_la..."


Считываем данные не сразу, а по частям, чтобы не перегружать ОЗУ.

In [14]:
chunk_size = 100_000
data_chunks = []
# Stream the intermediate CSV in fixed-size chunks so that only one raw chunk
# (with its bulky string columns) is held in memory at a time.
# NOTE(review): total=26 is hard-coded for the current file size; tqdm only
# uses it to size the progress bar, so a wrong value affects the display only.
for chunk in tqdm(pd.read_csv('clean_data.csv', chunksize=chunk_size), total=26):
    data_chunks.append(preprocess_df(chunk))

26it [1:09:49, 161.14s/it]


Соединим наши части в один DataFrame и посмотрим на его содержимое.

In [16]:
# Stack the processed chunks into one frame; ignore_index=True replaces the
# per-chunk row labels with a fresh 0..n-1 index.
processed_df = pd.concat(data_chunks, ignore_index=True)
processed_df

Unnamed: 0,match_id,winning_team,net_worth_team_0,net_worth_team_1,match_score,region_mode,account_id_1,account_id_2,account_id_3,account_id_4,...,tier2_lane2_1,tier2_lane3_1,tier2_lane4_1,titan_1,titan_shield_generator_1_1,titan_shield_generator_2_1,barrack_boss_lane1_1,barrack_boss_lane2_1,barrack_boss_lane3_1,barrack_boss_lane4_1
0,25516143,0.0,825,825,1937,Russia,970142860,70208588,81570546,78645019,...,True,True,True,True,True,True,True,True,True,True
1,25516143,0.0,3076,2141,1937,Russia,970142860,70208588,81570546,78645019,...,True,True,True,True,True,True,True,True,True,True
2,25516143,0.0,3834,4771,1937,Russia,970142860,70208588,81570546,78645019,...,True,True,True,True,True,True,True,True,True,True
3,25516143,0.0,6096,6300,1937,Russia,970142860,70208588,81570546,78645019,...,True,True,True,True,True,True,True,True,True,True
4,25516143,0.0,7268,8443,1937,Russia,970142860,70208588,81570546,78645019,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542705,25079589,0.0,155106,143722,1842,Oceania,129478213,254996532,345930392,239959825,...,True,False,False,True,True,True,True,True,True,True
2542706,25079589,0.0,157687,146935,1842,Oceania,129478213,254996532,345930392,239959825,...,True,False,False,True,True,True,True,True,True,True
2542707,25079589,0.0,161092,147691,1842,Oceania,129478213,254996532,345930392,239959825,...,True,False,False,True,True,True,True,True,True,True
2542708,25079589,0.0,169598,152655,1842,Oceania,129478213,254996532,345930392,239959825,...,False,False,False,True,True,True,True,True,True,True


Датасет стал более компактным и удобным. Удаляем дубликаты и сохраняем итоговый датасет.

In [None]:
# Drop exact duplicate rows, then persist the final feature table to Google Drive.
processed_df = processed_df.drop_duplicates()
processed_df.to_csv('/content/drive/MyDrive/deadlock_data/clean_data.csv', index=False)