In [1]:
import pandas as pd
import re

# Location of the CSV file holding the loss records
file_path = 'russia_losses.csv'

# Load the file into a pandas DataFrame
losses = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
losses.head()

Unnamed: 0,id,type,model,status,lost_by,date,nearest_location,geo,unit,tags
0,1,Tanks,T-64BV,Destroyed,Russia,2022-03-14,"Rubizhne, Sievierodonetsk raion","49.027241,38.343374",,
1,2,Tanks,T-64BV,Destroyed,Russia,2022-03-16,"Mariupol, Mariupol raion","47.099125851628806,37.52371337039075",,"Turretless, Z, Mine plow/roller"
2,3,Tanks,T-64BV,Destroyed,Russia,2022-03-16,"Mariupol, Mariupol raion","47.09869256359657,37.52353235165147",,Shattered
3,5,Tanks,T-64BV,Destroyed,Russia,2022-03-17,"Mariupol, Mariupol raion","47.098139835697424,37.640174323260645",,"Turretless, Z"
4,6,Tanks,T-64BV,Destroyed,Russia,2022-03-31,"Rubizhne, Sievierodonetsk raion","49.01122,38.39844",,Z


In [2]:
import numpy as np

# Expand the comma-separated 'tags' strings into one column per tag position
tags = losses['tags'].str.split(',', expand=True)

# Walk every cell of the expanded frame, skipping missing entries (None/NaN)
# and normalizing the rest (trim surrounding whitespace, lower-case)
tags_flattened = []
for cell in tags.values.ravel(order='K'):
    if pd.isna(cell):
        continue
    tags_flattened.append(str(cell).strip().lower())

# numpy.unique both de-duplicates and sorts; convert back to a plain list
unique_tag_array = np.unique(tags_flattened)
tags_unique_list = unique_tag_array.tolist()

print(tags_unique_list)

['+', 'additional armour', 'armour kit missing', 'cope cage', 'damaged', 'expanded cabin', 'flipped', 'heli', 'improvised', 'jammer', 'kung', 'loitering', 'mine plow/roller', 'o', 'pl-1', 'removed turret', 'shattered', 'stugna', 't', 'turretless', 'turtle', 'underwater', 'v', 'z', '∆', '∇', '∧', '▲', '◇', '☐', '✕', '⧸']


In [3]:
# Build a one-column DataFrame of the unique tags, then expose the positional
# row index as a regular 'index' column
tags_df = pd.DataFrame(tags_unique_list, columns=['tag']).reset_index()
tags_df

Unnamed: 0,index,tag
0,0,+
1,1,additional armour
2,2,armour kit missing
3,3,cope cage
4,4,damaged
5,5,expanded cabin
6,6,flipped
7,7,heli
8,8,improvised
9,9,jammer


In [4]:
# Convert the positional index into a "four character" unique id (a minus sign
# plus three digits, e.g. -960), to prevent false matches during searches.
# The ids stay four characters long as long as there are at most 861 tags
# (-960 .. -100).
TAG_ID_OFFSET = -960

# Rebuild the frame from the 'tag' column only, so this cell gives the same
# result no matter how many times it is executed. (Subtracting from the
# 'index' column and then renaming it made a second run fail with
# KeyError: 'index'.)
tags_df = pd.DataFrame({
    'id': range(TAG_ID_OFFSET, TAG_ID_OFFSET + len(tags_df)),
    'tag': tags_df['tag'].tolist(),
})
tags_df

Unnamed: 0,id,tag
0,-960,+
1,-959,additional armour
2,-958,armour kit missing
3,-957,cope cage
4,-956,damaged
5,-955,expanded cabin
6,-954,flipped
7,-953,heli
8,-952,improvised
9,-951,jammer


In [5]:
# Write the id/tag lookup table to tags_index.csv, omitting the row index
tags_df.to_csv(path_or_buf='tags_index.csv', encoding='utf-8', index=False)

In [6]:
# Lookup table: normalized tag text -> its id from tags_df
tag_to_index = dict(zip(tags_df['tag'].tolist(), tags_df['id'].tolist()))

print(tag_to_index)

# Replace each tag with its corresponding index
def replace_tags_with_indices(tag_string, mapping=None):
    """Encode a comma-separated tag string as a comma-separated id string.

    Each token is stripped and lower-cased before lookup. Tokens found in the
    mapping are replaced by their id; tokens that already are the string form
    of an id in the mapping are kept unchanged, so applying the function to
    its own output is a no-op (re-running the cell no longer blanks the
    column). Any other token is dropped.

    Parameters
    ----------
    tag_string : str or missing value
        Raw tag text such as "Turretless, Z". Missing values (None/NaN) are
        returned unchanged.
    mapping : dict, optional
        Tag -> id lookup. Defaults to the notebook-level ``tag_to_index``.

    Returns
    -------
    str or missing value
        Ids joined with ',' (an empty string if no token matched), or the
        original missing value.
    """
    if pd.isna(tag_string):
        return tag_string  # Leave missing values untouched

    lookup = tag_to_index if mapping is None else mapping
    # String forms of the known ids, used to recognize already-encoded input
    known_ids = {str(idx) for idx in lookup.values()}

    encoded = []
    for raw_token in tag_string.split(','):
        token = raw_token.strip().lower()  # Normalize the same way the mapping keys were
        if token in lookup:
            encoded.append(str(lookup[token]))
        elif token in known_ids:
            encoded.append(token)  # Already an id: keep it as-is
    return ','.join(encoded)

# Encode every entry of the 'tags' column, then store the result back in place
encoded_tags = losses['tags'].apply(replace_tags_with_indices)
losses['tags'] = encoded_tags

# Show the first few rows of the updated DataFrame
print(losses.head())

{'+': -960, 'additional armour': -959, 'armour kit missing': -958, 'cope cage': -957, 'damaged': -956, 'expanded cabin': -955, 'flipped': -954, 'heli': -953, 'improvised': -952, 'jammer': -951, 'kung': -950, 'loitering': -949, 'mine plow/roller': -948, 'o': -947, 'pl-1': -946, 'removed turret': -945, 'shattered': -944, 'stugna': -943, 't': -942, 'turretless': -941, 'turtle': -940, 'underwater': -939, 'v': -938, 'z': -937, '∆': -936, '∇': -935, '∧': -934, '▲': -933, '◇': -932, '☐': -931, '✕': -930, '⧸': -929}
   id   type   model     status lost_by        date  \
0   1  Tanks  T-64BV  Destroyed  Russia  2022-03-14   
1   2  Tanks  T-64BV  Destroyed  Russia  2022-03-16   
2   3  Tanks  T-64BV  Destroyed  Russia  2022-03-16   
3   5  Tanks  T-64BV  Destroyed  Russia  2022-03-17   
4   6  Tanks  T-64BV  Destroyed  Russia  2022-03-31   

                  nearest_location                                    geo  \
0  Rubizhne, Sievierodonetsk raion                    49.027241,38.343374   
1

In [7]:
# Debugging aid: display a small positional slice of rows for inspection
row_start, row_stop = 14150, 14155
losses.iloc[row_start:row_stop]

Unnamed: 0,id,type,model,status,lost_by,date,nearest_location,geo,unit,tags
14150,25886,Infantry fighting vehicles,MT-LB,Destroyed,Russia,2024-04-18,"Volodymyrivka, Volnovakha raion","47.779417,37.417667",,"-957,-955"
14151,25887,Infantry fighting vehicles,MT-LB,Destroyed,Russia,2024-04-18,"Volodymyrivka, Volnovakha raion","47.779611,37.418556",,-955
14152,25888,Tanks,T-62M,Abandoned,Russia,2024-04-18,"Volodymyrivka, Volnovakha raion","47.779177,37.417521",,"-938,-948,-957,-956,-931,-959"
14153,25889,Tanks,T-72B Obr. 2022,Abandoned,Russia,2024-03-26,"Terny, Kramatorsk raion","49.0919489,38.0292211",20th Combined Arms Army,"-937,-947,-938,-957,-956"
14154,25890,Infantry fighting vehicles,BMP-2(K),Captured,Russia,2022-07-23,"Ivanivka, Beryslav raion","47.475077,33.386141",,-937


In [8]:
# Print a summary of the DataFrame: columns, non-null counts, dtypes, memory use
losses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18382 entries, 0 to 18381
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                18382 non-null  int64 
 1   type              18382 non-null  object
 2   model             18382 non-null  object
 3   status            18382 non-null  object
 4   lost_by           18382 non-null  object
 5   date              18382 non-null  object
 6   nearest_location  17316 non-null  object
 7   geo               10714 non-null  object
 8   unit              3566 non-null   object
 9   tags              11336 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.4+ MB
