In [16]:
import pandas as pd
import re

# Path to the input CSV (relative path, resolved against the working directory)
file_path = 'russia_losses.csv'

# Load the file and discard the 'lost_by' and 'unit' columns in a single
# expression, so every run of this cell rebuilds `losses` from disk.
losses = pd.read_csv(file_path).drop(columns=['lost_by', 'unit'])

# Preview the first rows of the trimmed frame
losses.head()

Unnamed: 0,id,type,model,status,date,nearest_location,geo,tags
0,1,Tanks,T-64BV,Destroyed,2022-03-14,"Rubizhne, Sievierodonetsk raion","49.027241,38.343374",
1,2,Tanks,T-64BV,Destroyed,2022-03-16,"Mariupol, Mariupol raion","47.099125851628806,37.52371337039075","Turretless, Z, Mine plow/roller"
2,3,Tanks,T-64BV,Destroyed,2022-03-16,"Mariupol, Mariupol raion","47.09869256359657,37.52353235165147",Shattered
3,5,Tanks,T-64BV,Destroyed,2022-03-17,"Mariupol, Mariupol raion","47.098139835697424,37.640174323260645","Turretless, Z"
4,6,Tanks,T-64BV,Destroyed,2022-03-31,"Rubizhne, Sievierodonetsk raion","49.01122,38.39844",Z


In [26]:
import numpy as np

# Expand each comma-separated 'tags' string into one column per tag position
tags = losses['tags'].str.split(',', expand=True)

# Walk every cell of the expanded frame, discard the missing ones, then trim
# surrounding whitespace and lower-case so spelling variants collapse together
tags_flattened = (
    pd.Series(tags.values.ravel(order='K'))
    .dropna()
    .astype(str)
    .str.strip()
    .str.lower()
    .tolist()
)

# np.unique both deduplicates and sorts; .tolist() yields plain Python strings
tags_unique_list = np.unique(tags_flattened).tolist()

print(tags_unique_list)

['+', 'additional armour', 'armour kit missing', 'cope cage', 'damaged', 'expanded cabin', 'flipped', 'heli', 'improvised', 'jammer', 'kung', 'loitering', 'mine plow/roller', 'o', 'pl-1', 'removed turret', 'shattered', 'stugna', 't', 'turretless', 'turtle', 'underwater', 'v', 'z', '∆', '∇', '∧', '▲', '◇', '☐', '✕', '⧸']


In [28]:
# Build a lookup table with one row per unique tag; reset_index() exposes the
# positional row number as an explicit 'index' column placed before 'tag'.
tags_df = pd.DataFrame({'tag': tags_unique_list}).reset_index()
tags_df

Unnamed: 0,index,tag
0,0,+
1,1,additional armour
2,2,armour kit missing
3,3,cope cage
4,4,damaged
5,5,expanded cabin
6,6,flipped
7,7,heli
8,8,improvised
9,9,jammer


In [33]:
# Persist the tag lookup table (columns 'index' and 'tag') as UTF-8 CSV,
# without writing the DataFrame's own row index.
# NOTE(review): the file name 'tag_idex.csv' looks like a typo for
# 'tag_index.csv' - confirm what downstream readers expect before renaming.
tags_df.to_csv(
    'tag_idex.csv',
    encoding='utf-8',
    index=False,
)

In [30]:
# Lookup from normalized tag text to the integer stored in tags_df['index']
tag_to_index = dict(zip(tags_df['tag'], tags_df['index']))

# Translate a comma-separated tag string into comma-separated tag indices
def replace_tags_with_indices(tag_string):
    """Convert one 'tags' cell into a comma-joined string of tag indices.

    Each comma-separated piece is stripped and lower-cased, then looked up in
    the module-level ``tag_to_index`` mapping. Pieces that are not keys of the
    mapping are silently skipped.

    Parameters
    ----------
    tag_string : str or missing value
        Raw cell content, e.g. ``"Turretless, Z"``.

    Returns
    -------
    str or missing value
        Indices joined by ``','`` in the original tag order (an empty string
        when no piece is recognised). Missing input is returned unchanged.
    """
    # Missing cells are passed through untouched
    if pd.isna(tag_string):
        return tag_string

    matched_indices = []
    for piece in tag_string.split(','):
        key = piece.strip().lower()  # same normalization used to build the mapping keys
        if key in tag_to_index:
            matched_indices.append(str(tag_to_index[key]))
    return ','.join(matched_indices)

# Rewrite every 'tags' cell as its comma-joined tag indices.
# NOTE(review): this overwrites losses['tags'] in place, so running the cell a
# second time feeds the index strings back through the lookup instead of the
# original tag names - re-run the loading cell first if this must be repeated.
losses['tags'] = losses['tags'].map(replace_tags_with_indices)

# Show the first rows of the converted frame
print(losses.head())

   id   type   model     status        date                 nearest_location  \
0   1  Tanks  T-64BV  Destroyed  2022-03-14  Rubizhne, Sievierodonetsk raion   
1   2  Tanks  T-64BV  Destroyed  2022-03-16         Mariupol, Mariupol raion   
2   3  Tanks  T-64BV  Destroyed  2022-03-16         Mariupol, Mariupol raion   
3   5  Tanks  T-64BV  Destroyed  2022-03-17         Mariupol, Mariupol raion   
4   6  Tanks  T-64BV  Destroyed  2022-03-31  Rubizhne, Sievierodonetsk raion   

                                     geo      tags  
0                    49.027241,38.343374       NaN  
1   47.099125851628806,37.52371337039075  19,23,12  
2    47.09869256359657,37.52353235165147        16  
3  47.098139835697424,37.640174323260645     19,23  
4                      49.01122,38.39844        23  


In [31]:
# Debugging aid: show the rows at positions 14150-14154 (all columns) to
# spot-check the converted 'tags' values on rows carrying several tags.
losses.iloc[14150:14155, :]

Unnamed: 0,id,type,model,status,date,nearest_location,geo,tags
14150,25886,Infantry fighting vehicles,MT-LB,Destroyed,2024-04-18,"Volodymyrivka, Volnovakha raion","47.779417,37.417667",35
14151,25887,Infantry fighting vehicles,MT-LB,Destroyed,2024-04-18,"Volodymyrivka, Volnovakha raion","47.779611,37.418556",5
14152,25888,Tanks,T-62M,Abandoned,2024-04-18,"Volodymyrivka, Volnovakha raion","47.779177,37.417521",221234291
14153,25889,Tanks,T-72B Obr. 2022,Abandoned,2024-03-26,"Terny, Kramatorsk raion","49.0919489,38.0292211",23132234
14154,25890,Infantry fighting vehicles,BMP-2(K),Captured,2022-07-23,"Ivanivka, Beryslav raion","47.475077,33.386141",23
