In [1]:
import pandas as pd
import re

# Import csv file
file_path = 'russia_losses.csv'

# Read the dataset into a pandas DataFrame
losses_df = pd.read_csv(file_path)

# Display the first few rows of the dataset
losses_df.head()

Unnamed: 0,id,type,model,status,lost_by,date,nearest_location,geo,unit,tags
0,1,Tanks,T-64BV,Destroyed,Russia,2022-03-14,"Rubizhne, Sievierodonetsk raion","49.027241,38.343374",,
1,2,Tanks,T-64BV,Destroyed,Russia,2022-03-16,"Mariupol, Mariupol raion","47.099125851628806,37.52371337039075",,"Turretless, Z, Mine plow/roller"
2,3,Tanks,T-64BV,Destroyed,Russia,2022-03-16,"Mariupol, Mariupol raion","47.09869256359657,37.52353235165147",,Shattered
3,5,Tanks,T-64BV,Destroyed,Russia,2022-03-17,"Mariupol, Mariupol raion","47.098139835697424,37.640174323260645",,"Turretless, Z"
4,6,Tanks,T-64BV,Destroyed,Russia,2022-03-31,"Rubizhne, Sievierodonetsk raion","49.01122,38.39844",,Z


In [2]:
equipment_df = losses_df[['type', 'model']]
equipment_df.columns = ['Category', 'Equipment']
equipment_df

Unnamed: 0,Category,Equipment
0,Tanks,T-64BV
1,Tanks,T-64BV
2,Tanks,T-64BV
3,Tanks,T-64BV
4,Tanks,T-64BV
...,...,...
18377,Infantry fighting vehicles,BMD-2
18378,Infantry fighting vehicles,BMP-1AM 675-sb3KDZ
18379,Infantry fighting vehicles,BMP-3
18380,Infantry fighting vehicles,BMP-2(K)


In [3]:
# Drop rows containing "Unknown" (case-insensitive)
equipment_df = equipment_df[~equipment_df['Equipment'].str.contains('Unknown', case=False, na=False)].copy()
equipment_df

Unnamed: 0,Category,Equipment
0,Tanks,T-64BV
1,Tanks,T-64BV
2,Tanks,T-64BV
3,Tanks,T-64BV
4,Tanks,T-64BV
...,...,...
18376,Infantry fighting vehicles,BTR-D
18377,Infantry fighting vehicles,BMD-2
18378,Infantry fighting vehicles,BMP-1AM 675-sb3KDZ
18379,Infantry fighting vehicles,BMP-3


In [4]:
# Keep only the text after the first space in each row (drop the count number)
# equipment_df.loc[:, 'Equipment'] = equipment_df['Equipment'].str.split(' ', n=1).str[1]

# Keep only the text before the first space in each row
equipment_df.loc[:, 'Equipment'] = equipment_df['Equipment'].str.split(' ', n=1).str[0]
# Drops the asterics " ' "
equipment_df.loc[:,'Equipment'] = equipment_df['Equipment'].str.strip("'")

# Remove duplicate values in the 'Equipment' column, 
clean_equipment_df = equipment_df.drop_duplicates(subset=['Equipment'], keep='first')


In [5]:
clean_equipment_df

Unnamed: 0,Category,Equipment
0,Tanks,T-64BV
8,Tanks,T-64A
13,Tanks,T-72A
19,Tanks,T-72B
28,Tanks,T-72AV
...,...,...
17336,Infantry fighting vehicles,BRDM-2MS
17426,Infantry mobility vehicles,STS
17603,Transport,GAZ-S41A23
18308,"Radars, jammers",55Zh6U


In [6]:
# Load the old equipment data
equipment_path = 'equipment_by_category.csv'
old_equipment_df = pd.read_csv(equipment_path)

# Concatenate the new equipment data to the old equipment data
combined_equipment_df = pd.concat([old_equipment_df, clean_equipment_df])

# Remove duplicates based on the 'Equipment' column, keeping the first occurrence
updated_equipment_df = combined_equipment_df.drop_duplicates(subset='Equipment', keep='first')
updated_equipment_df = updated_equipment_df.sort_values(by='Category', ascending=True)

# Display the result
print(updated_equipment_df.head())

print(updated_equipment_df.info())

        Category  Equipment
18318  Airplanes     Su-27P
105    Airplanes     Su-24M
106    Airplanes    Su-24MR
107    Airplanes  Su-24M/MR
108    Airplanes      Su-27


In [None]:
# Save the updated DataFrame back to a CSV file if needed
updated_equipment_df.to_csv("new_equipment_by_category.csv", index=False)