In [14]:
import pandas as pd
import re

# Path to the raw losses export, relative to the notebook's working directory
file_path = 'russia_losses.csv'

# Read the dataset into a pandas DataFrame
losses_df = pd.read_csv(file_path)

# Drop the 'lost_by' and 'unit' columns, which are not used further in this notebook
losses_df = losses_df.drop(columns=['lost_by', 'unit'])

# Convert 'date' to datetime; errors='coerce' turns unparseable values into NaT
# instead of raising
losses_df['date'] = pd.to_datetime(losses_df['date'], errors='coerce')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() only adds a stray "None" line to the output
losses_df.info()

# Display the first few rows of the dataset
losses_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18382 entries, 0 to 18381
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                18382 non-null  int64         
 1   type              18382 non-null  object        
 2   model             18382 non-null  object        
 3   status            18382 non-null  object        
 4   date              18382 non-null  datetime64[ns]
 5   nearest_location  17316 non-null  object        
 6   geo               10714 non-null  object        
 7   tags              11336 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 1.1+ MB
None


Unnamed: 0,id,type,model,status,date,nearest_location,geo,tags
0,1,Tanks,T-64BV,Destroyed,2022-03-14,"Rubizhne, Sievierodonetsk raion","49.027241,38.343374",
1,2,Tanks,T-64BV,Destroyed,2022-03-16,"Mariupol, Mariupol raion","47.099125851628806,37.52371337039075","Turretless, Z, Mine plow/roller"
2,3,Tanks,T-64BV,Destroyed,2022-03-16,"Mariupol, Mariupol raion","47.09869256359657,37.52353235165147",Shattered
3,5,Tanks,T-64BV,Destroyed,2022-03-17,"Mariupol, Mariupol raion","47.098139835697424,37.640174323260645","Turretless, Z"
4,6,Tanks,T-64BV,Destroyed,2022-03-31,"Rubizhne, Sievierodonetsk raion","49.01122,38.39844",Z


In [15]:
# Placeholder columns for the location parts
for column in ('town', 'raion', 'oblast', 'front'):
    losses_df[column] = None


def _parenthetical_text(value):
    """Return the text inside the first '(...)' of value, or None if there is none."""
    if pd.isna(value):
        return None
    found = re.search(r'\((.*?)\)', str(value))
    return found.group(1) if found else None


def _without_parentheticals(value):
    """Return value with every '(...)' group removed; missing values pass through."""
    if pd.isna(value):
        return value
    return re.sub(r'\(.*?\)', '', str(value))


# Keep the parenthesised text in its own 'extra' column ...
losses_df['extra'] = losses_df['nearest_location'].map(_parenthetical_text)

# ... and remove it from 'nearest_location'
losses_df['nearest_location'] = losses_df['nearest_location'].apply(_without_parentheticals)

In [16]:
# Split 'nearest_location' on commas: location[0] is the first part, location[1] the second
location = losses_df['nearest_location'].str.split(',', expand=True)

# Lower-case substrings that classify a location part.
RAION_KEYWORDS = (' raion',)
# ' crimea' has no trailing space here: with one, a first part that ends in
# "... Crimea" could never match, unlike the same check on the second part.
OBLAST_KEYWORDS = (' oblast', ' krai', ' crimea', ' sea')
# NOTE(review): the first part accepts any 'ukraine', the second part only
# 'east ukraine' -- kept as found; confirm whether the difference is intended.
FRONT_KEYWORDS_FIRST_PART = ('ukraine', 'russia')
FRONT_KEYWORDS_SECOND_PART = ('east ukraine', 'russia')
EXTRA_OBLAST_KEYWORDS = (' oblast', ' krai')
# NOTE(review): ' crimea' and ' sea' are not excluded, so such a first part is
# stored as both oblast and town -- kept as found; confirm whether intended.
NOT_TOWN_KEYWORDS = (' raion', ' oblast', ' krai', 'ukraine', 'russia')


def _keep_if(part, keywords, present=True):
    """Return part if it does (present=True) or does not (present=False) contain
    any of the keywords, compared case-insensitively; otherwise return None.

    Missing values always give None.
    """
    if pd.isna(part):
        return None
    lowered = str(part).lower()
    has_keyword = any(keyword in lowered for keyword in keywords)
    return part if has_keyword == present else None


def _select(parts, keywords, present=True):
    """Apply _keep_if to every element of the Series parts and trim whitespace.

    Both ends are stripped: the split leaves a leading space on the second part,
    and removing a "(...)" suffix from 'nearest_location' can leave a trailing one.
    """
    return parts.map(lambda part: _keep_if(part, keywords, present)).str.strip()


# Each column prefers the first location part and falls back to the second
losses_df['raion'] = _select(location[0], RAION_KEYWORDS).combine_first(
    _select(location[1], RAION_KEYWORDS))

# 'oblast' additionally falls back to the parenthesised text kept in 'extra'
losses_df['oblast'] = (
    _select(location[0], OBLAST_KEYWORDS)
    .combine_first(_select(location[1], OBLAST_KEYWORDS))
    .combine_first(_select(losses_df['extra'], EXTRA_OBLAST_KEYWORDS))
)

losses_df['front'] = _select(location[0], FRONT_KEYWORDS_FIRST_PART).combine_first(
    _select(location[1], FRONT_KEYWORDS_SECOND_PART))

# A first part that names no raion, oblast, krai or front is treated as the town
losses_df['town'] = _select(location[0], NOT_TOWN_KEYWORDS, present=False)

# Split 'geo' ("latitude,longitude") and convert both parts to numbers rounded to
# 6 decimals. Taking parts 0 and 1 explicitly avoids the length-mismatch error a
# two-column assignment raises when some value splits into more than two parts.
coordinates = losses_df['geo'].str.split(',', expand=True)
losses_df['lat'] = pd.to_numeric(coordinates[0], errors='coerce').round(6)
losses_df['lon'] = pd.to_numeric(coordinates[1], errors='coerce').round(6)

# Drop 'nearest_location', 'extra' and 'geo' and put the columns in their final order
losses_df = losses_df.reindex(columns=['id', 'type', 'model', 'status', 'date', 'front', 'oblast', 'raion', 'town', 'lat', 'lon', 'tags'])


In [17]:
# Preview the first rows with the rebuilt front/oblast/raion/town and lat/lon columns
losses_df.head()

Unnamed: 0,id,type,model,status,date,front,oblast,raion,town,lat,lon,tags
0,1,Tanks,T-64BV,Destroyed,2022-03-14,,,Sievierodonetsk raion,Rubizhne,49.027241,38.343374,
1,2,Tanks,T-64BV,Destroyed,2022-03-16,,,Mariupol raion,Mariupol,47.099126,37.523713,"Turretless, Z, Mine plow/roller"
2,3,Tanks,T-64BV,Destroyed,2022-03-16,,,Mariupol raion,Mariupol,47.098693,37.523532,Shattered
3,5,Tanks,T-64BV,Destroyed,2022-03-17,,,Mariupol raion,Mariupol,47.09814,37.640174,"Turretless, Z"
4,6,Tanks,T-64BV,Destroyed,2022-03-31,,,Sievierodonetsk raion,Rubizhne,49.01122,38.39844,Z


In [4]:
# debugging code to display selected rows
# losses_df.iloc[14150:14155]

In [5]:
# debugging code to display selected rows
# losses_df.iloc[3636:3650]

Load the REGIONS and UNIQUE_LOCATIONS data to generate `regions_dict` and `locations_dict`

In [6]:
# Reference tables: the front/oblast/raion hierarchy and the per-raion 'unique' entries
regions_path = 'regions.csv'
unique_locations_path = 'unique_locations.csv'

regions = pd.read_csv(regions_path)
unique_locations = pd.read_csv(unique_locations_path)


def _raions_by_oblast(front_rows):
    """Map every oblast in front_rows to the list of its raions."""
    return front_rows.groupby('oblast')['raion'].apply(list).to_dict()


# regions_dict: front -> {oblast -> [raion, ...]}
regions_dict = {
    front: _raions_by_oblast(front_rows)
    for front, front_rows in regions.groupby('front')
}

# locations_dict: raion -> value of the 'unique' column
unique_by_raion = unique_locations.set_index('raion')['unique']
locations_dict = unique_by_raion.to_dict()

In [7]:
# Debugging helper: track raions and towns that could not be resolved
missing_raions = []
missing_towns = []


def _normalise(text):
    """Return text stripped of surrounding whitespace and lower-cased for matching."""
    return text.strip().lower()


# Build the lookups once instead of re-normalising every raion list for every row.
# setdefault keeps the first entry met while walking regions_dict in order, which is
# the same winner a nested first-match search over regions_dict would pick.
raion_lookup = {}    # normalised raion  -> (oblast, front)
oblast_lookup = {}   # normalised oblast -> front
for region_front, oblast_dict in regions_dict.items():
    for region_oblast, raions in oblast_dict.items():
        if isinstance(region_oblast, str):
            oblast_lookup.setdefault(_normalise(region_oblast), region_front)
        for region_raion in raions:
            if isinstance(region_raion, str):
                raion_lookup.setdefault(_normalise(region_raion), (region_oblast, region_front))

# (raion, normalised 'unique' text) pairs in locations_dict order; non-string
# 'unique' values (e.g. NaN) are skipped because they cannot be searched
unique_lookup = [
    (locations_raion, _normalise(unique))
    for locations_raion, unique in locations_dict.items()
    if isinstance(unique, str)
]

# Loop through "losses_df" to populate missing raions, oblasts and fronts
for index, row in losses_df.iterrows():
    oblast = row['oblast']
    raion = row['raion']
    town = row['town']

    # No raion yet: take the first raion whose 'unique' text contains the town.
    # NOTE(review): this is a substring test, so a short town name also matches
    # inside a longer one -- confirm that this is acceptable for the data.
    if pd.isna(raion) and pd.notna(town):
        town_key = _normalise(town)
        # An empty string is a substring of everything; never match on it
        if town_key:
            for locations_raion, unique_text in unique_lookup:
                if town_key in unique_text:
                    losses_df.at[index, 'raion'] = locations_raion
                    raion = locations_raion
                    break  # Exit the loop once a match is found

    if pd.isna(oblast):
        # Oblast missing: derive oblast and front from the raion
        match = raion_lookup.get(_normalise(raion)) if pd.notna(raion) else None
        if match is not None:
            losses_df.at[index, 'oblast'], losses_df.at[index, 'front'] = match
        elif pd.notna(raion):
            # Still unmatched: log the raion, or failing that the town
            missing_raions.append(raion)
        elif pd.notna(town):
            missing_towns.append(town)
    else:
        # Oblast present: use it to determine the front
        region_front = oblast_lookup.get(_normalise(oblast))
        if region_front is not None:
            losses_df.at[index, 'front'] = region_front

# Debug: Output unmatched raions and towns
if missing_raions:
    print("Unmatched raions:", set(missing_raions))
if missing_towns:
    print("Unmatched towns:", set(missing_towns))

# Output the updated DataFrame
losses_df.head()

Unnamed: 0,id,type,model,status,date,front,oblast,raion,town,lat,lon,tags
0,1,Tanks,T-64BV,Destroyed,2022-03-14,East Ukraine,Luhansk oblast,Sievierodonetsk raion,Rubizhne,49.027241,38.343374,
1,2,Tanks,T-64BV,Destroyed,2022-03-16,East Ukraine,Donetsk oblast,Mariupol raion,Mariupol,47.099126,37.523713,"Turretless, Z, Mine plow/roller"
2,3,Tanks,T-64BV,Destroyed,2022-03-16,East Ukraine,Donetsk oblast,Mariupol raion,Mariupol,47.098693,37.523532,Shattered
3,5,Tanks,T-64BV,Destroyed,2022-03-17,East Ukraine,Donetsk oblast,Mariupol raion,Mariupol,47.09814,37.640174,"Turretless, Z"
4,6,Tanks,T-64BV,Destroyed,2022-03-31,East Ukraine,Luhansk oblast,Sievierodonetsk raion,Rubizhne,49.01122,38.39844,Z


In [8]:
# debugging code to display selected rows
# losses_df.iloc[8870:8888]

In [9]:
# debugging code
# Filter rows where 'oblast' column is None (or NaN)
#     empty_rows = losses_df[losses_df['front'].isna()]

# Output the filtered rows
#   empty_rows.to_csv('townsuknown.csv', index=False, encoding='utf-8')
#   print(empty_rows)

In [10]:
# debugging code
# losses_df.iloc[16800:16815]

In [11]:
# Load the tag index table (tags_index.csv); its 'id' and 'tag' columns are read
# later to translate tag names into ids
tags_path = 'tags_index.csv'
tags_df = pd.read_csv(tags_path)

In [12]:
# Map each tag name to its id. Keys are normalised (stripped, lower-cased) the same
# way the tags in the data are before lookup, so the index file does not have to be
# pre-normalised for a match to succeed.
tag_to_index = {str(tag).strip().lower(): idx for idx, tag in tags_df[['id', 'tag']].values}

# Debugging helper: tags found in the data that have no entry in the index
unmatched_tags = set()


def replace_tags_with_indices(tag_string):
    """Translate a comma-separated string of tag names into comma-separated tag ids.

    Each tag is stripped and lower-cased before it is looked up in tag_to_index.
    Tags without an entry are left out of the result and recorded in
    unmatched_tags so the omission is visible. Missing values (NaN/None) are
    returned unchanged.
    """
    if pd.isna(tag_string):
        return tag_string

    indices = []
    for tag in str(tag_string).split(','):
        key = tag.strip().lower()
        if key in tag_to_index:
            indices.append(str(tag_to_index[key]))
        elif key:
            unmatched_tags.add(key)
    return ','.join(indices)


# Apply the transformation to the 'tags' column.
# NOTE: re-running this cell on already converted ids finds no matches and empties
# the column; re-run the notebook from the top instead.
losses_df['tags'] = losses_df['tags'].apply(replace_tags_with_indices)

# Debug: report tags that were dropped because the index does not know them
if unmatched_tags:
    print("Unmatched tags:", unmatched_tags)

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line)
losses_df.info()

# Display the first few rows of the updated DataFrame
losses_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18382 entries, 0 to 18381
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      18382 non-null  int64         
 1   type    18382 non-null  object        
 2   model   18382 non-null  object        
 3   status  18382 non-null  object        
 4   date    18382 non-null  datetime64[ns]
 5   front   17311 non-null  object        
 6   oblast  16952 non-null  object        
 7   raion   14038 non-null  object        
 8   town    13075 non-null  object        
 9   lat     10714 non-null  float64       
 10  lon     10714 non-null  float64       
 11  tags    11336 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(8)
memory usage: 1.7+ MB
None


Unnamed: 0,id,type,model,status,date,front,oblast,raion,town,lat,lon,tags
0,1,Tanks,T-64BV,Destroyed,2022-03-14,East Ukraine,Luhansk oblast,Sievierodonetsk raion,Rubizhne,49.027241,38.343374,
1,2,Tanks,T-64BV,Destroyed,2022-03-16,East Ukraine,Donetsk oblast,Mariupol raion,Mariupol,47.099126,37.523713,"-941,-937,-948"
2,3,Tanks,T-64BV,Destroyed,2022-03-16,East Ukraine,Donetsk oblast,Mariupol raion,Mariupol,47.098693,37.523532,-944
3,5,Tanks,T-64BV,Destroyed,2022-03-17,East Ukraine,Donetsk oblast,Mariupol raion,Mariupol,47.09814,37.640174,"-941,-937"
4,6,Tanks,T-64BV,Destroyed,2022-03-31,East Ukraine,Luhansk oblast,Sievierodonetsk raion,Rubizhne,49.01122,38.39844,-937


In [13]:
# Write the cleaned frame to disk as UTF-8; index=False leaves out the row index column
losses_df.to_csv('russia_losses_cleaned.csv', index=False, encoding='utf-8')