In [50]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# URL of the webpage
url = 'https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html'

# Fetch the page content
response = requests.get(url)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# List to store equipment data with categories
equipment_data = []

# Find all <span> elements with a class of 'mw-headline' and corresponding <ul> tags
for span in soup.find_all('span', class_='mw-headline'):
    section_name = span.get_text(strip=True)
    
    if span.find_parent('h3'):        
        ul_tag = span.find_parent('h3').find_next('ul')  # Find the <ul> items under <h3>
        
        if ul_tag: # Extract the <li> items
            for li in ul_tag.find_all('li'):
                raw_text = li.get_text(strip=True)
                cleaned_text = raw_text.split(':')[0]  # Only keep the text before the semicolon
                equipment_data.append([section_name, cleaned_text]) # Add the equipment along with its category

# Convert the equipment data into a DataFrame
equipment_df = pd.DataFrame(equipment_data, columns=['Category', 'Equipment'])

# Display the DataFrame
print(equipment_df)

                        Category              Equipment
0                          Tanks              2 T-54-3M
1                          Tanks                2 T-54B
2                          Tanks                3 T-55A
3                          Tanks      1 T-55A Obr. 1981
4                          Tanks      5 Unknown T-54/55
..                           ...                    ...
294  Trucks, Vehicles, and Jeeps   1 UAZ-515195 'Esaul'
295  Trucks, Vehicles, and Jeeps           21 UAZ-39094
296  Trucks, Vehicles, and Jeeps  6 Unknown fuel tanker
297  Trucks, Vehicles, and Jeeps    270 (Unknown) truck
298  Trucks, Vehicles, and Jeeps   57 (Unknown) vehicle

[299 rows x 2 columns]


In [51]:
equipment_df = equipment_df.copy()

In [52]:
# Drop rows containing "Unknown" (case-insensitive)
equipment_df = equipment_df[~equipment_df['Equipment'].str.contains('Unknown', case=False, na=False)]

In [53]:
# Keep only the text after the first space in each row (drop the count number)
equipment_df['Equipment'] = equipment_df['Equipment'].str.split(' ', n=1).str[1]
# Keep only the text before the first space in each row 
equipment_df['Equipment'] = equipment_df['Equipment'].str.split(' ', n=1).str[0]

# Remove duplicate values in the 'Equipment' column
clean_equipment_df = equipment_df.drop_duplicates(subset=['Equipment'], keep='first')

clean_equipment_df

Unnamed: 0,Category,Equipment
0,Tanks,
3,Tanks,T-55A
5,Tanks,T-62
7,Tanks,T-62M
9,Tanks,T-62MV
...,...,...
291,"Trucks, Vehicles, and Jeeps",UAZ-23632pickup
292,"Trucks, Vehicles, and Jeeps",UAZ-23632-148-64
293,"Trucks, Vehicles, and Jeeps",UAZ-394511
294,"Trucks, Vehicles, and Jeeps",'Esaul'


In [25]:
#save the DataFrame to a CSV file
clean_equipment_df.to_csv('equipment_by_category.csv', index=False)