# In [None]:
import re
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Entry page of the casualties section; it is the first page fetched by
# master_scraper_to_excel and the page the per-month links are read from.
BASE_URL = "https://index.minfin.com.ua/en/russian-invading/casualties/"
# HTTP headers sent with every request issued by get_html_content.
# A desktop-browser User-Agent is used (presumably so the site serves the
# regular HTML page to the scraper -- confirm if the site's policy changes).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}



def get_html_content(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 15 second
    timeout. Any ``requests`` failure (connection error, timeout, or an HTTP
    error status raised by ``raise_for_status``) is reported on stdout and
    turned into a ``None`` return so callers can skip the page.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The decoded response body, or ``None`` if the request failed.
    """
    try:
        response = requests.get(url, timeout=15, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Report the failure instead of swallowing it silently, so a missing
        # month in the output can be traced back to the failed request.
        print(f"   [WARN] Request failed for {url}: {e}")
        return None
    return response.text

def find_all_month_links_updated(soup_obj, base_url):
    """Collect the per-month archive links found on a parsed page.

    Looks at every ``<div class="ajaxmonth">`` and reads the ``<a>`` inside
    its ``<h4 class="normal">`` heading. Divs without such a heading, without
    a link, or whose link has no ``href`` are skipped.

    Args:
        soup_obj: Parsed page (a BeautifulSoup-like object exposing
            ``find_all``).
        base_url: URL of the page ``soup_obj`` was parsed from; hrefs are
            resolved against it.

    Returns:
        A dict mapping the link text (month name) to
        ``{'url': <absolute URL>, 'code': <last path segment of that URL>}``.
    """
    month_links = {}

    for month_div in soup_obj.find_all('div', class_='ajaxmonth'):
        h4_element = month_div.find('h4', class_='normal')
        if not h4_element:
            continue

        link_element = h4_element.find('a')
        if not link_element:
            continue

        relative_link = link_element.get('href')
        if not relative_link:
            # An <a> without href cannot be followed.
            continue

        # urljoin handles root-relative ("/en/..."), page-relative and
        # absolute hrefs; plain concatenation produced "host//en/..." for
        # root-relative links.
        full_url = urljoin(base_url, relative_link)

        # Last non-empty path segment, so the code is found whether or not
        # the href ends with a slash.
        path_segments = [seg for seg in urlparse(full_url).path.split('/') if seg]
        if not path_segments:
            continue

        month_name = link_element.text.strip()
        month_links[month_name] = {'url': full_url, 'code': path_segments[-1]}

    return month_links

def extract_all_daily_data(html_content, month_code=None):
    """Parse daily casualty figures out of an HTML fragment.

    Every ``<li class="gold">`` is treated as one day. Its date comes from the
    ``<span class="black">`` inside it and its figures from the ``<li>`` items
    of the nested ``<div class="casualties">``. Each item is expected to look
    like ``<label> — <total> (...)``; only the part before the first ``(`` is
    used for the total, and every non-digit character in it is dropped.

    Args:
        html_content: HTML text to parse.
        month_code: Accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with keys ``'Date'``, ``'Equipment type'`` and
        ``'Total'`` (int), one per item that yielded a number.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')
    records = []

    for day_block in parsed_page.find_all('li', class_='gold'):
        date_tag = day_block.find('span', class_='black')
        day_label = date_tag.text.strip() if date_tag else None
        if not day_label:
            # No usable date -> nothing to attribute the figures to.
            continue

        losses_box = day_block.find('div', class_='casualties')
        if not losses_box:
            continue

        for entry in losses_box.find_all('li'):
            entry_text = entry.text.strip()
            if '—' not in entry_text:
                continue

            label_part, amount_part = entry_text.split('—', 1)

            # Prefer the <abbr> text as the category name when present;
            # otherwise collapse any label mentioning missiles to one name.
            category = label_part.strip().replace('\xa0', ' ')
            abbr_tag = entry.find('abbr')
            if abbr_tag:
                category = abbr_tag.text.strip()
            elif 'missiles' in category:
                category = 'Cruise missiles'

            # Keep only the text before "(" and strip everything that is
            # not a digit.
            before_paren = amount_part.split('(')[0].strip()
            digits_only = re.sub(r'[^\d]', '', before_paren)
            if not digits_only:
                continue

            try:
                amount = int(digits_only)
            except ValueError:
                continue

            records.append({
                'Date': day_label,
                'Equipment type': category,
                'Total': amount
            })

    return records

def fetch_full_month_data(month_code, initial_html, month_name):
    """Collect all daily records of one month, following the AJAX pagination.

    The records already present in ``initial_html`` are parsed first. Further
    days are then requested from the ``day_by_day.php`` endpoint, passing the
    number of days collected so far as ``offset``, until a request fails,
    returns no ``li class="gold"`` section, brings no new day, or 20 requests
    have been made.

    Args:
        month_code: Month identifier sent to the endpoint as ``month`` and
            ``offset_month`` (e.g. ``"2025-11"``).
        initial_html: HTML of the month page, or a falsy value if it could
            not be fetched.
        month_name: Human-readable month name, used only in log output.

    Returns:
        A list of record dicts as produced by ``extract_all_daily_data``.
    """
    all_month_data = []

    if initial_html:
        all_month_data.extend(extract_all_daily_data(initial_html, month_code))

    # extract_all_daily_data returns one record per (day, category), so the
    # number of days is the number of distinct dates, not the record count.
    # NOTE(review): the endpoint's offset is assumed to be day-based (the
    # endpoint is named day_by_day and the log messages speak of days) --
    # confirm against the site.
    seen_dates = {row['Date'] for row in all_month_data}
    offset = len(seen_dates)

    if offset > 0:
        print(f"   [INFO - {month_name}] Initial fetch of {offset} days. Looking for more data...")

    max_ajax_requests = 20

    for _ in range(max_ajax_requests):
        ajax_url = f"https://index.minfin.com.ua/ajax/casualties/day_by_day.php?month={month_code}&offset={offset}&offset_month={month_code}"

        ajax_html = get_html_content(ajax_url)

        if not ajax_html or 'li class="gold"' not in ajax_html:
            break

        additional_data = extract_all_daily_data(ajax_html, month_code)

        # Stop as soon as a page brings no unseen day; this also prevents
        # re-requesting the same page until the request cap is reached.
        new_dates = {row['Date'] for row in additional_data} - seen_dates
        if not new_dates:
            break

        all_month_data.extend(
            row for row in additional_data if row['Date'] in new_dates
        )
        seen_dates |= new_dates
        offset += len(new_dates)

        print(f"   [INFO - {month_name}] Fetched {len(new_dates)} new days. Total days: {offset}")

        # Short pause between requests to avoid hammering the server.
        time.sleep(0.5)

    return all_month_data

def _month_code_from_title(month_title):
    """Return 'YYYY-MM' for a title like 'November 2025', else None."""
    try:
        return datetime.strptime(month_title, "%B %Y").strftime("%Y-%m")
    except ValueError:
        return None


def master_scraper_to_excel(current_month_code=None,
                            excel_file_name='russia_military_casualties.xlsx'):
    """Scrape all months of casualty data and write them to an Excel file.

    Fetches ``BASE_URL``, collects the month shown there (including its AJAX
    pages), then follows every month link found on that page and does the
    same. The combined records are de-duplicated and written to
    ``excel_file_name``.

    Args:
        current_month_code: ``'YYYY-MM'`` code of the month shown on the base
            page. When ``None`` it is derived from the page title, falling
            back to the current local date if the title cannot be parsed.
        excel_file_name: Path of the Excel file to write.

    Returns:
        A DataFrame with columns ``Date``, ``Equipment type`` and ``Total``;
        an empty DataFrame if the base page could not be fetched or nothing
        was scraped (no file is written in that case).
    """
    print(f"1. Fetching base URL: {BASE_URL}")
    initial_html = get_html_content(BASE_URL)

    if not initial_html:
        print("Initial HTML could not be fetched. Program stopped.")
        return pd.DataFrame()

    soup = BeautifulSoup(initial_html, 'html.parser')
    all_casualties_data = []

    try:
        current_month_title = soup.find('div', class_='title').find('h1').text.strip().split('for')[1].split('by')[0].strip()
    except (AttributeError, IndexError):
        # Missing title div / h1 (AttributeError on None) or an unexpected
        # heading wording (IndexError on split) -> use a generic label.
        current_month_title = "Current Month"

    if current_month_code is None:
        # A hard-coded month goes stale; prefer the month named on the page
        # and fall back to today's date when the title does not parse.
        current_month_code = (
            _month_code_from_title(current_month_title)
            or datetime.now().strftime("%Y-%m")
        )

    print(f"\n2.  **{current_month_title}** month data (with full AJAX) is being fetched...")

    current_month_data = fetch_full_month_data(current_month_code, initial_html, current_month_title)
    all_casualties_data.extend(current_month_data)
    print(f"Final {len(current_month_data)} daily records added for **{current_month_title}**.")

    print("\n3. Finding links and codes for all old months...")
    all_month_links = find_all_month_links_updated(soup, BASE_URL)

    for month_name, link_info in all_month_links.items():
        url = link_info['url']
        month_code = link_info['code']

        print(f"\n4.  **{month_name}** month data (with full AJAX) is being fetched...")

        month_initial_html = get_html_content(url)

        # A month whose page cannot be fetched is skipped.
        if month_initial_html:
            month_data = fetch_full_month_data(month_code, month_initial_html, month_name)
            all_casualties_data.extend(month_data)
            print(f"Final {len(month_data)} daily records added for **{month_name}**.")

    df = pd.DataFrame(all_casualties_data)

    if not df.empty:
        # The base page and the month links can overlap, so drop repeats.
        df = df[['Date', 'Equipment type', 'Total']].drop_duplicates()
        df.to_excel(excel_file_name, index=False)

        print("\n" + ("="*70))
        print(f" **Project Successfully Completed!**")
        print(f" A total of **{len(df)}** unique records were scraped and written to '{excel_file_name}'.")
        print(" Your Excel file is in the same folder.")
        print("="*70)
    else:
        print("\n Warning: No data was scraped. Excel file was not created.")

    return df

# Run the full scrape only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    master_scraper_to_excel()