In [1]:
import requests
import time
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from processed_data import process_population_data, save_to_excel

In [5]:
def get_years(timeout=30):
    """Fetch the population filter metadata from the CBS boards generator.

    Sends a POST request to the ``YishuvimHandler.ashx`` endpoint in
    ``Filters`` mode and returns the decoded JSON body. The notebook reads
    the ``"Years"`` entry of that payload.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed JSON response, or ``None`` when the request fails, the
        server answers with a non-200 status, or the body is not valid JSON.
    """
    url = (
        "https://boardsgenerator.cbs.gov.il/Handlers/WebParts/YishuvimHandler.ashx"
        "?dataMode=Yeshuv&language=English&mode=Filters&subject=Population"
    )

    try:
        response = requests.post(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request error: {e}")
        return None

    # Anything other than 200 is reported and treated as "no data".
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        return response.json()
    except ValueError as e:
        # Depending on the installed requests version, a malformed body
        # surfaces as a plain ValueError rather than a RequestException, so
        # it is handled explicitly instead of escaping to the caller.
        print(f"Request error: invalid JSON in response: {e}")
        return None

In [6]:
# Keep only the list of years and flip the order in which the API returned it.
years = get_years()["Years"][::-1]

In [7]:
# Display the reversed list of available years.
years

[2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

In [5]:
def get_token(year, timeout=30):
    """Request an Excel export token for one year of population data.

    Posts the export form to ``ExportData.ashx`` with a ``GridData`` query
    filtered on ``year``. The response body is returned as-is; the notebook
    appends it to the download URL as the ``token`` parameter.

    Args:
        year: Year inserted into the ``"Years"`` filter of the query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None`` (non-200 status or
        a request failure).
    """
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }

    data = {
        "mode": "excel",
        "drillDownMode": "undefined",
        "board": "WebPartYishuvim",
        "query": '{"mode":"GridData","dataMode":"Yeshuv","subject":"Population","filters":{"Years":'+str(year)+'},"filtersearch":"","pageNumber":1,"search":"","language":"English"}',
        "filters": "",
        "cartDataImage": "",
        "cartDataImageOnly": "undefined",
        "lang": "English",
    }

    try:
        token_response = requests.post(
            "https://boardsgenerator.cbs.gov.il/Handlers/General/ExportData.ashx",
            headers=headers,
            data=data,
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Only network/HTTP-layer failures are handled here, so interrupting
        # the kernel (KeyboardInterrupt) is no longer swallowed.
        print(f"Token Error: {e}")
        return None

    if token_response.status_code != 200:
        print(f"Token Error: HTTP {token_response.status_code} for year {year}")
        return None

    return token_response.text

In [6]:
def get_file_size(file_content):
    """Return the size of ``file_content.content`` in kilobytes (bytes / 1024).

    ``file_content`` is any object exposing a ``content`` attribute that
    supports ``len()``; the notebook passes a download response.
    """
    bytes_per_kb = 1024
    payload = file_content.content
    return len(payload) / bytes_per_kb

In [7]:
def automateExtractionData(year, max_retries=5, output_dir=None, retry_delay=5, timeout=60):
    """Download the population workbook for ``year`` and save it as ``data_<year>.xlsx``.

    Every attempt asks ``get_token`` for a fresh export token and downloads
    the file from ``Downloader.ashx``. A download is accepted only when the
    response is HTTP 200 with a non-empty body larger than 3 KB (smaller
    bodies are presumably error pages or empty workbooks — TODO confirm).
    A missing token or a request error counts as a failed attempt and is
    retried instead of aborting the whole download.

    Args:
        year: Year to download.
        max_retries: Maximum number of attempts.
        output_dir: Directory the workbook is written to; created when
            missing. ``None`` keeps the notebook's original location.
        retry_delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the download request.

    Returns:
        ``True`` when the file was saved, ``False`` otherwise. Errors are
        printed rather than raised.
    """
    if output_dir is None:
        # Original hard-coded location, kept as the default so existing
        # calls keep writing to the same folder.
        output_dir = "C:\\Users\\tioua\\OneDrive\\Desktop\\BI PROJECT\\Data\\NumberOfInhabitants"

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0'}
    download_url = 'https://boardsgenerator.cbs.gov.il/Handlers/General/Downloader.ashx?type=excel&token='

    try:
        for attempt in range(1, max_retries + 1):
            download_response = None
            try:
                token = get_token(year)
                # get_token returns None on failure; concatenating that to
                # the URL would raise TypeError and skip all remaining retries.
                if token:
                    download_response = requests.get(download_url + token, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                print(f'Request error for year {year}: {e}')

            if (
                download_response is not None
                and download_response.status_code == 200
                and download_response.content
                and get_file_size(download_response) > 3
            ):
                # Save the downloaded file
                os.makedirs(output_dir, exist_ok=True)
                file_path = os.path.join(output_dir, f'data_{year}.xlsx')
                with open(file_path, 'wb') as file:
                    file.write(download_response.content)
                print(f'Data for year {year} downloaded successfully')
                return True

            print(f'Failed to download data for year {year}. Retrying...')
            # No point in waiting once the last attempt has been used up.
            if attempt < max_retries:
                time.sleep(retry_delay)

        print(f'Exceeded maximum retry attempts for year {year}.')
        return False

    except Exception as e:
        # Best-effort contract: e.g. a failed file write is reported, not raised.
        print(f'Extraction Error: {e}')
        return False

In [8]:
def automateExtractionData_parallel(year_start, year_end, max_workers=5):
    """Download every year from ``year_start`` to ``year_end`` (inclusive) in parallel.

    One ``automateExtractionData`` task is submitted per year to a thread
    pool. Futures are mapped back to their year because ``as_completed``
    yields them in completion order, and ``automateExtractionData`` returns
    only a success flag.

    Args:
        year_start: First year to download.
        year_end: Last year to download (inclusive).
        max_workers: Number of worker threads.

    Returns:
        A dict mapping each year to ``True`` (downloaded) or ``False``
        (failed). Empty when ``year_start > year_end``.
    """
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_year = {
            executor.submit(automateExtractionData, year): year
            for year in range(year_start, year_end + 1)
        }

        for future in as_completed(future_to_year):
            year = future_to_year[future]
            try:
                results[year] = bool(future.result())
            except Exception as e:
                # A worker that raised is recorded as a failed year so the
                # remaining years are still collected and reported.
                print(f"Error: {e}")
                results[year] = False

    # Print the summary in year order rather than completion order.
    for year in sorted(results):
        if results[year]:
            print(f'Data for year {year} downloaded successfully')
        else:
            print(f'Failed to download data for year {year}')

    return results

In [None]:
# Run the parallel download from the first to the last entry of `years`.
automateExtractionData_parallel(years[0], years[-1])

In [2]:
def append_all_excel_files(data_directory=None, delay=1):
    """Process every yearly workbook in ``data_directory`` and hand it to ``save_to_excel``.

    The year is taken from the first run of four digits in the file name.
    Each file is passed to ``process_population_data(file_path, year)`` and
    the result to ``save_to_excel(data_frame=...)``. Files are handled in
    sorted name order so repeated runs process them in the same sequence.

    Entries that cannot be yearly workbooks are skipped instead of crashing
    the run: names without a four-digit year, sub-directories, and the
    ``~$`` lock files Excel creates while a workbook is open.

    Args:
        data_directory: Folder containing the downloaded workbooks. ``None``
            keeps the notebook's original location.
        delay: Seconds to pause after each file.
    """
    if data_directory is None:
        # Original hard-coded location, kept as the default for existing calls.
        data_directory = "C:\\Users\\tioua\\OneDrive\\Desktop\\BI PROJECT\\Data\\NumberOfInhabitants"

    for file in sorted(os.listdir(data_directory)):
        file_path = os.path.join(data_directory, file)
        year_match = re.search(r'\d{4}', file)

        if file.startswith("~$") or year_match is None or not os.path.isfile(file_path):
            print(f"Skipping {file}: not a yearly data file.")
            continue

        year = int(year_match.group())
        save_to_excel(data_frame=process_population_data(file_path, year))
        time.sleep(delay)

In [3]:
# Process each file in the data directory and pass the result to save_to_excel.
append_all_excel_files()

Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.
Data appended successfully.


2021