In [31]:
import pandas as pd
import numpy as np

import geopandas as gpd
import sys

import os # for file system operations
import re # for regular expressions
from tqdm import tqdm # for progress bar

sys.path.append("..")
from my_modules import my_functions as mybib

%load_ext autoreload
%autoreload 2
!pip install openpyxl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




In [32]:
# Collect the population workbooks from the input directory.
# os.listdir returns every directory entry in arbitrary order, so keep only
# real .xlsx workbooks (no hidden files, no "~$" Excel lock files) and sort
# the names to get a stable processing order on every machine.
list_xlsx_files = sorted(
    entry
    for entry in os.listdir('../data/input/population')
    if entry.lower().endswith('.xlsx') and not entry.startswith('~$')
)
list_xlsx_files

['SB_A01-16-00_2018h01_BE.xlsx',
 'SB_A01-16-00_2018h02_BE.xlsx',
 'SB_A01-16-00_2019h01_BE.xlsx',
 'SB_A01-16-00_2019h02_BE.xlsx',
 'SB_A01-16-00_2020h01_BE.xlsx',
 'SB_A01-16-00_2020h02_BE.xlsx',
 'SB_A01-16-00_2021h01_BE.xlsx',
 'SB_A01-16-00_2021h02_BE.xlsx',
 'SB_A01-16-00_2022h01_BE.xlsx']

In [33]:
def transform_number_under_10(x):
    """Return ``x`` as a string, prefixed with '0' when ``x`` is below 10.

    Examples: 1 -> '01', 9 -> '09', 10 -> '10'.
    """
    as_text = str(x)
    return '0' + as_text if x < 10 else as_text

In [34]:
# Start with an empty frame whose columns are labelled with the integers 0..14
column_names = list(range(15))

population_df = pd.DataFrame(columns=column_names)
display(population_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14


In [35]:
# Build the combined population table from all half-year workbooks.
# The table is rebuilt from scratch on every run, so re-executing this cell
# never appends onto an already processed `population_df`.
lor_part_columns = [0, 1, 2, 3]                     # the four numeric parts of the LOR code
value_columns = [4, 5, 6, 7, 8, 9, 10, 11, 12]      # total population and the age groups
half_year_frames = []

# Loop through all the files in the directory
for filename in tqdm(list_xlsx_files):

    print(f'{filename}')

    # extract year and half_year from filename, e.g. '..._2018h01_...' -> ('2018', '01')
    period = re.search(r'(\d{4})h(\d{2})', filename)
    if period is None:
        # a file without the period pattern cannot be assigned to a half-year
        print(f'skipped {filename}: file name contains no <year>h<half_year> part')
        continue
    year, half_year = period.groups()

    # create file path for file import
    file_path = f'../data/input/population/{filename}'

    # import data from 'T2' sheet - checked in Excel
    raw_data = pd.read_excel(file_path, sheet_name='T2', header=None)

    # Checked in Excel beforehand: after dropping every row that contains a NaN
    # only the needed data rows remain. The explicit copy gives an independent
    # frame, so adding columns below never touches `raw_data`.
    pop_data = raw_data.dropna().copy()

    # Add 2 columns year and half_year (stored as integers, e.g. 2018 and 1)
    pop_data.insert(0, 'year', int(year))
    pop_data.insert(1, 'half_year', int(half_year))

    half_year_frames.append(pop_data)

if not half_year_frames:
    # fail loudly instead of writing an empty dataset to disk
    raise ValueError('no population workbook could be read from ../data/input/population')

# Concatenate once after the loop instead of growing the frame file by file
population_df = pd.concat(half_year_frames, ignore_index=True, axis=0)

# Convert the LOR parts and the population figures to integer
population_df = population_df.astype({col: int for col in lor_part_columns + value_columns})

# Transform format for LOR values - (1 -> 01, 2 -> 02 ...)
for col in lor_part_columns:
    population_df[col] = population_df[col].apply(transform_number_under_10)

# Join the four zero-padded parts into the LOR code and build the row key
# '<lor>-<year>-<half_year>' (half_year is an integer, so the key ends in -1 / -2)
population_df['lor'] = population_df[0] + population_df[1] + population_df[2] + population_df[3]
population_df['key'] = population_df['lor'] + '-' + population_df['year'].astype(str) + '-' + population_df['half_year'].astype(str)

# Keep only the needed columns in their final order. This drops the single LOR
# parts and columns 13 and 14 (per the original note: the women and asylum data).
population_df = population_df[['key', 'year', 'half_year', 'lor'] + value_columns]
population_df.columns = ['key', 'year', 'half_year', 'lor', 'total_population', '-6years', '6-15years', '15-18years', '18-27years', '27-45years', '45-55years', '55-65years', '65+years']


# Save the final DataFrame to a pickle file and a CSV file
os.makedirs('../data/temp', exist_ok=True)  # make sure the target directory exists
population_df.to_pickle('../data/temp/population.pkl')
population_df.to_csv('../data/temp/population.csv', index = False)

display(population_df.head(5))


  population_df = pd.concat([population_df, pop_data], ignore_index=True , axis=0)
 11%|█████████▎                                                                          | 1/9 [00:00<00:01,  5.39it/s]

SB_A01-16-00_2018h01_BE.xlsx


 22%|██████████████████▋                                                                 | 2/9 [00:00<00:01,  5.37it/s]

SB_A01-16-00_2018h02_BE.xlsx
SB_A01-16-00_2019h01_BE.xlsx


 44%|█████████████████████████████████████▎                                              | 4/9 [00:00<00:01,  4.51it/s]

SB_A01-16-00_2019h02_BE.xlsx


 56%|██████████████████████████████████████████████▋                                     | 5/9 [00:01<00:00,  4.71it/s]

SB_A01-16-00_2020h01_BE.xlsx
SB_A01-16-00_2020h02_BE.xlsx


 78%|█████████████████████████████████████████████████████████████████▎                  | 7/9 [00:01<00:00,  4.63it/s]

SB_A01-16-00_2021h01_BE.xlsx
SB_A01-16-00_2021h02_BE.xlsx


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:01<00:00,  4.57it/s]

SB_A01-16-00_2022h01_BE.xlsx


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:01<00:00,  4.61it/s]


Unnamed: 0,key,year,half_year,lor,total_population,-6years,6-15years,15-18years,18-27years,27-45years,45-55years,55-65years,65+years
0,01011101-2018-1,2018,1,1011101,3269,153,127,32,502,999,348,389,719
1,01011102-2018-1,2018,1,1011102,185,4,9,3,15,51,36,40,27
2,01011103-2018-1,2018,1,1011103,5073,254,329,122,578,1430,682,770,908
3,01011104-2018-1,2018,1,1011104,4781,277,396,135,912,1661,629,426,345
4,01011105-2018-1,2018,1,1011105,1322,83,64,13,103,455,199,185,220


In [36]:
# Show the combined table (the notebook renders only the first and last rows)
population_df

Unnamed: 0,key,year,half_year,lor,total_population,-6years,6-15years,15-18years,18-27years,27-45years,45-55years,55-65years,65+years
0,01011101-2018-1,2018,1,01011101,3269,153,127,32,502,999,348,389,719
1,01011102-2018-1,2018,1,01011102,185,4,9,3,15,51,36,40,27
2,01011103-2018-1,2018,1,01011103,5073,254,329,122,578,1430,682,770,908
3,01011104-2018-1,2018,1,01011104,4781,277,396,135,912,1661,629,426,345
4,01011105-2018-1,2018,1,01011105,1322,83,64,13,103,455,199,185,220
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4307,12601032-2022-1,2022,1,12601032,5978,516,773,264,680,1398,700,628,1019
4308,12601133-2022-1,2022,1,12601133,11395,837,1455,443,1185,2743,1324,1260,2148
4309,12601134-2022-1,2022,1,12601134,15645,1187,1815,578,1681,3470,1770,1768,3376
4310,12601235-2022-1,2022,1,12601235,10650,770,1461,487,1266,2486,1268,1167,1745
