In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gdown

## Loading and cleaning

In [3]:
# Loading data from GDrive.

# Data with BDL merged,
file_id = "14VMCwHF2qoezvXykUvjfMNM1a573gQ8M"
url = f"https://drive.google.com/uc?id={file_id}"
file_extension = "downloaded_file.csv"
gdown.download(url, file_extension, quiet=False)
df = pd.read_csv(file_extension, sep=",")

Downloading...
From (original): https://drive.google.com/uc?id=14VMCwHF2qoezvXykUvjfMNM1a573gQ8M
From (redirected): https://drive.google.com/uc?id=14VMCwHF2qoezvXykUvjfMNM1a573gQ8M&confirm=t&uuid=eb5ce2ee-694e-46f6-bc78-620fd33875fb
To: E:\Magisterka\Kod\2024-mgr-sluzba-cywilna\Python\Code\downloaded_file.csv
100%|██████████| 511M/511M [00:43<00:00, 11.8MB/s] 
  df = pd.read_csv(file_extension, sep=",")


In [14]:
# Dropping columns unnecessary for statistical analyses.
df.drop(['Unnamed: 0', 'date_announced', 'nice_to_have', 'institution_address', 'workplace', 'department', 'city', 'position'], axis=1, inplace=True)

Unnamed: 0.1,Unnamed: 0,ad_id,result,job_field,position_category,position,education_level,work_time,vacancies,city,...,fluctuation,employment_contract,voivodeship,district,cena_mieszkan,mediana_wieku,saldo_migracji,wsp_feminizacji,wydatki_powiat,wynagrodzenia
0,0,130994,1,vet,stanowiska samodzielne,inspektor weterynaryjny,3,0.5,1,Olesno,...,0.064683,1.19,16,8,4196.0,448,-160.0,106.0,159276.0,625553
1,0,130994,1,vet,stanowiska samodzielne,inspektor weterynaryjny,3,0.5,1,Olesno,...,0.064683,1.19,16,8,4196.0,448,-160.0,106.0,159276.0,824
2,1,130970,0,tech/construction,stanowiska samodzielne,starszy specjalista,3,1.0,1,Bydgoszcz,...,0.03601,0.5,4,61,7213.0,449,-828.0,114.0,,747819
3,1,130970,0,tech/construction,stanowiska samodzielne,starszy specjalista,3,1.0,1,Bydgoszcz,...,0.03601,0.5,4,61,7213.0,449,-828.0,114.0,,985
4,2,130898,1,vet,stanowiska samodzielne,asystent,2,1.0,1,Chrzanów,...,0.101215,,12,3,5217.0,453,-196.0,107.0,148372.0,689873


In [None]:
# Changing "," to "." in numerical variables.
comma_cols_to_replace = {
    'mediana_wieku': 'mediana_wieku_float',
    'wydatki_powiat': 'wydatki_powiat_float',
    'wynagrodzenia': 'wynagrodzenia_float'
}

# Replace commas with dots and convert to float
for old_col, new_col in comma_cols_to_replace.items():
    df[new_col] = df[old_col].str.replace(',', '.', regex=False).astype(float)

# Optional: Drop original string columns
# df.drop(columns=comma_cols_to_replace.keys(), inplace=True)


## Feature engineering

In [None]:
df['filled_vacancies_per_all'] = df['result'] / df['vacancies']

In [13]:
# Variable correlation
numerical_columns_df = df[[
    'work_time',
    'ad_views',
    'year',
    'applicants',
    'salary_for_position',
    'women_percentage',
    'wsp_feminizacji',
    'fluctuation',
    'employment_contract',
    'cena_mieszkan',
    'mediana_wieku',
    'saldo_migracji',
    'wydatki_powiat',
    'wynagrodzenia',
    'result'
]]


correlation = numerical_columns_df.corr()['result']

# Display the correlation matrix
print(correlation)

ValueError: could not convert string to float: '44,8'

In [18]:
# Grupowanie i agregowanie danych
df_numerical = df[[]]

df_numerical_district_level = df_numerical.groupby(['district', 'voivodeship', 'year']).agg('mean').reset_index()
df_numerical_district_level.head(100)

Unnamed: 0,district,voivodeship,institution,year_announced,result,education_level,work_time,vacancies,views,promocja,kandydaci,salary_for_position,women_percentage,fluctuation,employment_contract
0,1,2,Archiwum Państwowe we Wrocławiu,2018,1.0,1.0,1.0,1.0,2792.0,,12.285714,4675.055928,0.642857,0.10326,17.88
1,1,2,Główny Inspektorat Transportu Drogowego w Wars...,2018,1.0,2.0,1.0,1.0,2546.0,,,,,,
2,1,2,Główny Inspektorat Transportu Drogowego w Wars...,2021,1.0,1.0,1.0,2.0,1288.0,,,,,,
3,1,2,Główny Inspektorat Transportu Drogowego w Wars...,2022,1.0,1.0,1.0,1.0,1455.0,,,,,,
4,1,2,Izba Administracji Skarbowej we Wrocławiu,2017,1.0,2.0,1.0,1.0,2606.5,,,,,,


## Statistics