In [None]:
import pandas as pd
import numpy as np
import warnings


def txt_to_csv(file_name):
    """
    Convert a comma-separated .txt file into a .csv file.

    The file is read from the 'txt' folder with every column loaded as a
    string, and written to the 'data_csv' folder under the same base name
    with a '.csv' extension. Both folders are relative to the current
    working directory and must already exist.

    Parameters:
    file_name (str): the name of the .txt file inside the 'txt' folder

    Returns:
    None
    """
    import os

    # Build the paths with os.path.join so the separator matches the host OS
    # (a hard-coded backslash only works on Windows).
    source_path = os.path.join('txt', file_name)
    # splitext drops whatever extension is present instead of assuming that
    # the name ends with exactly four characters such as '.txt'.
    base_name, _ = os.path.splitext(file_name)
    target_path = os.path.join('data_csv', f'{base_name}.csv')

    # dtype='str' keeps every value as text (e.g. codes with leading zeros).
    data = pd.read_csv(source_path, sep=",", encoding='utf-8', dtype='str')
    data.to_csv(target_path, index=False, encoding='utf-8')

# list the files found directly inside a folder
def see_files(foldername):
    """
    Return the names of the regular files located directly in a folder.

    Sub-folders are skipped and the search is not recursive.

    Parameters:
    foldername (str): path of the folder to inspect

    Returns:
    list: the file names (without the folder part) as strings
    """
    import os

    names = []
    for entry in os.listdir(foldername):
        # keep plain files only; directories are ignored
        if os.path.isfile(os.path.join(foldername, entry)):
            names.append(str(entry))
    return names

# Columns to keep, listed in the order they should appear in the output.
# Each entry is a one-key dict: the key is the standardized column name and
# the value lists the raw header spellings that get renamed to it.
# clean_data() lowercases the headers before matching, so the spellings here
# are lowercase. Misspelled variants (e.g. 'mounth', 'ralation', 'colth')
# presumably mirror headers found in the raw files -- verify before "fixing".
# Commented-out names are raw columns that are intentionally not kept.
columns = [{'region': ['reg']}, 
           {'province': ['cwd', 'cwt']}, 
           #'amp', 
           #'tmb', 
           {'municipal': ['area']}, 
           #'ea', 
           #'vil', 
           #'psu_no', 
           #'ea_set', 
           #'samset', 
           {'month': ['month', 'mounth']}, 
           {'year': ['yr']}, 
           #'hh_no', 
           #'tpye', 
           #'member', 
           #'listing', 
           #'enum', 
           #'no', 
           {'relation': ['ralation', 'rela']}, 
           {'male': ['sex']}, 
           {'age': ['age']}, 
           {'marital': ['marital']}, 
           {'studying': ['grade_a']}, 
           {'edu_level': ['grade_b']}, 
           {'edu_field': ['subject']}, 
           {'wk_7day': ['wk_7day', 'work']}, 
           {'receive': ['receive', 'receiv']}, 
           {'return': ['return']}, 
           {'absent': ['absent']}, 
           {'seeking': ['seeking', 'seek']}, 
           {'method': ['method']}, 
           {'available': ['aviala', 'avai']}, 
           {'re_unavailable': ['re_una', 're_unavail']}, 
           {'reno_se': ['reno_se', 're_no_seek']}, 
           {'dr_seek': ['dr_se', 'dr_see']}, 
           {'ever_wk': ['ever_wk']}, 
           {'re_unem': ['re_unem']}, 
           {'dr_unem': ['dr_unem']}, 
           #'occup1', 
           #'occup2', 
           #'occup3', 
           {'occupation': ['occup']}, 
           #'ind1', 
           #'ind2', 
           #'ind3', 
           #'ind4', 
           {'industry': ['indus']}, 
           {'position': ['status']}, 
           {'firm_size': ['size', 'size_']}, 
           {'main_hr': ['main_hr']}, 
           {'part_time_hr': ['other_hr']}, 
           {'total_hr': ['tot_hr']}, 
           {'more_wk': ['more_wk']}, 
           {'more_hr': ['more_hr']}, 
           {'finding': ['finding']}, 
           {'re_nomore': ['re_no', 're_nomore', 're_nom']}, 
           {'wage_type': ['wage_ty', 'wg_ty']}, 
           {'amount': ['amount', 'amoun']}, 
           {'wage_per_month': ['approx']}, 
           {'wage_bonus': ['bonus']}, 
           {'wage_ot': ['ot']}, 
           {'other_income': ['oth_mon', 'oth_money']}, 
           {'exp_food': ['food']}, 
           {'exp_cloth': ['cloth', 'colth']}, 
           {'exp_house': ['house']}, 
           {'exp_others': ['oth_thi', 'oth_th', 'oth_thing']}, 
           #'lst_m', 
           {'re_wk': ['re_wk']}, 
           {'re_ed': ['re_ed']}, 
           {'weight': ['weight', 'wgt', 'wgt_cwt']} 
           #'who', 
           #'age_g', 
           #'age_g1', 
           #'edu', 
           #'edu1', 
           #'wkcode', 
           #'wksta', 
           #'wksta1', 
           #'PROVINCE', 
           #'indus_g', 
           #'indus_g1', 
           #'indus_g2', 
           #'indus_g3', 
           #'totalhr', 
           #'g_hr', 
           #'timeseries', 
           #'year', 
           #'quarter', 
           #'minwgae', 
           #'wage_day', 
           #'_v1', 
           #'low_wage', 
           #'new_wage'
           ]

# standardized column names (the single key of each dict in 'columns'), in list order
column_master = [tuple(entry)[0] for entry in columns]


def clean_data(years, quarters, columns, column_master):
    """
    Standardize the column names of the quarterly LFS files and keep only the
    wanted columns.

    For every year/quarter pair, 'data_csv/LFS_{year}q{quarter}.csv' is read
    with all values as strings, its headers are lowercased, known header
    variants are renamed to their standardized names, and the result is
    written to 'data_csv_cleaned/LFS_{year}q{quarter}.csv' containing exactly
    the columns in 'column_master', in that order. Wanted columns that a file
    does not have are added as empty columns. Progress and problems are
    reported with print(); a failure on one file does not stop the others.

    Parameters:
    years (iterable): year labels used in the file names
    quarters (iterable): quarter labels used in the file names
    columns (list): one-key dicts mapping a standardized column name to the
        list of raw (lowercase) header spellings that should be renamed to it
    column_master (list): standardized column names to keep, in output order

    Returns:
    None
    """
    for year in years:
        for quarter in quarters:
            label = f'{year}q{quarter}'
            try:
                df = pd.read_csv(f'data_csv/LFS_{label}.csv', encoding='utf-8', dtype='str')
                # change all headers to lowercase so they match the variants
                df.columns = df.columns.str.lower()

                for column in columns:
                    for key, variants in column.items():
                        # a column already carrying the standardized name
                        # counts as found
                        found = key in df.columns
                        for variant in variants:
                            if variant in df.columns:
                                df = df.rename(columns={variant: key})
                                found = True
                        # report only after every variant has been tried
                        if not found:
                            print(f'{key} not found in {label}')

                # create the wanted columns this file lacks (as empty columns),
                # reporting them in 'column_master' order
                missing_columns = [col for col in column_master if col not in df.columns]
                if missing_columns:
                    print(f'{label} has some missing columns: {missing_columns}')
                    for col in missing_columns:
                        df[col] = None

                # several variants may have been renamed to the same name;
                # keep the first occurrence, then select the wanted columns
                df = df.loc[:, ~df.columns.duplicated()]
                df = df[column_master]

                # turn empty / whitespace-only cells into NaN
                # NOTE(review): an earlier comment here said all-empty rows are
                # removed, but no rows are dropped -- confirm which is intended
                df = df.replace(r'^\s*$', np.nan, regex=True)
                df.to_csv(f'data_csv_cleaned/LFS_{label}.csv', index=False, encoding='utf-8')
                print(f'{label} done')
            except Exception as e:
                # best-effort batch: report the failure and go on to the next file
                print(f"{label} -- error occurred: {e}")


def get_years_list(start_year, end_year):
    """
    Build the list of years from start_year to end_year, both included.

    Parameters:
    start_year (int): first year of the range
    end_year (int): last year of the range (inclusive)

    Returns:
    list: each year as a string, in ascending order
    """
    return list(map(str, range(start_year, end_year + 1)))