# Data Cleaning

In [12]:
from os import listdir
from os.path import isfile, isdir
import pandas as pd
import re
import unicodedata
import numpy as np

In [13]:
# Directory scanned for per-group sub-folders holding the raw datasets.
SOURCE_PATH = './Datasets/'

# Directory the cleaned CSV files are written to.
CLEANED_DATASETS_PATH = './Cleaned Datasets/'

# Columns removed from every dataset. Names are compared after the header
# normalisation done in clean_dataset (lower-case, spaces -> underscores).
COLUMNS_TO_DROP = [
    'id',
    'unnamed:_0',
]

# Columns routed to clean_amount_column (same normalised-name matching).
COLUMNS_WITH_AMOUNT = [
    'pricecap',
    'price',
    'shareprice',
    'marketcap',
    'gbp',
    'total_raised',
    'valuation',
    'share_price',
    'market_cap',
    'master_cap',
    'revenue',
    'market_value_apr_2022',
    'annual_revenue_in_usd',
    'annual_net_income_in_usd',
    'market_value',
    'marketvalue',
    'annual_revenue',
    'revenue_2022',
    'net_income',
    'annual_results_for_year_ending',
    'total_assets_in_usd',
    'total_liabilities_in_usd',
    'total_equity_in_usd',
    'market_value_(jan_1st_2020)',
    'market_value_(jan-07-2022)',
    'market_value_jan_2020',
    'market_value_jan_2021',
    'market_value_jan_2022',
]

# Columns routed to clean_name_column (same normalised-name matching).
COLUMNS_WITH_NAME = [
    'name',
    'company',
    'ceo',
    'founders',
]

In [14]:
def clean_dataset(dataset):
    """Normalise a raw DataFrame in place and return it.

    Steps, in order:
      1. Rewrite headers: split camelCase with a space, lower-case, and turn
         spaces into underscores.
      2. Drop every column listed in COLUMNS_TO_DROP that is present.
      3. Drop rows that are entirely empty and fill remaining gaps with ''.
      4. For each column, join list cells into a ', '-separated string; for
         object-dtype columns, lower-case the text and apply the name,
         amount, or general cleaner depending on the column name.

    :param dataset: the DataFrame to clean (mutated in place).
    :return: the same DataFrame object.
    """
    normalised_headers = []
    for header in dataset.columns:
        spaced = re.sub(r'([a-z])([A-Z])', r'\1 \2', header)
        normalised_headers.append(spaced.lower().replace(' ', '_'))
    dataset.columns = normalised_headers

    unwanted = [name for name in COLUMNS_TO_DROP if name in dataset.columns]
    if unwanted:
        dataset.drop(columns=unwanted, inplace=True)

    dataset.dropna(axis=0, how='all', inplace=True)
    dataset.fillna('', inplace=True)

    def flatten(cell):
        # List cells (e.g. from JSON arrays) become one comma-separated string.
        if isinstance(cell, list):
            return ', '.join(map(str, cell))
        return cell

    for col in dataset:
        dataset[col] = [flatten(cell) for cell in dataset[col]]

        # Only text (object dtype) columns go through the string cleaners.
        if dataset[col].dtype != object:
            continue

        lowered = dataset[col].str.lower()
        dataset[col] = lowered

        if col in COLUMNS_WITH_NAME:
            cleaner = clean_name_column
        elif col in COLUMNS_WITH_AMOUNT:
            cleaner = clean_amount_column
        else:
            cleaner = clean_general_column
        dataset[col] = cleaner(dataset[col])

    return dataset


# (pattern, multiplier) pairs, tried in this order. Each pattern accepts the
# full word anywhere in the text, or the one-letter abbreviation when it
# directly follows whitespace or a digit ("5 m" as well as "5m").
_UNIT_MULTIPLIERS = (
    (re.compile(r'million|[\s\d]m'), 1000000),
    (re.compile(r'billion|[\s\d]b'), 1000000000),
    (re.compile(r'trillion|[\s\d]t'), 1000000000000),
)


def get_multiplier(value):
    """Return the numeric factor implied by a magnitude word in *value*.

    Recognises "million"/"m", "billion"/"b" and "trillion"/"t". Million is
    checked first, then billion, then trillion; the first unit found wins.
    The one-letter forms must follow whitespace or a digit, so letters inside
    ordinary words (e.g. "amazon") are ignored while compact values such as
    "1.5m" are still scaled.

    :param value: amount text; matching is case-sensitive, so callers are
        expected to pass lower-cased text.
    :return: 1000000, 1000000000, 1000000000000, or 1 when no unit is found.
    """
    for pattern, multiplier in _UNIT_MULTIPLIERS:
        if pattern.search(value):
            return multiplier
    return 1


def clean_amount(amount):
    """Strip unit words, currency markers and parenthesised notes from *amount*.

    Removes every occurrence of "million", "billion", "trillion", the single
    letters m/b/t, "usd", "us", "$" and any "(...)" group made of lower-case
    letters, digits and whitespace. Commas are then turned into dots and the
    surrounding whitespace is trimmed.

    :param amount: amount text (matching is case-sensitive).
    :return: the remaining text.
    """
    noise = r'million|m|billion|b|trillion|t|usd|us|\$|\(([a-z0-9\s]+)\)'
    without_units = re.compile(noise).sub('', amount)
    with_decimal_point = without_units.replace(',', '.')
    return with_decimal_point.strip()


def clean_name_column(column):
    """Clean a Series of person/company names.

    Removes, in a single pass:
      * the honorifics "dr", "mr", "mrs", "ms" (optionally followed by a
        literal dot) when they start a word and are followed by a space;
      * a leading-word "ceo" followed by a space or a colon;
      * the placeholders "none" and "not found";
      * the punctuation characters , . _ ? ! ( ) ^ " ; @ : # + * and tabs.
    Newlines and runs of two or more whitespace characters are then collapsed
    to one space and the result is stripped.

    The word boundary in front of the titles matters: without it "dr " or
    "ms " would also match the tail of ordinary words ("sandra ", "williams ")
    and eat part of the name.

    :param column: Series of strings (matching is case-sensitive, so callers
        are expected to pass lower-cased text).
    :return: a new Series with the cleaned text.
    """
    noise = (
        r'\b(?:dr|mrs?|ms)\.? '
        r'|\bceo[ :]'
        r'|none|not found'
        r'|[,._?!()^";@:#+*\t]'
    )
    without_noise = column.str.replace(noise, '', regex=True)
    # Raw string: '\s' is a regex class, not a Python string escape.
    collapsed = without_noise.str.replace(r'\n|\s\s+', ' ', regex=True)
    return collapsed.str.strip()


def clean_general_column(column):
    """Blank out boilerplate tokens and stray punctuation in a text Series.

    Each CRLF pair, the labels "employees"/"founded" (with an optional
    trailing colon), the placeholders "none"/"not found", and any of the
    characters newline, _ ? ! ( ) ^ " ; # * or tab is replaced by a single
    space.

    :param column: Series of strings.
    :return: a new Series with the replacements applied.
    """
    noise_pattern = '\\r\\n|employees:?|founded:?|none|not found|[\\n_?!()^";#*\t]'
    text = column.str
    return text.replace(noise_pattern, ' ', regex=True)


def clean_amount_column(column):
    """Convert the textual amounts in *column* to integers where possible.

    Each string cell is lower-cased, passed through clean_amount, and, when
    what is left is a plain number (digits with an optional ".digits" part),
    multiplied by the factor returned by get_multiplier for the lower-cased
    text and truncated to int. Cells that do not reduce to a plain number are
    kept as their lower-cased text. Non-string cells are passed through
    unchanged.

    :param column: iterable of cell values.
    :return: list with one entry per input cell.
    """
    # The dot is escaped so only a real decimal point is accepted; values
    # such as "10-20" or "12 34" are left as text instead of reaching float().
    plain_number = re.compile(r'^[0-9]+(\.[0-9]+)?$')

    converted = []
    for elem in column:
        if not isinstance(elem, str):
            # Nothing to parse (e.g. NaN or an already numeric value).
            converted.append(elem)
            continue

        elem = elem.lower()
        cleaned = clean_amount(elem)
        if plain_number.match(cleaned):
            converted.append(int(float(cleaned) * get_multiplier(elem)))
        else:
            converted.append(elem)

    return converted


# Walk every sub-folder of SOURCE_PATH, load each .csv / .json / .jsonl file,
# clean it with clean_dataset and write the result as
# "<original name>_cleaned.csv" into CLEANED_DATASETS_PATH.
for directory in listdir(SOURCE_PATH):
    dir_path = SOURCE_PATH + directory
    if not isdir(dir_path):
        continue

    for f in listdir(dir_path):
        file_path = dir_path + '/' + f
        file_name = f
        json_file_path = ''
        df = None

        if not isfile(file_path):
            continue

        if file_name.endswith('.csv'):
            json_file_name = file_name.replace('.csv', '_cleaned.csv')
            json_file_path = CLEANED_DATASETS_PATH + json_file_name
            print(json_file_name)

            try:
                df = pd.read_csv(file_path, encoding='utf-8', dtype=object)
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with a more permissive codec.
                df = pd.read_csv(file_path, encoding='unicode_escape', dtype=object)
            except Exception as exc:
                # Best effort: report the problem and skip this file.
                # Exception (not a bare except) so Ctrl-C still interrupts.
                print('ERROR: failed to read CSV file:', exc)

        elif file_name.endswith('.json'):
            json_file_name = file_name.replace('.json', '_cleaned.csv')
            json_file_path = CLEANED_DATASETS_PATH + json_file_name
            print(json_file_name)
            df = pd.read_json(file_path, encoding='utf-8', dtype=object)

        elif file_name.endswith('.jsonl'):
            json_file_name = file_name.replace('.jsonl', '_cleaned.csv')
            json_file_path = CLEANED_DATASETS_PATH + json_file_name
            print(json_file_name)
            df = pd.read_json(file_path, encoding='utf-8', lines=True, dtype=object)

        if df is None:
            # Unsupported extension or unreadable file.
            continue

        df = clean_dataset(df)
        # Cells that are empty or whitespace-only become missing values.
        df = df.replace(r'^\s*$', None, regex=True)

        # Drop any backslashes in front of '/' (e.g. "\/" -> "/").
        csv_text = re.sub(r'\\*/', '/', df.to_csv(index=False))
        # Decompose accented characters, then discard everything non-ASCII.
        csv_text = unicodedata.normalize('NFKD', csv_text).encode('ascii', 'ignore').decode('utf-8')

        # Context manager so the output file is flushed and closed each time.
        with open(json_file_path, encoding='utf-8', mode='w') as out_file:
            print(csv_text, file=out_file)

forbes_fr_cleaned.csv
wiki_GioPonSPiz_cleaned.csv
Wikipedia_MarScoToc_cleaned.csv
famcap_germany_FR_cleaned.csv
govuk_DeBiGa_cleaned.csv
cbinsights_DDD_cleaned.csv
cbinsights_iGMM_cleaned.csv
globaldata_DeBiGa_cleaned.csv
CompaniesMarketCap_GioPonSpiz_cleaned.csv
companiesMarketCap_Avengers_cleaned.csv
CompaniesMarketCap_MarScoToc_cleaned.csv
companiesmarketcap_gren_cleaned.csv
companiesmarketcap_DDD_cleaned.csv
companiesmarketcap_iGMM_cleaned.csv
disfold_Avengers_cleaned.csv
disfold_iGMM_cleaned.csv
Disfold_MarScoToc_cleaned.csv
disfold_gren_cleaned.csv
disfold_GioPonSpiz_cleaned.csv
disfold_fr_cleaned.csv
disfold_silvestri_cleaned.csv
disfold_DeBiGa_cleaned.csv
disfold_slyherin_cleaned.csv
hitHorizons_Avengers_cleaned.csv
ft_iGMM_cleaned.csv
ft_Slytherin_cleaned.csv
ft_gren_cleaned.csv
ft_fr_cleaned.csv
ft_DDD_cleaned.csv
ft_silvestri_cleaned.csv
valueToday_Avengers_cleaned.csv
valueToday_GioPonSPiz_cleaned.csv
valuetoday_silvestri_cleaned.csv
valuetoday_fr_cleaned.csv
valuetoday_sly