In [1]:
import pandas as pd
import re
import json
import math
from pathlib import Path

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Input (CSV) and output (JSON) folders, relative to the working directory.
CSV_DIR = Path('data') / 'csv'
JSON_DIR = Path('data') / 'json'

In [3]:
# Read the 2023 fashion CSV, then keep only the columns whose header
# does not start with "Unnamed" (the empty columns).
fashion_csv = CSV_DIR / 'fashion_2023.csv'
df = pd.read_csv(fashion_csv, encoding="utf-8")
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]
df

Unnamed: 0,COMPANY,COUNTRY,1. POLICY & COMMITMENTS,1.1 What are the company's human rights and environmental policies?,Animal Welfare,Annual Leave & Public Holidays,"Anti-bribery, Corruption, & Presentation of False Information",Biodiversity & Conservation,Community Engagement,Discrimination,...,"Publishes annual carbon footprint or GHG emissions in owned and operated facilities (e.g. head office, retail stores, distribution centres, warehouses, transport and mail orders, etc.)","Publishes annual value chain/scope 3 carbon footprint, focusing on detailed calculations on GHG emissions in the supply chain (e.g. at manufacturing and processing facilities, textile production), with estimations for downstream impacts","Publishes annual value chain/carbon footprint at a raw material level, with estimations for downstream impacts","Discloses data on absolute energy reduction in the supply chain (e.g. at manufacturing and processing facilities, fibre production level)","% of energy use coming from renewable sources in the company's owned and operated facilities (e.g. head office, retail stores, distribution centres, warehouses, etc.)","% of energy use coming from renewable sources in the supply chain (e.g. at manufacturing and processing facilities, fibre production level)",Discloses commitment to RE100,"Discloses what proportion of production is powered by coal, including which geographies and sectors are affected",Total Section 5.6,Total Score Section 5
0,Abercrombie & Fitch,USA,,,0.25,0.00,0.25,0.25,0.25,0.25,...,1,0,0,0,0,0,0,0,2,10
1,Adidas,Germany,,,0.25,0.00,0.25,0.25,0.25,0.25,...,1,2,2,0,0,0,0,1,11,35
2,Aeropostale,USA,,,0.00,0.00,0.25,0.00,0.00,0.00,...,0,0,0,0,0,0,0,0,0,0
3,AJIO,India,,,0.00,0.00,0.25,0.25,0.25,0.25,...,1,0,0,0,0,0,0,0,1,1
4,ALDI Nord,Germany,,,0.25,0.25,0.00,0.25,0.25,0.25,...,1,0,0,0,1,0,0,0,4,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Wrangler,USA,,,0.25,0.00,0.25,0.25,0.25,0.25,...,0,0,0,0,0,0,0,0,1,8
246,Youngor,China,,,0.00,0.00,0.00,0.00,0.00,0.00,...,0,0,0,0,0,0,0,0,0,0
247,Zalando,Germany,,,0.25,0.00,0.25,0.25,0.25,0.25,...,1,2,2,0,1,0,0,0,10,28
248,Zara,Spain,,,0.25,0.00,0.25,0.25,0.25,0.25,...,1,2,2,0,1,0,0,0,14,52


In [66]:
# Read the max-score table and index it as
# max_score_dict[section][subsection][indicator] = value of the remaining column(s).
df_max_score = pd.read_csv(CSV_DIR / 'max_score.csv', encoding="utf-8")
df_max_score["id"] = df_max_score.index
df_max_score = df_max_score.loc[:, ~df_max_score.columns.str.contains('^Unnamed')]

max_score_dict = {}

for _, record in df_max_score.iterrows():
    # The three labels are filled in as their columns are met while walking
    # the row; any other column is stored as the value for the current labels.
    section = subsection = indicator = None
    for column, cell in record.items():
        if column == "id":
            # Helper column added above; it carries no score information.
            continue
        if column == "section":
            # Only the first line of the cell is used; the curly apostrophe
            # is replaced with a straight one.
            section = cell.split('\n')[0].strip().replace("’", "'")
            max_score_dict.setdefault(section, {})
        elif column == "subsection":
            subsection = cell.split('\n')[0].strip()
            max_score_dict[section].setdefault(subsection, {})
        elif column == "indicator":
            indicator = cell.strip()
            max_score_dict[section][subsection].setdefault(indicator, None)
        else:
            max_score_dict[section][subsection][indicator] = cell

In [67]:
# Save the nested max-score mapping as indented JSON, keeping non-ASCII
# characters unescaped.
max_scores_path = JSON_DIR / "max_scores.json"
with open(max_scores_path, "w", encoding="UTF-8") as out_file:
    serialized = json.dumps(max_score_dict, ensure_ascii=False, indent=4)
    out_file.write(serialized)

In [4]:
# Reload the max-score mapping from the JSON file on disk.
with open(JSON_DIR / "max_scores.json", "r", encoding="UTF-8") as in_file:
    max_score_dict = json.loads(in_file.read())

In [6]:
from collections import Counter

# Turn every row of `df` into two records:
#   data               -> raw values nested as section -> subsection -> indicator
#   company_normalized -> value / max score per indicator, plus the mean of
#                         those ratios per subsection under "normalized"
country_count = Counter(df['COUNTRY'])  # occurrences of each COUNTRY value
data = []
company_normalized = []
for _, row in df.iterrows():
    result = {}
    result_normalized = {"subcategories": {}}
    main_category = None
    sub_category = None
    # Running count and sum of normalized scores for the current subsection.
    indicator_count = 0
    normalized_score_subcategory = 0
    for key, value in row.items():
        key_name = key.strip().lower()
        if key_name.startswith("total"):
            # Columns starting with "total" are skipped; only individual
            # indicators are processed.
            continue
        if key_name == 'company':
            result['Company'] = value
            result_normalized['Company'] = value
        elif key_name == 'country':
            result['Country'] = value
        # NOTE(review): the '.' in the two header patterns below is unescaped
        # and therefore matches any character -- confirm this is intended.
        elif re.match(r'^\d.[^\d]', key_name):
            # Section header such as "1. ..."; only its first line is the name.
            main_category = key.split('\n')[0].strip()
            result[main_category] = {}
        elif re.match(r'^\d.\d.', key_name):
            # Subsection header such as "1.1 ...": close the previous
            # subsection (if it had indicators) before opening the new one.
            if indicator_count != 0:
                result_normalized['subcategories'][sub_category]["normalized"] = (
                    normalized_score_subcategory / indicator_count
                )
            sub_category = key.split('\n')[0].strip()
            result[main_category][sub_category] = {}
            result_normalized['subcategories'][sub_category] = {"detailed": {}}
            indicator_count = 0
            normalized_score_subcategory = 0
        else:
            indicator_count += 1
            # Drop a trailing ".<digits>" -- presumably the suffix pandas adds
            # to repeated column headers. Matching one or more digits also
            # covers suffixes of ".10" and above, which the former
            # single-digit strip left in place.
            indicator = re.sub(r'\.\d+$', '', key.strip()).strip()
            result[main_category][sub_category][indicator] = value
            max_score = max_score_dict[main_category][sub_category][indicator]
            if max_score == 0:
                # Avoid dividing by zero for indicators whose max score is 0.
                max_score = 1
            # Missing values count as a score of 0 in the normalized output.
            if math.isnan(value):
                value = 0
            normalized_score = value / max_score
            normalized_score_subcategory += normalized_score
            result_normalized['subcategories'][sub_category]["detailed"][indicator] = normalized_score
    # The last subsection has no following header to trigger the write above,
    # so store its mean here; otherwise it would lack a "normalized" entry.
    if indicator_count != 0:
        result_normalized['subcategories'][sub_category]["normalized"] = (
            normalized_score_subcategory / indicator_count
        )
    company_normalized.append(result_normalized)
    data.append(result)

In [7]:
# Save the per-company normalized scores as indented JSON.
normalized_path = JSON_DIR / "fashion_2023_normalized.json"
with open(normalized_path, "w", encoding="UTF-8") as out_file:
    serialized = json.dumps(company_normalized, indent=4)
    out_file.write(serialized)

In [63]:
# Save the per-company raw scores as indented JSON.
# The encoding is given explicitly, like the other writers in this notebook,
# so the file does not depend on the platform's default encoding.
# NOTE(review): `data` holds the raw cell values, so any missing score is
# written as the non-standard token `NaN` -- confirm consumers accept that.
with open(JSON_DIR / 'fashion_2023.json', 'w', encoding="UTF-8") as f:
    json.dump(data, f, indent=4)

In [41]:
# Collect each brand's yearly "final scores" from the five per-section CSVs into
# result[brand][year]["section<i>"], then save the mapping as JSON.
result = {}
for i in range(1, 6):
    section_file = CSV_DIR / f'section{i}.csv'
    # Deliberately not named `df`: rebinding `df` here would replace the
    # fashion_2023 frame that the normalization cell iterates over, so that
    # cell would break when re-run after this one.
    section_df = pd.read_csv(section_file, encoding="utf-8", delimiter=';')
    # Drop the empty "Unnamed" columns.
    section_df = section_df.loc[:, ~section_df.columns.str.contains('^Unnamed')]
    for _, row in section_df.iterrows():
        company = None
        for key, value in row.items():
            key_name = key.strip().lower()
            if key_name == "brand name":
                if value not in result:
                    result[value] = {}
                company = value
            elif "final scores" in key_name:
                # The year is taken from the column header: the first year in
                # 2017-2023 whose digits appear in it.
                for year in range(2017, 2024):
                    if str(year) in key_name:
                        if str(year) not in result[company]:
                            result[company][str(year)] = {}
                        if isinstance(value, float) and math.isnan(value):
                            # Missing score -> JSON null.
                            value = None
                        else:
                            # Text scores use a decimal comma; convert to float.
                            value = float(value.replace(',', '.')) if isinstance(value, str) else value
                        result[company][str(year)][f"section{i}"] = value
                        break
# Explicit encoding, like the other writers in this notebook.
with open(JSON_DIR / 'section.json', 'w', encoding="UTF-8") as f:
    json.dump(result, f, indent=4)
