# Hugging Face Preprocessing

The following script preprocesses the dataset produced by HFExtraction.ipynb. In particular, we focus on feature engineering, variable standardization/harmonization, and one-hot encoding of tags. We create variables such as *co2_reported*, *auto*, *year_month*, and *domain* for filtering, splitting datasets, and analyzing model behavior across domains. To avoid memory overflow during one-hot encoding, we first split the dataset, one-hot encode each split independently, and concatenate the results afterwards.

To execute the script, make sure to run the cells in order. The script can take more than 5 minutes to finish.

In [2]:
import pandas as pd
import numpy as np
import re
import ast
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats


pd.options.mode.chained_assignment = None

## Preprocessing of raw Hugging Face data

In [2]:
# Load the raw dataset produced by HFExtraction.ipynb.
df = pd.read_csv('../../datasets/HFTotal.csv')

  df = pd.read_csv('../../datasets/HFTotal.csv')


In [3]:
# Load the tag metadata; later cells read its 'languages', 'libraries',
# 'licenses' and 'tags_to_domain' entries.
with open('../../metadata/tags_metadata.yaml') as file:
    tags_metadata = yaml.safe_load(file)

In [4]:
def split_df(df):
    """Split `df` row-wise into four consecutive, near-equal parts.

    Every row is kept: when the length is not divisible by four, the first
    `len(df) % 4` parts get one extra row. (The previous version dropped the
    last row whenever the length was odd, silently losing a model.)

    Args:
        df: DataFrame to split.

    Returns:
        A tuple `(df1, df2, df3, df4)` of positional slices of `df`.
    """
    base_size, remainder = divmod(len(df), 4)
    parts = []
    start = 0
    for part_number in range(4):
        # Spread the remainder over the leading parts, one row each.
        stop = start + base_size + (1 if part_number < remainder else 0)
        parts.append(df.iloc[start:stop])
        start = stop
    return tuple(parts)

def select_top_tags(df, n):
    """Keep all non-tag columns plus the `is_*` columns whose sum exceeds `n`.

    Args:
        df: DataFrame whose one-hot tag columns are prefixed with 'is_'.
        n: A tag column is kept only if its column sum is strictly greater
            than this value.

    Returns:
        `df` restricted to the non-tag columns followed by the kept tag
        columns, each group in its original order.
    """
    tag_columns, other_columns = [], []
    for column in df.columns:
        if column.startswith('is_'):
            tag_columns.append(column)
        else:
            other_columns.append(column)

    tag_counts = df[tag_columns].sum(axis=0)
    frequent_tags = tag_counts[tag_counts > n].index.tolist()
    return df[other_columns + frequent_tags]


def one_hot_tags(df):
    """Replace the list-valued `tags` column with one `is_<tag>` column per tag.

    Args:
        df: DataFrame with a `tags` column holding a list of tags per row.

    Returns:
        A new DataFrame without `tags`, joined with integer indicator columns
        named `is_<tag>`.
    """
    # One row per (original row, tag) pair.
    tags_long = pd.DataFrame(df['tags'].tolist(), df.index).stack()
    indicators = pd.get_dummies(tags_long, prefix='is', prefix_sep='_').astype(int)
    # Collapse back to one row per original index label.
    per_row = indicators.groupby(level=0).sum()
    return df.drop(columns='tags').join(per_row)


def enlarge_tag_to_domain_dict(tag, tag_to_domain):
    """Register `tag` as an NLP tag when its name looks like an NLP model family.

    A tag is mapped to 'NLP' when it contains 'gpt' or 'bert', or is exactly
    'bart' or 't5'. Other tags leave the mapping untouched.

    Args:
        tag: Tag name to inspect.
        tag_to_domain: Dict mapping tags to domains; updated in place.

    Returns:
        None.
    """
    # The original condition tested `'bert' in NLP` (an undefined name), which
    # raised NameError for every tag that did not contain 'gpt'.
    if 'gpt' in tag or 'bert' in tag or tag in ('bart', 't5'):
        tag_to_domain[tag] = 'NLP'
    
def most_common(lst):
    """Return the most frequent element of `lst`.

    Counting is done in a single pass. Ties are broken in favour of the
    element that appears first in `lst`, so the result is reproducible
    between runs.

    Args:
        lst: Iterable of hashable elements.

    Returns:
        The element with the highest number of occurrences.

    Raises:
        ValueError: If `lst` is empty.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # dicts iterate in insertion order and max() keeps the first maximum.
    return max(counts, key=counts.get)
    
def assign_model_domain(tags, tags_to_domain):
    """Infer a model's domain from its tags.

    Args:
        tags: Iterable of tag names for one model.
        tags_to_domain: Dict with the keys 'task_to_domain',
            'model_to_domain' and 'concepts_to_domain', each mapping a tag to
            a domain name. On duplicate tags the later mapping wins.

    Returns:
        None when no tag is known. The single domain when all known tags
        agree. Otherwise, in this order: 'NLP' when 'feature-extraction' is
        among the known tags and exactly two domains are involved;
        'Multimodal' when any known tag maps to it; else the domain backed by
        the most tags (ties go to the domain seen first in `tags`).
    """
    lookup = (
        tags_to_domain['task_to_domain']
        | tags_to_domain['model_to_domain']
        | tags_to_domain['concepts_to_domain']
    )

    # One vote per distinct known tag, in the order the tags were given.
    known_tags = [tag for tag in dict.fromkeys(tags) if tag in lookup]
    if not known_tags:
        return None

    votes = [lookup[tag] for tag in known_tags]
    distinct_domains = set(votes)

    if len(distinct_domains) == 1:
        return votes[0]

    if 'feature-extraction' in known_tags and len(distinct_domains) == 2:
        return 'NLP'
    if 'Multimodal' in distinct_domains:
        return 'Multimodal'

    # Majority vote. The previous version stored the domains in a set before
    # counting, so every domain had a count of 1 and the winner was arbitrary.
    return max(dict.fromkeys(votes), key=votes.count)
    
    
def filter_tags(tags, tags_metadata):
    """Drop language tags and license/arxiv/dataset/doi reference tags.

    Args:
        tags: List of tag strings.
        tags_metadata: Dict whose 'languages' entry lists the language tags.

    Returns:
        The remaining tags, in their original order.
    """
    languages_list = tags_metadata['languages']
    reference_prefixes = ('license:', 'arxiv:', 'dataset:', 'doi:')

    kept = []
    for tag in tags:
        if tag in languages_list:
            continue
        if tag.startswith(reference_prefixes):
            continue
        kept.append(tag)
    return kept
    
def tags_treatment(tags, tags_metadata):
    """Parse, filter and normalise the tags of one model.

    Args:
        tags: A list of tags, or the string literal of such a list.
        tags_metadata: Metadata dict passed on to `filter_tags`.

    Returns:
        The filtered tags, lower-cased and with spaces replaced by '-'.
        When nothing survives the filter, `['no-tag']` is returned.
    """
    if not isinstance(tags, list):
        tags = ast.literal_eval(tags)
    kept = filter_tags(tags, tags_metadata) or ['no-tag']
    return [str(tag).lower().replace(' ', '-') for tag in kept]

def set_library(library_name, tags, libraries_list):
    """Merge the declared library name(s) with the library tags of a model.

    Args:
        library_name: A single library name (possibly None/NaN) or a list of
            names.
        tags: Tags of the model.
        libraries_list: Known library names; tags found here count as
            libraries.

    Returns:
        The de-duplicated libraries with None/NaN entries removed. The order
        follows set iteration and is not guaranteed.
    """
    declared = library_name if isinstance(library_name, list) else [library_name]
    from_tags = [tag for tag in tags if tag in libraries_list]
    merged = list(set(from_tags + declared))
    return [library for library in merged if not (library is None or pd.isnull(library))]

def set_language(tags, languages_list):
    """Return the distinct tags that appear in `languages_list`.

    The order of the returned list follows set iteration and is not
    guaranteed.
    """
    return list({tag for tag in tags if tag in languages_list})

def set_license(tags, licenses_list):
    """Return the distinct tags that appear in `licenses_list`.

    The order of the returned list follows set iteration and is not
    guaranteed.
    """
    found = set()
    for tag in tags:
        if tag in licenses_list:
            found.add(tag)
    return list(found)

def concat_dataset_splits(df1, df2, df3, df4):
    """Stack four one-hot encoded splits and zero-fill columns some splits lack.

    Args:
        df1, df2, df3, df4: DataFrames to stack vertically, in this order.

    Returns:
        One DataFrame with a fresh 0..n-1 index and the union of all columns.
        Columns that are missing from at least one split have their NaN
        values replaced by 0; columns present in every split are left as is.
    """
    splits = [df1, df2, df3, df4]

    # Concatenate the DataFrames vertically with an outer join
    combined = pd.concat(splits, axis=0, join='outer', ignore_index=True)

    # A column is "non-shared" when it is present in some split but not in
    # all of them, i.e. union minus intersection of the column sets.
    column_sets = [set(split.columns) for split in splits]
    non_shared_columns = set.union(*column_sets) - set.intersection(*column_sets)

    # Fill NaN values with zeros only for non-shared columns
    for col in non_shared_columns:
        combined[col] = combined[col].fillna(0)

    return combined

def normalize_performance_metric(metric, metric_name):
    """Convert a reported metric value to a float.

    Handles plain numbers, numeric strings, percentages ('85%'), decimal
    commas ('0,85', '85,5%') and dict literals keyed by metric name
    ("{'f1': 0.7}"). For 'f1' and 'accuracy', values above 1 are treated as
    percentages and divided by 100.

    Args:
        metric: Raw value (number, string, None or NaN).
        metric_name: Name of the metric, e.g. 'f1', 'accuracy', 'rouge1'.

    Returns:
        The metric as a float. None for a missing value given as None or as a
        non-float NaN, and for a string containing '[' (a list of values).
        A float NaN is returned unchanged.

    Raises:
        ValueError, SyntaxError: If a string value cannot be parsed.
        KeyError: If a dict literal has no `metric_name` entry.
    """
    ratio_metrics = ('f1', 'accuracy')

    # Numeric input. Integers are accepted too; they used to fall through to
    # the string handling below and fail on the `in` test.
    if isinstance(metric, (int, float, np.integer, np.floating)):
        value = float(metric)
        if metric_name in ratio_metrics and value > 1:
            return value / 100
        return value

    if metric is None or pd.isnull(metric) or '[' in metric:
        return None

    if '%' in metric or ',' in metric:
        # Strip the percent sign and read ',' as the decimal separator; both
        # can occur in the same value (e.g. '85,5%').
        value = float(metric.replace('%', '').replace(',', '.'))
    else:
        parsed = ast.literal_eval(metric)
        value = float(parsed[metric_name]) if isinstance(parsed, dict) else float(metric)

    if metric_name in ratio_metrics and value > 1:
        return value / 100
    if value > 1:
        # Report values that stay above 1 so they can be inspected manually.
        print(value, metric_name)
    return value


def extract_context_info(text):
    """Pull training-context fields out of a model card's text.

    Each field is looked up with a case-insensitive regex of the form
    `<label>: <value>`; the value is the rest of the matching line.

    Args:
        text: Model card text, or None/NaN when there is no card.

    Returns:
        Dict with the keys 'hardware_type', 'hours_used', 'cloud_provider',
        'compute_region', 'carbon_emitted' and 'training_type'. A value is
        the first captured text for that field, or None when the field is not
        found or `text` is missing.
    """
    patterns = {
        'hardware_type': r'[-*]*\s*(?:hardware|Hardware)(?:\s*[Tt]ype)*\s*[:]+\s*(.+)',
        'hours_used': r'[-*]*\s*Hours used\s*[:]+\s*(.+)',
        'cloud_provider': r'[-*]*\s*[Cc]loud [Pp]rovider\s*[:]+\s*(.+)',
        'compute_region': r'[-*]*\s*[Cc]ompute [Rr]egion\s*[:]+\s*(.+)',
        'carbon_emitted': r'[-*]*\s*[Cc]arbon [Ee]mitted\s*[:]+\s*(.+)',
        'training_type': r'[-*]*\s*(?:training|Training)(?:\s*[Tt]ype)*\s*[:]+\s*(.+)',
    }

    if text is None or pd.isnull(text):
        return dict.fromkeys(patterns)

    def first_capture(pattern):
        found = re.search(pattern, text, re.IGNORECASE)
        return found.group(1) if found else None

    return {field: first_capture(pattern) for field, pattern in patterns.items()}

def context_metrics_treatment(df):
    """Back-fill context columns with values parsed from the model card text.

    For 'hardware_used', 'geographical_location' and 'co2_eq_emissions', a
    null cell is replaced by the value extracted from 'modelcard_text',
    unless the extracted text contains a placeholder word ('Unknown',
    'unknown', 'needed', 'Needed'). 'hours_used' and 'cloud_provider' are
    always set from the extracted values.

    Args:
        df: DataFrame with the columns 'modelcard_text', 'hardware_used',
            'geographical_location' and 'co2_eq_emissions'. Modified in place.

    Returns:
        The same DataFrame.
    """
    extracted = [extract_context_info(card) for card in df['modelcard_text']]
    placeholder_words = ['Unknown', 'unknown', 'needed', 'Needed']

    def backfill(column, field):
        # Keep existing values; only null cells take the extracted value.
        filled = []
        for current, info in zip(df[column], extracted):
            candidate = info[field]
            is_placeholder = any(word in str(candidate) for word in placeholder_words)
            if pd.isnull(current) and not is_placeholder:
                filled.append(candidate)
            else:
                filled.append(current)
        return filled

    df['hardware_used'] = backfill('hardware_used', 'hardware_type')
    df['geographical_location'] = backfill('geographical_location', 'compute_region')
    df['co2_eq_emissions'] = backfill('co2_eq_emissions', 'carbon_emitted')
    df['hours_used'] = [info['hours_used'] for info in extracted]
    df['cloud_provider'] = [info['cloud_provider'] for info in extracted]

    return df

def performance_metrics_treatment(df):
    """Normalise the 'f1', 'accuracy', 'rouge1' and 'rougeL' columns in place.

    Each value is passed through `normalize_performance_metric` together with
    the name of its column.

    Returns:
        The same DataFrame.
    """
    for metric_name in ('f1', 'accuracy', 'rouge1', 'rougeL'):
        df[metric_name] = df[metric_name].apply(
            normalize_performance_metric, args=(metric_name,)
        )
    return df


def harmonize_co2(co2):
    """Reduce a reported CO2 value to a float.

    Args:
        co2: A float (returned unchanged, NaN included), a missing value, or
            a string.

    Returns:
        The float itself; None for a non-float missing value; for a string,
        the number it starts with, or None when it does not start with a
        digit.
    """
    if isinstance(co2, float):
        return co2
    if pd.isnull(co2):
        return None

    # re.match anchors at the start, so only a leading number is picked up.
    leading_number = re.match(r'\d+\.\d+|\d+', co2)
    return float(leading_number.group(0)) if leading_number is not None else None


def min_max_normalize(series):
    """Rescale `series` linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    value_range = series.max() - series.min()
    return (series - lowest) / value_range

def performance_score(df):
    """Compute one performance score per row from the reported metrics.

    The 'f1', 'accuracy', 'rouge1' and 'rougeL' columns of `df` are first
    overwritten in place with their min-max normalised values. The score of
    a row is the harmonic mean of those metrics that are not NaN.

    Args:
        df: DataFrame holding the four metric columns. Modified in place.

    Returns:
        A Series of scores aligned with `df`.
    """
    metrics = ['accuracy', 'f1', 'rouge1', 'rougeL']

    for column in ('f1', 'accuracy', 'rouge1', 'rougeL'):
        df[column] = min_max_normalize(df[column])

    def harmonic_mean_of_reported(row):
        reported = [row[name] for name in metrics if not np.isnan(row[name])]
        return stats.hmean(reported)

    return df.apply(harmonic_mean_of_reported, axis=1)

In [5]:
# Normalise the performance metrics and back-fill context fields from the model card text.
df = performance_metrics_treatment(df)
df = context_metrics_treatment(df)

#Curation
df['modelId'] = df['modelId'].apply(lambda x: x.split('/')[1] if len(x.split('/')) > 1 else x) #remove author from modelId
df['co2_eq_emissions'] = df['co2_eq_emissions'].apply(lambda co2: harmonize_co2(co2)) #harmonize plenty of co2 values
df['tags'] = df['tags'].apply(lambda tags: ast.literal_eval(tags) if not isinstance(tags, list) else tags) # parse the raw tag lists once
# Languages have to be read from the raw tags: tags_treatment() removes every tag listed in
# tags_metadata['languages'], so extracting them afterwards always produced an empty list.
languages = df['tags'].apply(lambda tags: set_language(tags, tags_metadata['languages']))
df['tags'] = df['tags'].apply(lambda tags: tags_treatment(tags, tags_metadata)) # filter and treat tags
df['lastModified'] = pd.to_datetime(df['lastModified']) # convert the 'lastModified' column to datetime objects
df['created_at'] = pd.to_datetime(df['created_at']) # convert the 'created_at' column to datetime objects
df['library_name'] = df.apply(lambda row: set_library(row['library_name'], row['tags'], tags_metadata['libraries']), axis=1) # adds libraries used by model
df["datasets_size"] = df["datasets_size"].replace(0, np.nan)
df = df[df['co2_eq_emissions'] != 0]


#Feature Engineering
df['co2_reported'] = df['co2_eq_emissions'].apply(lambda x: 0 if pd.isnull(x) or x is None else 1) 
# NOTE(review): licenses are read from the already filtered tags, which no longer contain
# 'license:'-prefixed entries - confirm tags_metadata['licenses'] holds un-prefixed names.
df['license'] = df['tags'].apply(lambda tags: set_license(tags, tags_metadata['licenses']))
df['language'] = languages # assigned by index, so rows dropped above are simply not picked up
df['domain'] = df['tags'].apply(lambda tags: assign_model_domain(tags, tags_metadata['tags_to_domain']))
df['year_month'] = df['created_at'].apply(lambda x: x.strftime('%Y-%m')) #  column 'year_month' to group the data monthly
df['size_efficency'] = df['size'] / df['co2_eq_emissions']
df['performance_score'] = performance_score(df)

# we split, one-hot encode and then combine the splits to avoid memory overflow from one-hot encoding the whole dataset at once
# NOTE(review): the tag frequency threshold (100) is applied to each split separately, not to the whole dataset.
df1,df2,df3,df4 = split_df(df)
df1,df2,df3,df4 = one_hot_tags(df1), one_hot_tags(df2), one_hot_tags(df3), one_hot_tags(df4)
df1,df2,df3,df4 = select_top_tags(df1, 100), select_top_tags(df2, 100), select_top_tags(df3, 100), select_top_tags(df4, 100)
df = concat_dataset_splits(df1,df2,df3,df4)

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


: 

: 

In [None]:
# Persist the processed dataset. The index is written as well, so it comes
# back as an 'Unnamed: 0' column when the file is read again below.
df.to_csv('../../datasets/HFTotalProcessed.csv')

## Preprocessing of CO2 data

We join the CO2 subset of the preprocessed raw data with the cleaned dataset and continue the preprocessing on the CO2 data.

In [14]:
def read_df_processed():
    """Load HFTotalProcessed.csv and restore its list-valued columns.

    Returns:
        The DataFrame without the 'Unnamed: 0.1' and 'Unnamed: 0' columns,
        with 'library_name' and 'datasets' converted to Python lists.
    """
    def libraries_as_list(libraries):
        return libraries if isinstance(libraries, list) else ast.literal_eval(libraries)

    def datasets_as_list(datasets):
        if pd.isnull(datasets):
            return ['']
        if '[' not in datasets:
            # A bare dataset name rather than a list literal.
            return [datasets]
        return ast.literal_eval(datasets)

    df = pd.read_csv('../../datasets/HFTotalProcessed.csv')
    df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    df['library_name'] = df['library_name'].apply(libraries_as_list)
    df['datasets'] = df['datasets'].apply(datasets_as_list)
    return df

# Reload the processed dataset written by the previous section.
df = read_df_processed()

def read_df_clean():
    """Load HFClean.csv and restore its list-valued columns.

    Returns:
        The DataFrame without the 'Unnamed: 0' column, with 'library_name'
        and 'datasets' converted to Python lists.
    """
    def libraries_as_list(libraries):
        return libraries if isinstance(libraries, list) else ast.literal_eval(libraries)

    def datasets_as_list(datasets):
        if pd.isnull(datasets):
            return ['']
        if '[' not in datasets:
            # A bare dataset name rather than a list literal.
            return [datasets]
        return ast.literal_eval(datasets)

    df = pd.read_csv('../../datasets/HFClean.csv')
    df = df.drop(columns=['Unnamed: 0'])
    df['library_name'] = df['library_name'].apply(libraries_as_list)
    df['datasets'] = df['datasets'].apply(datasets_as_list)
    return df

# Load the cleaned dataset.
df_clean = read_df_clean()

# Keep only the rows flagged as reporting CO2 emissions.
reports_co2 = df['co2_reported'].eq(True)
df = df[reports_co2]


  df = pd.read_csv('../../datasets/HFTotalProcessed.csv')


In [15]:
# Discard the one-hot tag columns (prefixed with 'is_').
wanted_columns = []
for col in df.columns:
    if not col.startswith('is_'):
        wanted_columns.append(col)
df = df[wanted_columns]

In [16]:
df1 = df
df2 = df_clean

# Merge the dataframes. Columns present on both sides keep their name for the
# df1 value and get a '_y' suffix for the df2 (HFClean.csv) value.
merged = df1.merge(df2, on='modelId', how='left', suffixes=('', '_y'))

# Prefer the HFClean.csv value whenever it is not null.
# The overlapping columns are walked explicitly: the previous check
# (`'_y' in column` followed by `.replace('_y', '')`) also hit names that merely
# contain '_y', e.g. 'year_month' was turned into 'earmonth'.
for column in df2.columns:
    clean_column = column + '_y'
    if column == 'modelId' or column not in df1.columns or clean_column not in merged.columns:
        continue
    merged[column] = merged[clean_column].where(merged[clean_column].notnull(), merged[column])

# Drop _y columns
df = merged[df1.columns]

In [17]:
def combine_sources(source, auto):
    """Map a free-text emissions source to one of a few fixed labels.

    Args:
        source: Source description string.
        auto: Truthy when the model is flagged as `auto`; this takes
            precedence over `source`.

    Returns:
        'AutoTrain', 'Code Carbon', 'MLCO2', 'Article',
        'Google Cloud Footprint' or 'Not Specified'.
    """
    if auto:
        return 'AutoTrain'
    if source == 'code carbon':
        return 'Code Carbon'

    # Checked in order; the first label with a keyword found in `source` wins.
    keyword_labels = (
        (('mlco2', 'ML CO2'), 'MLCO2'),
        (('BLOOM',), 'Article'),
        (('Google Cloud',), 'Google Cloud Footprint'),
    )
    for keywords, label in keyword_labels:
        if any(keyword in source for keyword in keywords):
            return label

    return 'Not Specified'

def combine_location(location):
    """Collapse variants of the same location string into one spelling.

    Any value containing 'East US' becomes 'East US', and the Frankfurt entry
    carrying a carbon-intensity note loses that note. Other values are
    returned unchanged.
    """
    if 'East US' in location:
        return 'East US'
    is_annotated_frankfurt = location == 'Frankfurt an Main, Germany (500-600 gCO2eq/kWh)'
    return 'Frankfurt an Main, Germany' if is_annotated_frankfurt else location


def combine_training_type(training_type):
    """Map a free-text training type to 'fine-tuning', 'pretraining' or 'Not Specified'.

    The match is a case-sensitive substring test: 'fine' is checked before
    'pre'.
    """
    for marker, label in (('fine', 'fine-tuning'), ('pre', 'pretraining')):
        if marker in training_type:
            return label
    return 'Not Specified'

def create_performance_metrics(row):
    """Collect the 'accuracy', 'f1', 'rouge1' and 'rougeL' values of `row` in a dict."""
    metric_names = ('accuracy', 'f1', 'rouge1', 'rougeL')
    return {name: row[name] for name in metric_names}

In [18]:
# Replace missing categorical values with an explicit label.
for column in ('domain', 'training_type', 'source', 'geographical_location', 'hardware_used'):
    df[column] = df[column].fillna('Not Specified')

# Harmonise the free-text categories.
df['source'] = df.apply(lambda row: combine_sources(row['source'], row['auto']), axis=1)
df['geographical_location'] = df['geographical_location'].apply(combine_location)
df['training_type'] = df['training_type'].apply(combine_training_type)

# Efficiency ratios relative to the reported emissions.
df['size_efficency'] = df['size'].div(df['co2_eq_emissions'])
df['datasets_size_efficency'] = df['datasets_size'].div(df['co2_eq_emissions'])

# Counters and flags are stored as integers.
for column in ('downloads', 'likes', 'co2_reported'):
    df[column] = df[column].astype(int)

# Keep only the calendar date of the creation timestamp.
df['created_at'] = pd.to_datetime(df['created_at']).dt.date
df['performance_metrics'] = df.apply(create_performance_metrics, axis=1)

In [19]:
# Columns of the final CO2 dataset, in output order
# ('hardware_used' is exported under the name 'environment').
wanted_columns = [
    'modelId', 'datasets', 'datasets_size', 'co2_eq_emissions', 'co2_reported',
    'source', 'training_type', 'geographical_location', 'environment',
    'performance_metrics', 'performance_score',
    'downloads', 'likes', 'library_name', 'domain', 'size', 'created_at',
    'size_efficency', 'datasets_size_efficency',
]
df = df.rename(columns={'hardware_used': 'environment'})
df = df[wanted_columns]

In [20]:
# Write the final CO2 dataset, without the index column.
df.to_csv('../../datasets/HFCO2.csv', index=False)

In [None]:
dd  # NOTE(review): `dd` is not defined anywhere in the visible notebook; looks like a leftover scratch cell that raises NameError when run - confirm and remove.