In [24]:
import pandas as pd
import ast

pd.options.mode.chained_assignment = None

In [25]:
def read_df_processed():
    """Load HFTotalProcessed.csv and decode its list-valued columns.

    Reads '../datasets/HFTotalProcessed.csv', drops the 'Unnamed: 0.1' and
    'Unnamed: 0' columns, and converts the 'library_name' and 'datasets'
    cells into Python lists.

    Returns:
        pandas.DataFrame: the loaded frame with both columns holding lists.
    """

    def _parse_libraries(raw):
        # Values that are already lists pass through; anything else is parsed
        # as a Python literal (e.g. the text "['transformers']").
        if isinstance(raw, list):
            return raw
        return ast.literal_eval(raw)

    def _parse_datasets(raw):
        # Missing cell -> [''], bracketed text -> parsed literal,
        # bare text -> single-element list.
        if pd.isnull(raw):
            return ['']
        if '[' in raw:
            return ast.literal_eval(raw)
        return [raw]

    frame = pd.read_csv('../datasets/HFTotalProcessed.csv')
    frame = frame.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    frame['library_name'] = frame['library_name'].apply(_parse_libraries)
    frame['datasets'] = frame['datasets'].apply(_parse_datasets)

    return frame

# Load the processed export into `df`, the frame the following cells keep reassigning.
df = read_df_processed()

def read_df_clean():
    """Load HFClean.csv and decode its list-valued columns.

    Reads '../datasets/HFClean.csv', drops the 'Unnamed: 0' column, and
    converts the 'library_name' and 'datasets' cells into Python lists.

    Returns:
        pandas.DataFrame: the loaded frame with both columns holding lists.
    """

    def _libraries_as_list(cell):
        # Already-materialised lists are kept; other values are parsed as a
        # Python literal.
        return cell if isinstance(cell, list) else ast.literal_eval(cell)

    def _datasets_as_list(cell):
        # Missing -> [''], text without '[' -> wrapped in a list,
        # text with '[' -> parsed as a Python literal.
        if pd.isnull(cell):
            return ['']
        if '[' not in cell:
            return [cell]
        return ast.literal_eval(cell)

    clean = pd.read_csv('../datasets/HFClean.csv')
    clean = clean.drop(columns=['Unnamed: 0'])
    clean['library_name'] = clean['library_name'].apply(_libraries_as_list)
    clean['datasets'] = clean['datasets'].apply(_datasets_as_list)

    return clean

# Load the HFClean export; a later cell merges it onto `df` by modelId.
df_clean = read_df_clean()
# Keep only the rows of the processed frame whose co2_reported flag equals True.
df = df[df['co2_reported'] == True]


  df = pd.read_csv('../datasets/HFTotalProcessed.csv')


In [26]:
# Discard every column whose name begins with 'is_'; the remaining columns keep their order.
wanted_columns = list(filter(lambda name: not name.startswith('is_'), df.columns))
df = df[wanted_columns]

In [27]:
df1 = df
df2 = df_clean

# Left-join the HFClean rows onto the working frame by modelId. Columns that exist
# in both frames keep their plain name for the df1 side and receive a '_y' suffix
# for the df2 (HFClean) side.
merged = df1.merge(df2, on='modelId', how='left', suffixes=('', '_y'))

# For every shared column, take the HFClean value when it is non-null and fall back
# to the df1 value otherwise. The shared columns are derived from the two frames
# themselves instead of searching names for the substring '_y', so a column that
# merely contains '_y' somewhere in its name is never mistaken for a suffixed
# duplicate (which previously could overwrite the wrong column or raise KeyError).
shared_columns = [name for name in df2.columns if name in df1.columns and name != 'modelId']
for column in shared_columns:
    clean_values = merged[column + '_y']
    merged[column] = clean_values.where(clean_values.notnull(), merged[column])

# Keep only the original df1 columns, which also drops the '_y' helper columns.
df = merged[df1.columns]

In [28]:
def combine_sources(source, auto):
    """Map a raw emissions-source description to one canonical label.

    Args:
        source: Free-text source description. Missing values (None/NaN) and any
            other non-string value are treated as unspecified.
        auto: Flag for the row; a truthy, non-missing value wins over ``source``
            and yields 'AutoTrain'.

    Returns:
        str: one of 'AutoTrain', 'Code Carbon', 'MLCO2', 'Article',
        'Google Cloud Footprint' or 'Not Specified'.
    """
    # NaN is truthy in Python, so a missing flag has to be excluded explicitly;
    # otherwise rows without an `auto` value would be labelled 'AutoTrain'.
    if not pd.isna(auto) and auto:
        return 'AutoTrain'

    # The substring tests below need text; a missing source would raise TypeError.
    if not isinstance(source, str):
        return 'Not Specified'

    if source == 'code carbon':
        return 'Code Carbon'
    if 'mlco2' in source or 'ML CO2' in source:
        return 'MLCO2'
    if 'BLOOM' in source:
        return 'Article'
    if 'Google Cloud' in source:
        return 'Google Cloud Footprint'

    return 'Not Specified'

def combine_location(location):
    """Collapse known variants of a location string into one canonical label.

    Any value containing 'East US' becomes 'East US'; the Frankfurt entry that
    carries a carbon-intensity annotation loses that annotation; every other
    value is returned unchanged.
    """
    if 'East US' in location:
        return 'East US'

    annotated_frankfurt = 'Frankfurt an Main, Germany (500-600 gCO2eq/kWh)'
    return 'Frankfurt an Main, Germany' if location == annotated_frankfurt else location



def combine_training_type(training_type):
    """Normalise a free-text training type to a canonical label.

    Returns 'fine-tuning' when the text contains 'fine', otherwise
    'pretraining' when it contains 'pre', otherwise 'Not Specified'.
    """
    # Checked in order, so text mentioning both markers resolves to 'fine-tuning'.
    marker_to_label = (
        ('fine', 'fine-tuning'),
        ('pre', 'pretraining'),
    )
    for marker, label in marker_to_label:
        if marker in training_type:
            return label

    return 'Not Specified'

def create_performance_metrics(row):
    """Collect a row's 'accuracy', 'f1', 'rouge1' and 'rougeL' values into one dict."""
    metric_names = ('accuracy', 'f1', 'rouge1', 'rougeL')
    return {metric: row[metric] for metric in metric_names}



In [29]:
# Label missing categorical values explicitly so the string-based normalisers
# applied next always receive text.
for column in ('domain', 'training_type', 'source', 'geographical_location', 'hardware_used'):
    df[column] = df[column].fillna('Not Specified')

# Collapse free-text values into canonical labels.
df['source'] = df.apply(lambda record: combine_sources(record['source'], record['auto']), axis=1)
df['geographical_location'] = df['geographical_location'].apply(combine_location)
df['training_type'] = df['training_type'].apply(combine_training_type)

# Divide the size and datasets_size columns by co2_eq_emissions.
df['size_efficency'] = df['size'] / df['co2_eq_emissions']
df['datasets_size_efficency'] = df['datasets_size'] / df['co2_eq_emissions']

# Store the counters and the reporting flag as integers.
for column in ('downloads', 'likes', 'co2_reported'):
    df[column] = df[column].astype(int)

# Parse the creation timestamp and keep only its calendar date.
df['created_at'] = pd.to_datetime(df['created_at']).dt.date

# Bundle the individual metric columns into one dict per row.
df['performance_metrics'] = df.apply(create_performance_metrics, axis=1)

In [30]:
# Rename 'hardware_used' to 'environment', the name the final column selection uses.
df = df.rename(columns={'hardware_used': 'environment'})

Let's delete unnecessary attributes

In [31]:
# Columns kept in the exported dataset, listed in output order.
wanted_columns = [
    'modelId',
    'datasets',
    'datasets_size',
    'co2_eq_emissions',
    'co2_reported',
    'source',
    'training_type',
    'geographical_location',
    'environment',
    'performance_metrics',
    'performance_score',
    'downloads',
    'likes',
    'library_name',
    'domain',
    'size',
    'created_at',
    'size_efficency',
    'datasets_size_efficency',
    'auto',
]

df = df.loc[:, wanted_columns]

In [32]:
# Write the final frame to CSV; index=False keeps the row index out of the file.
df.to_csv('../datasets/HFCO2.csv', index=False)