In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Presence score given to a species that was not observed by a survey itself
# but by another survey falling in the same rounded-coordinate cell, keyed by
# the number of decimals used for the rounding.
precision_to_presence = {4:0.95, 3:0.9, 2:0.7, 1:0.4, 0:0.1}

def add_round_data(file_path: str, precision: int, headlines: int = 20000,
                   offset: int = 0) -> pd.DataFrame:
    """Add species seen by nearby surveys to each survey's species list.

    Rows ``offset`` to ``offset + headlines`` of the ``;``-separated CSV are
    loaded. Coordinates are rounded to ``precision`` decimals, and every
    species recorded by a *different* survey sharing the same rounded
    coordinates is added to the survey with the presence score
    ``precision_to_presence[precision]``. Pairs that were actually observed
    keep ``presence == 1``.

    Args:
        file_path: Path of the CSV file; it must contain the columns
            ``surveyId``, ``speciesId``, ``lat`` and ``lon``.
        precision: Number of decimals used to round the coordinates; must be
            a key of ``precision_to_presence``.
        headlines: Number of data rows to use (non-negative).
        offset: Number of leading data rows to skip before the window.

    Returns:
        A DataFrame with columns ``surveyId``, ``speciesId``, ``lat``,
        ``lon`` and ``presence``, one row per (surveyId, speciesId) pair,
        sorted by ``surveyId``.

    Raises:
        KeyError: If ``precision`` has no entry in ``precision_to_presence``
            or a required column is missing from the file.
    """
    # Look the score up first so that an unsupported precision fails before
    # any file is read.
    neighbour_presence = precision_to_presence[precision]

    key_cols = ['surveyId', 'speciesId']
    cell_cols = ['lat_arrondi', 'lon_arrondi']

    # nrows stops the parser at the end of the requested window instead of
    # loading the whole file and discarding most of it.
    observed = pd.read_csv(file_path, delimiter=";", nrows=offset + headlines)
    observed = observed.iloc[offset:offset + headlines]

    # Explicit copy: columns are added below and must not be written to a
    # slice of the parsed frame.
    observed = observed[['surveyId', 'speciesId', 'lat', 'lon']].copy()
    observed['presence'] = 1

    # Round the coordinates; equal rounded values define a shared cell.
    observed['lat_arrondi'] = observed['lat'].round(precision)
    observed['lon_arrondi'] = observed['lon'].round(precision)

    # Join survey locations with the species found in each cell. Both sides
    # are de-duplicated first: joining the raw rows with themselves yields one
    # row per pair of observations in a cell, only to be dropped afterwards.
    survey_sites = observed[['surveyId', 'lat', 'lon'] + cell_cols].drop_duplicates()
    cell_species = observed[key_cols + cell_cols].drop_duplicates()
    neighbours = pd.merge(survey_sites, cell_species, on=cell_cols,
                          suffixes=('', '_proche'))

    # Keep only species contributed by another survey.
    neighbours = neighbours[neighbours['surveyId'] != neighbours['surveyId_proche']]

    # Keep the receiving survey's own coordinates and the borrowed species,
    # one row per (surveyId, speciesId) pair.
    neighbours = neighbours[['surveyId', 'lat', 'lon', 'speciesId']]
    neighbours = neighbours.drop_duplicates(subset=key_cols)
    neighbours = neighbours.assign(presence=neighbour_presence)

    # Observed rows come first, so when a pair is both observed and inferred
    # drop_duplicates (keep='first') retains the observed one (presence 1).
    combined = pd.concat([observed, neighbours], ignore_index=True)
    combined = combined.drop_duplicates(subset=key_cols)

    # Sort by survey and drop the helper columns holding the rounded
    # coordinates.
    combined = combined.sort_values(by='surveyId')
    return combined.drop(columns=cell_cols)


def merge_rounded_datasets(df_1: pd.DataFrame, df_2: pd.DataFrame):
    """Append to ``df_1`` the rows of ``df_2`` whose key is not in ``df_1``.

    A row's key is its (surveyId, speciesId) pair. Rows of ``df_1`` are all
    kept as they are; rows of ``df_2`` are only added when their key does not
    occur in ``df_1``. Neither input is modified.

    Args:
        df_1: Reference DataFrame; takes precedence on shared keys.
        df_2: DataFrame supplying the additional rows.

    Returns:
        A new DataFrame sorted by ``surveyId``.
    """
    key_cols = ['surveyId', 'speciesId']

    # (surveyId, speciesId) keys of each frame as a MultiIndex
    keys_1 = df_1.set_index(key_cols).index
    keys_2 = df_2.set_index(key_cols).index

    # Keys that only df_2 knows about
    only_in_2 = keys_2.difference(keys_1)

    # Rows of df_2 carrying one of those keys
    extra_rows = df_2[keys_2.isin(only_in_2)]

    combined = pd.concat([df_1, extra_rows], ignore_index=True)
    return combined.sort_values(by='surveyId')

def create_datasets_with_presence(metadata_path : str, headlines : int = 20000,
                                  precisions=(4, 3, 2)):
    """Build one presence table combining several rounding precisions.

    ``add_round_data`` is run once per precision and the results are combined
    with ``merge_rounded_datasets`` from the finest precision to the coarsest,
    so a (surveyId, speciesId) pair keeps the row produced at the finest
    precision at which it appears.

    Args:
        metadata_path: Path of the metadata CSV handed to ``add_round_data``.
        headlines: Number of rows handed to ``add_round_data``.
        precisions: Rounding precisions to combine, in any order.

    Returns:
        A DataFrame with columns ``surveyId``, ``speciesId``, ``lat``,
        ``lon`` and ``presence``.

    Raises:
        ValueError: If ``precisions`` is empty.
    """
    # Finest precision first: merge_rounded_datasets never overwrites a key
    # that is already present, so earlier (finer) levels win.
    ordered = sorted(set(precisions), reverse=True)
    if not ordered:
        raise ValueError("precisions must contain at least one value")

    merged = add_round_data(metadata_path, ordered[0], headlines)
    for precision in ordered[1:]:
        coarser = add_round_data(metadata_path, precision, headlines)
        merged = merge_rounded_datasets(merged, coarser)

    # Return a copy so that callers can add columns without writing to a
    # slice of the merged frame.
    cols = ['surveyId', 'speciesId', 'lat', 'lon','presence']
    return merged[cols].copy()


def create_datasets_with_classes(metadata_path : str, headlines : int = 20000,
                                 output_path : str = 'metadata_with_classes_big.csv'):
    """Turn the presence scores into 0/1 class columns and save them as CSV.

    The ``presence`` column returned by ``create_datasets_with_presence`` is
    replaced by five integer columns: ``absence`` (always 0),
    ``presence_a_2_digit``, ``presence_a_3_digit``, ``presence_a_4_digit``
    (1 when the score is the one ``precision_to_presence`` associates with
    2, 3 or 4 decimals) and ``presence_seule`` (1 when the score is 1).

    Args:
        metadata_path: Path of the metadata CSV.
        headlines: Number of rows handed to ``create_datasets_with_presence``.
        output_path: Path of the CSV file to write (without the index).

    Returns:
        None; the result is only written to ``output_path``.
    """
    data = create_datasets_with_presence(metadata_path, headlines)
    presence = data['presence']

    # Vectorised comparisons against the scores defined in
    # precision_to_presence, so the class columns cannot drift from the values
    # actually written by add_round_data. assign() builds a new frame instead
    # of mutating the one returned above.
    data = data.drop(columns=['presence']).assign(
        absence=0,
        presence_a_2_digit=(presence == precision_to_presence[2]).astype(int),
        presence_a_3_digit=(presence == precision_to_presence[3]).astype(int),
        presence_a_4_digit=(presence == precision_to_presence[4]).astype(int),
        presence_seule=(presence == 1).astype(int),
    )

    data.to_csv(output_path, index=False)

    return
    
    

# Uncomment to (re)generate the CSV file that is read below.
#create_datasets_with_classes("/home/dakbarin/data/data/GEOLIFECLEF/GLC24_PA_metadata_train.csv", 1000000)

# Load the saved table and keep the rows flagged in 'presence_a_3_digit'
df_0 = pd.read_csv('metadata_with_classes_big.csv')
filtered_df = df_0[df_0['presence_a_3_digit'].eq(1)]

# Show the filtered DataFrame
print(filtered_df)

         surveyId  speciesId       lat       lon  absence  presence_a_2_digit  \
227           489      469.0  47.01170  12.44991        0                   0   
228           489     7972.0  47.01170  12.44991        0                   0   
229           489     3597.0  47.01170  12.44991        0                   0   
230           489     7115.0  47.01170  12.44991        0                   0   
231           489     2564.0  47.01170  12.44991        0                   0   
...           ...        ...       ...       ...      ...                 ...   
2532228   2658606    10065.0  57.04480   8.58434        0                   0   
2532229   2658606     5386.0  57.04480   8.58434        0                   0   
2532231   2658606     7978.0  57.04480   8.58434        0                   0   
2532232   2658606    10940.0  57.04480   8.58434        0                   0   
2532611   2659215      963.0  55.12624  14.90098        0                   0   

         presence_a_3_digit

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Presence score given to a species that was not observed by a survey itself
# but by another survey falling in the same rounded-coordinate cell, keyed by
# the number of decimals used for the rounding.
precision_to_presence = {4:0.95, 3:0.9, 2:0.7, 1:0.4, 0:0.1}

def add_round_data(file_path: str, precision: int, headlines: int = 20000,
                   offset: int = 1000000) -> pd.DataFrame:
    """Add species seen by nearby surveys to each survey's species list.

    Rows ``offset`` to ``offset + headlines`` of the ``;``-separated CSV are
    loaded. Coordinates are rounded to ``precision`` decimals, and every
    species recorded by a *different* survey sharing the same rounded
    coordinates is added to the survey with the presence score
    ``precision_to_presence[precision]``. Pairs that were actually observed
    keep ``presence == 1``.

    Args:
        file_path: Path of the CSV file; it must contain the columns
            ``surveyId``, ``speciesId``, ``lat`` and ``lon``.
        precision: Number of decimals used to round the coordinates; must be
            a key of ``precision_to_presence``.
        headlines: Number of data rows to use (non-negative).
        offset: Number of leading data rows to skip before the window; the
            default keeps the previously hard-coded start row 1000000.

    Returns:
        A DataFrame with columns ``surveyId``, ``speciesId``, ``lat``,
        ``lon`` and ``presence``, one row per (surveyId, speciesId) pair,
        sorted by ``surveyId``. It is empty when the file has no more than
        ``offset`` data rows.

    Raises:
        KeyError: If ``precision`` has no entry in ``precision_to_presence``
            or a required column is missing from the file.
    """
    # Look the score up first so that an unsupported precision fails before
    # any file is read.
    neighbour_presence = precision_to_presence[precision]

    key_cols = ['surveyId', 'speciesId']
    cell_cols = ['lat_arrondi', 'lon_arrondi']

    # nrows stops the parser at the end of the requested window instead of
    # loading the whole file and discarding what follows it.
    observed = pd.read_csv(file_path, delimiter=";", nrows=offset + headlines)
    observed = observed.iloc[offset:offset + headlines]

    # Explicit copy: columns are added below and must not be written to a
    # slice of the parsed frame.
    observed = observed[['surveyId', 'speciesId', 'lat', 'lon']].copy()
    observed['presence'] = 1

    # Round the coordinates; equal rounded values define a shared cell.
    observed['lat_arrondi'] = observed['lat'].round(precision)
    observed['lon_arrondi'] = observed['lon'].round(precision)

    # Join survey locations with the species found in each cell. Both sides
    # are de-duplicated first: joining the raw rows with themselves yields one
    # row per pair of observations in a cell, only to be dropped afterwards.
    survey_sites = observed[['surveyId', 'lat', 'lon'] + cell_cols].drop_duplicates()
    cell_species = observed[key_cols + cell_cols].drop_duplicates()
    neighbours = pd.merge(survey_sites, cell_species, on=cell_cols,
                          suffixes=('', '_proche'))

    # Keep only species contributed by another survey.
    neighbours = neighbours[neighbours['surveyId'] != neighbours['surveyId_proche']]

    # Keep the receiving survey's own coordinates and the borrowed species,
    # one row per (surveyId, speciesId) pair.
    neighbours = neighbours[['surveyId', 'lat', 'lon', 'speciesId']]
    neighbours = neighbours.drop_duplicates(subset=key_cols)
    neighbours = neighbours.assign(presence=neighbour_presence)

    # Observed rows come first, so when a pair is both observed and inferred
    # drop_duplicates (keep='first') retains the observed one (presence 1).
    combined = pd.concat([observed, neighbours], ignore_index=True)
    combined = combined.drop_duplicates(subset=key_cols)

    # Sort by survey and drop the helper columns holding the rounded
    # coordinates.
    combined = combined.sort_values(by='surveyId')
    return combined.drop(columns=cell_cols)

def merge_rounded_datasets(df_1: pd.DataFrame, df_2: pd.DataFrame):
    """Append to ``df_1`` the rows of ``df_2`` whose key is not in ``df_1``.

    A row's key is its (surveyId, speciesId) pair. Rows of ``df_1`` are all
    kept as they are; rows of ``df_2`` are only added when their key does not
    occur in ``df_1``. Neither input is modified.

    Args:
        df_1: Reference DataFrame; takes precedence on shared keys.
        df_2: DataFrame supplying the additional rows.

    Returns:
        A new DataFrame sorted by ``surveyId``.
    """
    key_cols = ['surveyId', 'speciesId']

    # (surveyId, speciesId) keys of each frame as a MultiIndex
    keys_1 = df_1.set_index(key_cols).index
    keys_2 = df_2.set_index(key_cols).index

    # Keys that only df_2 knows about
    only_in_2 = keys_2.difference(keys_1)

    # Rows of df_2 carrying one of those keys
    extra_rows = df_2[keys_2.isin(only_in_2)]

    combined = pd.concat([df_1, extra_rows], ignore_index=True)
    return combined.sort_values(by='surveyId')

def create_datasets_with_presence(metadata_path : str, headlines : int = 20000,
                                  precisions=(4, 3, 2)):
    """Build one presence table combining several rounding precisions.

    ``add_round_data`` is run once per precision and the results are combined
    with ``merge_rounded_datasets`` from the finest precision to the coarsest,
    so a (surveyId, speciesId) pair keeps the row produced at the finest
    precision at which it appears.

    Args:
        metadata_path: Path of the metadata CSV handed to ``add_round_data``.
        headlines: Number of rows handed to ``add_round_data``.
        precisions: Rounding precisions to combine, in any order.

    Returns:
        A DataFrame with columns ``surveyId``, ``speciesId``, ``lat``,
        ``lon`` and ``presence``.

    Raises:
        ValueError: If ``precisions`` is empty.
    """
    # Finest precision first: merge_rounded_datasets never overwrites a key
    # that is already present, so earlier (finer) levels win.
    ordered = sorted(set(precisions), reverse=True)
    if not ordered:
        raise ValueError("precisions must contain at least one value")

    merged = add_round_data(metadata_path, ordered[0], headlines)
    for precision in ordered[1:]:
        coarser = add_round_data(metadata_path, precision, headlines)
        merged = merge_rounded_datasets(merged, coarser)

    # Return a copy so that callers can add columns without writing to a
    # slice of the merged frame.
    cols = ['surveyId', 'speciesId', 'lat', 'lon','presence']
    return merged[cols].copy()


def create_datasets_with_classes(metadata_path : str, headlines : int = 200000,
                                 output_path : str = 'metadata_with_classes_test.csv'):
    """Turn the presence scores into 0/1 class columns and save them as CSV.

    The ``presence`` column returned by ``create_datasets_with_presence`` is
    replaced by five integer columns: ``absence`` (always 0),
    ``presence_a_2_digit``, ``presence_a_3_digit``, ``presence_a_4_digit``
    (1 when the score is the one ``precision_to_presence`` associates with
    2, 3 or 4 decimals) and ``presence_seule`` (1 when the score is 1).

    Args:
        metadata_path: Path of the metadata CSV.
        headlines: Number of rows handed to ``create_datasets_with_presence``.
        output_path: Path of the CSV file to write (without the index).

    Returns:
        None; the result is only written to ``output_path``.
    """
    data = create_datasets_with_presence(metadata_path, headlines)
    presence = data['presence']

    # Vectorised comparisons against the scores defined in
    # precision_to_presence, so the class columns cannot drift from the values
    # actually written by add_round_data. assign() builds a new frame instead
    # of mutating the one returned above.
    data = data.drop(columns=['presence']).assign(
        absence=0,
        presence_a_2_digit=(presence == precision_to_presence[2]).astype(int),
        presence_a_3_digit=(presence == precision_to_presence[3]).astype(int),
        presence_a_4_digit=(presence == precision_to_presence[4]).astype(int),
        presence_seule=(presence == 1).astype(int),
    )

    data.to_csv(output_path, index=False)

    return
    
    

# Generate 'metadata_with_classes_test.csv' from 10000 rows of the metadata file
create_datasets_with_classes("/home/dakbarin/data/data/GEOLIFECLEF/GLC24_PA_metadata_train.csv", 10000)

# Reload the saved table (once) and keep the rows flagged in 'presence_seule'
df_0 = pd.read_csv('metadata_with_classes_test.csv')
filtered_df = df_0[(df_0['presence_seule'] == 1)]

# Show the filtered DataFrame
print(filtered_df)