## Partie 1 : Installation et Configuration (15 min)

### 1.1 Importation des bibliothèques nécessaires

In [9]:
# Standard library
import os
import warnings
from datetime import datetime, timedelta

# Data
import numpy as np
import pandas as pd

# InfluxDB
from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write_api import SYNCHRONOUS

# Kaggle
import kagglehub

# Visualisation
import folium
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from folium.plugins import HeatMap, MarkerCluster

warnings.filterwarnings('ignore')

# Select the 'seaborn-v0_8-darkgrid' style sheet for the matplotlib figures
# created after this line.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

### 1.2 Configuration InfluxDB

In [4]:
# InfluxDB connection settings.
# Every value can be overridden through an environment variable of the same
# name, so real credentials never have to be saved inside the notebook. The
# fallbacks are the values this notebook was originally written with.
INFLUX_URL = os.environ.get("INFLUX_URL", "http://influxdb2:8086")
INFLUX_TOKEN = os.environ.get("INFLUX_TOKEN", "admin-token")
INFLUX_ORG = os.environ.get("INFLUX_ORG", "fil-A3-back-bigData")
INFLUX_BUCKET = os.environ.get("INFLUX_BUCKET", "animal-tracking")

# Client used by the cells below
client = InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG)

# APIs
write_api = client.write_api(write_options=SYNCHRONOUS)
query_api = client.query_api()

# Only announce a connection once ping() has reported that the server answers;
# the status itself is still printed last, as before.
is_reachable = client.ping()

print("Client InfluxDB connecté" if is_reachable else "Client InfluxDB injoignable")
print(f"Bucket: {INFLUX_BUCKET}")

print(is_reachable)

Client InfluxDB connecté
Bucket: animal-tracking
True


## Partie 2 : Chargement et Exploration des Données

In [10]:
import os

# Kaggle handle of the Movebank animal tracking dataset
path = "pulkit8595/movebank-animal-tracking"
print("Downloading dataset from Kaggle...")
dataset_path = kagglehub.dataset_download(path)
print(f"Dataset downloaded to: {dataset_path}")

# Collect (name, size in KB) for every entry of the download directory,
# then print the listing.
print("\nFiles in dataset:")
listing = [
    (name, os.path.getsize(os.path.join(dataset_path, name)) / 1024)
    for name in os.listdir(dataset_path)
]
for entry_name, size_kb in listing:
    print(f"  - {entry_name} ({size_kb:.2f} KB)")

Downloading dataset from Kaggle...
Downloading from https://www.kaggle.com/api/v1/datasets/download/pulkit8595/movebank-animal-tracking?dataset_version_number=1...


100%|██████████| 2.26M/2.26M [00:00<00:00, 25.8MB/s]

Extracting files...
Dataset downloaded to: /home/jovyan/.cache/kagglehub/datasets/pulkit8595/movebank-animal-tracking/versions/1

Files in dataset:
  - migration_original.csv (21766.21 KB)





In [None]:
# Location of the CSV inside the downloaded dataset directory
csv_path = os.path.join(dataset_path, "migration_original.csv")
df = pd.read_csv(csv_path)

# Confirm the load and report the frame's dimensions
print("Dataset loaded into DataFrame", f"DataFrame shape: {df.shape}", sep="\n")

# Last expression of the cell: preview of the first rows
df.head()

Dataset loaded into DataFrame
DataFrame shape: (89867, 15)


Unnamed: 0,event-id,visible,timestamp,location-long,location-lat,manually-marked-outlier,visible.1,sensor-type,individual-taxon-canonical-name,tag-local-identifier,individual-local-identifier,study-name,ECMWF Interim Full Daily Invariant Low Vegetation Cover,NCEP NARR SFC Vegetation at Surface,ECMWF Interim Full Daily Invariant High Vegetation Cover
0,1082620685,True,2009-05-27 14:00:00.000,24.58617,61.24783,,True,gps,Larus fuscus,91732,91732A,Navigation experiments in lesser black-backed ...,0.039229,,0.960771
1,1082620686,True,2009-05-27 20:00:00.000,24.58217,61.23267,,True,gps,Larus fuscus,91732,91732A,Navigation experiments in lesser black-backed ...,0.040803,,0.959197
2,1082620687,True,2009-05-28 05:00:00.000,24.53133,61.18833,,True,gps,Larus fuscus,91732,91732A,Navigation experiments in lesser black-backed ...,0.052201,,0.947799
3,1082620688,True,2009-05-28 08:00:00.000,24.582,61.23283,,True,gps,Larus fuscus,91732,91732A,Navigation experiments in lesser black-backed ...,0.040818,,0.959182
4,1082620689,True,2009-05-28 14:00:00.000,24.5825,61.23267,,True,gps,Larus fuscus,91732,91732A,Navigation experiments in lesser black-backed ...,0.040753,,0.959247


In [None]:
# Overview of the dataset: structure, summary statistics, missing values.
print("Informations sur le dataset:")
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()
print("\n Statistiques descriptives:")
print(df.describe())
print("\n Valeurs manquantes:")
print(df.isnull().sum())

📈 Informations sur le dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89867 entries, 0 to 89866
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   event-id                                                  89867 non-null  int64  
 1   visible                                                   89867 non-null  bool   
 2   timestamp                                                 89867 non-null  object 
 3   location-long                                             89867 non-null  float64
 4   location-lat                                              89867 non-null  float64
 5   manually-marked-outlier                                   0 non-null      float64
 6   visible.1                                                 89867 non-null  bool   
 7   sensor-type                                               89867 non-null

## Partie 3 : Nettoyage des Données

In [13]:
# Drop the columns that carry no data at all (every value is null).
columns_to_drop = df.columns[df.isnull().all()].tolist()
df_clean = df.drop(columns=columns_to_drop)

# Count what is actually still missing rather than assuming the result is
# complete: removing all-null columns does not touch partially-null ones.
remaining_missing = int(df_clean.isnull().sum().sum())

print(f"Colonnes supprimées: {columns_to_drop}")
print(f"Dataset nettoyé: {df_clean.shape[0]:,} lignes × {df_clean.shape[1]} colonnes ({remaining_missing:,} valeurs manquantes)")

Colonnes supprimées: ['manually-marked-outlier', 'NCEP NARR SFC Vegetation at Surface']
Dataset nettoyé: 89,867 lignes × 13 colonnes (0 valeurs manquantes)


## Partie 4 : Insertion dans InfluxDB

In [None]:
# Columns stored as InfluxDB tags (identifiers and categories).
# Compared with the previous list:
#   * 'species' and 'ECMWF Interim Full Daily Invariant Tree Cover' were
#     removed because df_clean has no such columns;
#   * 'ECMWF Interim Full Daily Invariant Low Vegetation Cover' was removed
#     because it is a continuous measurement; it is now written as a field
#     like the other numeric columns.
tag_columns = [
    'individual-local-identifier',
    'sensor-type',
    'individual-taxon-canonical-name',
    'tag-local-identifier',
    'study-name',
]

# Fail loudly if a tag name does not match a real column, instead of writing
# points that silently lack that tag.
missing_tags = [column for column in tag_columns if column not in df_clean.columns]
if missing_tags:
    raise KeyError(f"Tag columns missing from df_clean: {missing_tags}")

# The client takes each point's time from the DataFrame index, so the frame is
# indexed by the parsed observation time rather than the default RangeIndex.
# NOTE(review): the CSV timestamps carry no timezone; they are interpreted as
# UTC here -- confirm against the dataset's documentation.
points_df = (
    df_clean
    .assign(timestamp=pd.to_datetime(df_clean['timestamp'], utc=True))
    .set_index('timestamp')
)

write_api.write(
    bucket=INFLUX_BUCKET,
    org=INFLUX_ORG,
    record=points_df,
    data_frame_measurement_name='BIRD-MIGRATION',
    data_frame_tag_columns=tag_columns,
)

Index(['event-id', 'visible', 'timestamp', 'location-long', 'location-lat',
       'visible.1', 'sensor-type', 'individual-taxon-canonical-name',
       'tag-local-identifier', 'individual-local-identifier', 'study-name',
       'ECMWF Interim Full Daily Invariant Low Vegetation Cover',
       'ECMWF Interim Full Daily Invariant High Vegetation Cover'],
      dtype='object')
