In [1]:
import pandas as pd
import numpy as np

In [2]:
df_2025 = pd.read_csv('data/top_singles_2025.csv') 
df_2024 = pd.read_csv('data/top_singles_2024.csv')
df_2023 = pd.read_csv('data/top_singles_2023.csv')
df_2022 = pd.read_csv('data/top_singles_2022.csv')
df_2021 = pd.read_csv('data/top_singles_2021.csv')
df_2020 = pd.read_csv('data/top_singles_2020.csv')


df_dict = {
    '2025': df_2025,
    '2024': df_2024,
    '2023': df_2023,
    '2022': df_2022,
    '2021': df_2021,
    '2020': df_2020
}   

In [3]:
import lyricsgenius
import requests
import json
import os
import time
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict, Counter
import pandas as pd

ACCESS_TOKEN = "HrgvWPoBG2OyDAi8X7V13cxk4x_mabMYgPFSWUsYpcokLC4oDbIA8Gt9-lInHUJB"
genius = lyricsgenius.Genius(ACCESS_TOKEN)
BASE_URL = "https://api.genius.com"

# ===== CACHE INTELLIGENT OPTIMIS√â =====
class OptimizedSongCache:
    def __init__(self, cache_file="song_cache_v2.json"):
        self.cache_file = cache_file
        self.cache = self.load_cache()
        self.stats = {"hits": 0, "misses": 0, "api_calls": 0}
        self.session = None
    
    def load_cache(self):
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except:
                return {}
        return {}
    
    def save_cache(self):
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f, ensure_ascii=False, indent=2)
    
    def get_key(self, title, artist):
        # Normalisation plus agressive pour capturer plus de variantes
        import re
        title_clean = re.sub(r'[^\w\s]', '', title.lower().strip())
        artist_clean = re.sub(r'[^\w\s]', '', artist.lower().strip())
        return f"{title_clean}|{artist_clean}"
    
    def get(self, title, artist):
        key = self.get_key(title, artist)
        if key in self.cache:
            self.stats["hits"] += 1
            return self.cache[key]
        self.stats["misses"] += 1
        return None
    
    def set(self, title, artist, data):
        key = self.get_key(title, artist)
        self.cache[key] = data
        
    def print_stats(self):
        total = self.stats["hits"] + self.stats["misses"]
        hit_rate = (self.stats["hits"] / total * 100) if total > 0 else 0
        print(f"üìä Cache: {self.stats['hits']} hits, {self.stats['misses']} misses | Hit rate: {hit_rate:.1f}%")

# ===== TRAITEMENT PAR BATCH OPTIMIS√â =====
class BatchProcessor:
    def __init__(self, cache, batch_size=10, delay=0.1):
        self.cache = cache
        self.batch_size = batch_size
        self.delay = delay
        
    def prepare_batches(self, df_dict):
        """Pr√©pare les batches en priorisant les chansons uniques"""
        # 1. Collecter toutes les chansons uniques d'abord
        unique_songs = set()
        all_requests = []
        
        for annee, df in df_dict.items():
            for idx, row in df.iterrows():
                key = self.cache.get_key(row['titre'], row['artiste'])
                if key not in unique_songs and not self.cache.get(row['titre'], row['artiste']):
                    unique_songs.add(key)
                    all_requests.append((row['titre'], row['artiste'], annee, idx))
        
        print(f"üî• {len(all_requests)} requ√™tes API uniques n√©cessaires")
        
        # 2. Diviser en batches
        batches = []
        for i in range(0, len(all_requests), self.batch_size):
            batches.append(all_requests[i:i + self.batch_size])
        
        return batches
    
    def process_batch(self, batch):
        """Traite un batch de requ√™tes"""
        results = []
        for title, artist, annee, idx in batch:
            try:
                data = get_song_details_fast(title, artist)
                self.cache.set(title, artist, data)
                results.append((title, artist, data))
                time.sleep(self.delay)  # Rate limiting l√©ger
            except Exception as e:
                print(f"‚ùå Erreur batch pour {title} - {artist}: {e}")
                results.append((title, artist, {}))
        return results

# ===== FONCTION API OPTIMIS√âE =====
def get_song_details_fast(title, artist):
    """Version optimis√©e avec moins de champs pour la vitesse"""
    song_data = {
        "producer_1": None, "producer_2": None,
        "writer_1": None, "writer_2": None,
        "release_date": None,
        "sample_type": None,
        "sample_from": None
    }

    try:
        # Recherche plus rapide avec timeout
        song = genius.search_song(title, artist)
        if not song:
            return song_data

        song_id = song.to_dict()['id']
        headers = {"Authorization": f"Bearer {ACCESS_TOKEN}"}
        
        # Timeout plus court pour √©viter les blocages
        r = requests.get(f"{BASE_URL}/songs/{song_id}", headers=headers, timeout=5)
        
        if r.status_code != 200:
            return song_data

        s = r.json()["response"]["song"]

        # Seulement les infos essentielles
        producers = [p["name"] for p in s.get("producer_artists", [])][:2]
        for i, prod in enumerate(producers, 1):
            song_data[f"producer_{i}"] = prod

        writers = [w["name"] for w in s.get("writer_artists", [])][:2]
        for i, writer in enumerate(writers, 1):
            song_data[f"writer_{i}"] = writer

        song_data["release_date"] = s.get("release_date")

        # Samples (version simplifi√©e)
        for rel in s.get("song_relationships", []):
            rel_type = rel.get("relationship_type", "").lower()
            songs = rel.get("songs", [])
            
            if songs and ("sample" in rel_type or "interpolat" in rel_type):
                sample_song = songs[0]
                title_s = sample_song.get("title", "")
                artist_s = sample_song.get("primary_artist", {}).get("name", "")
                song_data["sample_type"] = "sample" if "sample" in rel_type else "interpolation"
                song_data["sample_from"] = f"{title_s} - {artist_s}" if artist_s else title_s
                break

    except Exception as e:
        print(f"‚ö° Erreur rapide pour {title} - {artist}: {e}")

    return song_data

# ===== TRAITEMENT ULTRA-OPTIMIS√â =====
def ultra_fast_processing():
    cache = OptimizedSongCache()
    
    print("üî• ANALYSE PR√âLIMINAIRE")
    print("=" * 50)
    
    # Analyse des doublons am√©lior√©e
    all_songs = []
    song_to_rows = defaultdict(list)
    
    for annee, df in df_dict.items():
        for idx, row in df.iterrows():
            key = cache.get_key(row['titre'], row['artiste'])
            all_songs.append(key)
            song_to_rows[key].append((annee, idx, row['titre'], row['artiste']))
    
    song_counts = Counter(all_songs)
    total_songs = len(all_songs)
    unique_songs = len(song_counts)
    duplicates = total_songs - unique_songs
    
    print(f"üìä Total: {total_songs} | Uniques: {unique_songs} | Doublons: {duplicates}")
    print(f"üí° Gain th√©orique: {duplicates/total_songs*100:.1f}% de requ√™tes √©vit√©es")
    
    # Cache existant
    cached_count = sum(1 for song in song_counts.keys() if song in cache.cache)
    print(f"üíæ D√©j√† en cache: {cached_count}/{unique_songs} ({cached_count/unique_songs*100:.1f}%)")
    
    print(f"\nüöÄ TRAITEMENT PAR BATCH")
    print("=" * 50)
    
    # Traitement par batch des uniques seulement
    processor = BatchProcessor(cache, batch_size=5, delay=0.2)
    batches = processor.prepare_batches(df_dict)
    
    if batches:
        print(f"‚ö° Traitement de {len(batches)} batches...")
        for i, batch in enumerate(batches):
            print(f"   Batch {i+1}/{len(batches)}: {len(batch)} requ√™tes")
            processor.process_batch(batch)
            cache.print_stats()
            
            # Sauvegarde interm√©diaire tous les 10 batches
            if (i + 1) % 10 == 0:
                cache.save_cache()
                print("   üíæ Cache sauvegard√©")
    
    print(f"\nüìã APPLICATION DES DONN√âES")
    print("=" * 50)
    
    # Application ultra-rapide depuis le cache
    for annee, df in df_dict.items():
        print(f"üìÖ Ann√©e {annee}...")
        for idx, row in df.iterrows():
            cached_data = cache.get(row['titre'], row['artiste'])
            if cached_data:
                for col, val in cached_data.items():
                    df.at[idx, col] = val
        print(f"‚úÖ {len(df)} lignes mises √† jour")
    
    # Sauvegarde finale
    cache.save_cache()
    cache.print_stats()
    
    print(f"\nüéä TERMIN√â!")
    print(f"üìà Total API calls: {cache.stats['api_calls']}")
    print(f"üíæ Cache final: {len(cache.cache)} entr√©es")
    
    return df_dict

# ===== LANCEMENT =====
print("üéØ TRAITEMENT ULTRA-OPTIMIS√â")
print("="*60)

start_time = time.time()
result = ultra_fast_processing()
end_time = time.time()

print(f"\n‚è±Ô∏è  Temps total: {end_time - start_time:.2f} secondes")
print(f"üèÜ Vitesse moyenne: {len(df_2025)/((end_time - start_time)/60):.0f} chansons/minute")

# Exemple de r√©sultat
df_2025

üéØ TRAITEMENT ULTRA-OPTIMIS√â
üî• ANALYSE PR√âLIMINAIRE
üìä Total: 58400 | Uniques: 6316 | Doublons: 52084
üí° Gain th√©orique: 89.2% de requ√™tes √©vit√©es
üíæ D√©j√† en cache: 6316/6316 (100.0%)

üöÄ TRAITEMENT PAR BATCH
üî• 0 requ√™tes API uniques n√©cessaires

üìã APPLICATION DES DONN√âES
üìÖ Ann√©e 2025...
‚úÖ 6400 lignes mises √† jour
üìÖ Ann√©e 2024...
‚úÖ 10400 lignes mises √† jour
üìÖ Ann√©e 2023...
‚úÖ 10400 lignes mises √† jour
üìÖ Ann√©e 2022...
‚úÖ 10400 lignes mises √† jour
üìÖ Ann√©e 2021...
‚úÖ 10400 lignes mises √† jour
üìÖ Ann√©e 2020...
‚úÖ 10400 lignes mises √† jour
üìä Cache: 116800 hits, 0 misses | Hit rate: 100.0%

üéä TERMIN√â!
üìà Total API calls: 0
üíæ Cache final: 6316 entr√©es

‚è±Ô∏è  Temps total: 3.54 secondes
üèÜ Vitesse moyenne: 108574 chansons/minute


Unnamed: 0,classement,artiste,artiste_2,artiste_3,artiste_4,titre,editeur,annee,semaine,producer_1,producer_2,writer_1,writer_2,release_date,sample_type,sample_from
0,1,GAZO,,,,NANANI NANANA,BSB PROD,2025,1,Flem,Boya Blunt,Gazo,,2024-11-29,,
1,4,GIMS,,,,SOIS PAS TIMIDE,BELIEVE / PLAY TWO,2025,1,Young Bouba,,GIMS,,2024-07-19,,
2,7,ROS√â,BRUNO MARS,,,APT.,WEA / ATLANTIC RECORDS,2025,1,Bruno Mars,Cirkut,ROS√â,Amy Allen,2024-10-18,sample,NIGHT CLUB - PAGAGASSA
3,8,GIMS,DYSTINCT,,,SPIDER,BELIEVE / PLAY TWO,2025,1,Maximum Beats,Young Bouba,GIMS,DYSTINCT,2024-05-10,,
4,6,DAMSO,KALASH,,,ALPHA,TRENTE-QUATRE CENTIMES,2025,1,Shaz (FRA),Mikado972,Damso,Kalash,2024-11-15,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,171,IMAGINE DRAGONS,,,,BELIEVER,POLYDOR / INTERSCOPE RECORDS,2025,32,Mattman & Robin,,Dan Reynolds,Wayne Sermon,2017-02-01,sample,Novo Dono dos Miraculous - Venum Beats
6396,169,GUY2BEZBAR,,,,LA JEUNESSE DOR√âE,SONY MUSIC ENTERTAINMENT / BLUE MAGIC CORP,2025,32,Yung Than,,Guy2Bezbar,Killa Predator,2025-06-26,,
6397,198,W SOUND,BE√âLE,OVY ON THE DRUMS,,LA PLENA,WEA / WEA LATINA,2025,32,Ovy On The Drums,,Be√©le,The KristoMan,2025-02-19,,
6398,191,LORD HURON,PHOEBE BRIDGERS,,,THE NIGHT WE MET,PIAS FRANCE / PLAY IT AGAIN SAM,2025,32,Ben Schneider,,Ben Schneider,,2015-02-09,sample,Fallen Vulture - ‚Äãevntura


In [4]:
df_2025

Unnamed: 0,classement,artiste,artiste_2,artiste_3,artiste_4,titre,editeur,annee,semaine,producer_1,producer_2,writer_1,writer_2,release_date,sample_type,sample_from
0,1,GAZO,,,,NANANI NANANA,BSB PROD,2025,1,Flem,Boya Blunt,Gazo,,2024-11-29,,
1,4,GIMS,,,,SOIS PAS TIMIDE,BELIEVE / PLAY TWO,2025,1,Young Bouba,,GIMS,,2024-07-19,,
2,7,ROS√â,BRUNO MARS,,,APT.,WEA / ATLANTIC RECORDS,2025,1,Bruno Mars,Cirkut,ROS√â,Amy Allen,2024-10-18,sample,NIGHT CLUB - PAGAGASSA
3,8,GIMS,DYSTINCT,,,SPIDER,BELIEVE / PLAY TWO,2025,1,Maximum Beats,Young Bouba,GIMS,DYSTINCT,2024-05-10,,
4,6,DAMSO,KALASH,,,ALPHA,TRENTE-QUATRE CENTIMES,2025,1,Shaz (FRA),Mikado972,Damso,Kalash,2024-11-15,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,171,IMAGINE DRAGONS,,,,BELIEVER,POLYDOR / INTERSCOPE RECORDS,2025,32,Mattman & Robin,,Dan Reynolds,Wayne Sermon,2017-02-01,sample,Novo Dono dos Miraculous - Venum Beats
6396,169,GUY2BEZBAR,,,,LA JEUNESSE DOR√âE,SONY MUSIC ENTERTAINMENT / BLUE MAGIC CORP,2025,32,Yung Than,,Guy2Bezbar,Killa Predator,2025-06-26,,
6397,198,W SOUND,BE√âLE,OVY ON THE DRUMS,,LA PLENA,WEA / WEA LATINA,2025,32,Ovy On The Drums,,Be√©le,The KristoMan,2025-02-19,,
6398,191,LORD HURON,PHOEBE BRIDGERS,,,THE NIGHT WE MET,PIAS FRANCE / PLAY IT AGAIN SAM,2025,32,Ben Schneider,,Ben Schneider,,2015-02-09,sample,Fallen Vulture - ‚Äãevntura


In [5]:
df_2025.columns

Index(['classement', 'artiste', 'artiste_2', 'artiste_3', 'artiste_4', 'titre',
       'editeur', 'annee', 'semaine', 'producer_1', 'producer_2', 'writer_1',
       'writer_2', 'release_date', 'sample_type', 'sample_from'],
      dtype='object')