In [None]:
!pip install matplotlib-venn
!pip install isodate
import os

import isodate
import pandas as pd
import requests


# Prefer the YOUTUBE_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the literal is only the placeholder fallback.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "API key")

SEARCH_URL = "https://www.googleapis.com/youtube/v3/search"
VIDEO_DETAILS_URL = "https://www.googleapis.com/youtube/v3/videos"
channel_id = "UCbp9MyKCTEww4CxEzc_Tp0Q"  # ID of the target channel

video_ids = []
next_page_token = None


# Walk the search endpoint page by page (50 results per request, newest first)
# and collect every video id of the channel until no further page is offered.
while True:
    search_params = {
        'part': 'snippet',
        'channelId': channel_id,
        'maxResults': 50,
        'order': 'date',
        'type': 'video',
        # None on the first request; requests leaves None-valued params out of the URL.
        'pageToken': next_page_token,
        'key': API_KEY,
    }
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(SEARCH_URL, params=search_params, timeout=30)
        search_data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a body that is not valid JSON: stop paginating.
        print("Erreur dans la récupération des vidéos :", exc)
        break

    # Error payloads from the API carry no 'items' key; report and stop.
    if 'items' not in search_data:
        print("Erreur dans la récupération des vidéos :", search_data)
        break

    # Skip any result that unexpectedly lacks a videoId instead of raising KeyError.
    video_ids.extend(
        item['id']['videoId']
        for item in search_data['items']
        if 'videoId' in item.get('id', {})
    )

    next_page_token = search_data.get('nextPageToken')
    if not next_page_token:
        break

print(f"Nombre de vidéos récupérées : {len(video_ids)}")

video_details = []

def format_duration(iso_duration):
    """Convert an ISO 8601 duration string into a zero-padded "MM:SS" string.

    Minutes are not wrapped into hours, so a duration of one hour or more
    yields a minutes field above 59 (e.g. 75 minutes 30 seconds -> "75:30").

    Args:
        iso_duration: Duration text handed to ``isodate.parse_duration``.

    Returns:
        The formatted duration, or "N/A" when the value cannot be parsed or
        converted to a number of seconds.
    """
    try:
        duration = isodate.parse_duration(iso_duration)
        total_seconds = int(duration.total_seconds())
    except Exception:
        # Deliberate best-effort fallback for unparseable values (such as the
        # "N/A" placeholder), but without swallowing KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` would.
        return "N/A"
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes:02}:{seconds:02}"

# Fetch the full metadata for the collected ids in batches of 50 ids per
# request, and flatten each video into one record appended to video_details.
for i in range(0, len(video_ids), 50):
    video_id_chunk = ",".join(video_ids[i:i+50])
    details_params = {
        # paidProductPlacementDetails is requested explicitly because it is the
        # part that carries the paid-promotion flag used below.
        'part': 'snippet,statistics,contentDetails,paidProductPlacementDetails',
        'id': video_id_chunk,
        'key': API_KEY,
    }
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        details_response = requests.get(VIDEO_DETAILS_URL, params=details_params, timeout=30)
        details_data = details_response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or non-JSON body: report it and move on to the next batch.
        print("Erreur dans la récupération des détails :", exc)
        continue

    # Error payloads from the API carry no 'items' key; skip this batch.
    if 'items' not in details_data:
        print("Erreur dans la récupération des détails :", details_data)
        continue

    for video in details_data['items']:
        snippet = video['snippet']
        statistics = video.get('statistics', {})
        content_details = video.get('contentDetails', {})
        placement_details = video.get('paidProductPlacementDetails', {})

        video_details.append({
            'Video ID': video['id'],
            'Title': snippet['title'],
            'Description': snippet['description'],
            'Published Date': snippet['publishedAt'],
            'Tags': ", ".join(snippet.get('tags', [])),
            # Individual counters can be absent from the response; the 'N/A'
            # placeholder is kept so the column layout stays unchanged.
            'Views': statistics.get('viewCount', 'N/A'),
            'Likes': statistics.get('likeCount', 'N/A'),
            'Comments': statistics.get('commentCount', 'N/A'),
            'Duration': format_duration(content_details.get('duration', 'N/A')),
            # Previously read contentDetails.hasCustomThumbnail, which describes
            # the thumbnail and says nothing about paid product placement.
            'Product Placement': bool(placement_details.get('hasPaidProductPlacement', False))
        })

print(f"Nombre de vidéos avec détails récupérées : {len(video_details)}")

# Build the raw dataframe from the collected records and show a preview.
if video_details:
    df = pd.DataFrame(video_details)
    print("Aperçu des données :", df.head())
    # Nothing is written to disk at this step, so the message no longer claims
    # that a CSV export took place.
    print(f"DataFrame créé avec {len(df)} lignes (aucun export CSV à cette étape).")
else:
    print("Aucune donnée à exporter.")


Collecting isodate
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate
Successfully installed isodate-0.7.2
Nombre de vidéos récupérées : 304
Nombre de vidéos avec détails récupérées : 304
Aperçu des données :       Video ID                                Title  \
0  8ggBYI-F2K0                  How Strong Is Tape?   
1  URW7XTpR8iM      We Hit 100,000,000 Subscribers!   
2  u1i_GTpuhUc  $1 vs $100,000,000 YouTuber Houses!   
3  P9jqFr3TvSE   World’s Most DANGEROUS Theme Park!   
4  LOxFNSvrU3g               How Strong is Tinfoil?   

                                         Description        Published Date  \
0                                       Subscribe ❤️  2024-12-14T09:51:09Z   
1                                       Subscribe ❤️  2024-12-11T18:14:44Z   
2  We toured the most expensive and fun YouTube H...  2024-12-09T17:25:10Z   
3                                       Subscri

In [None]:
# Print the column names, non-null counts and dtypes of the raw dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Video ID           304 non-null    object
 1   Title              304 non-null    object
 2   Description        304 non-null    object
 3   Published Date     304 non-null    object
 4   Tags               304 non-null    object
 5   Views              304 non-null    object
 6   Likes              304 non-null    object
 7   Comments           304 non-null    object
 8   Duration           304 non-null    object
 9   Product Placement  304 non-null    bool  
dtypes: bool(1), object(9)
memory usage: 21.8+ KB


In [None]:
# Give "Published Date", "Views" and "Likes" proper dtypes on a copy of df.
df_clean = df.copy()
# The collection step stores the string 'N/A' when a counter is missing, which
# makes astype(int) raise ValueError. Coerce such values to missing instead and
# use the nullable Int64 dtype so the columns stay integer-typed.
count_columns = ["Views", "Likes"]
df_clean[count_columns] = (
    df_clean[count_columns]
    .apply(pd.to_numeric, errors="coerce")
    .astype("Int64")
)
df_clean["Published Date"] = pd.to_datetime(df_clean["Published Date"])
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   Video ID           304 non-null    object             
 1   Title              304 non-null    object             
 2   Description        304 non-null    object             
 3   Published Date     304 non-null    datetime64[ns, UTC]
 4   Tags               304 non-null    object             
 5   Views              304 non-null    int64              
 6   Likes              304 non-null    int64              
 7   Comments           304 non-null    object             
 8   Duration           304 non-null    object             
 9   Product Placement  304 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), int64(2), object(6)
memory usage: 21.8+ KB


In [None]:
# Write the dataframe to CSV; index=False leaves the row index out of the file.
df_clean.to_csv("stokes_twins_videos.csv", index= False)

In [None]:
# Transferring this dataframe to BigQuery raised an error; the chosen
# workaround is to drop the "Description" column.
# errors="ignore" keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
df_clean = df_clean.drop(columns="Description", errors="ignore")
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   Video ID           304 non-null    object             
 1   Title              304 non-null    object             
 2   Published Date     304 non-null    datetime64[ns, UTC]
 3   Tags               304 non-null    object             
 4   Views              304 non-null    int64              
 5   Likes              304 non-null    int64              
 6   Comments           304 non-null    object             
 7   Duration           304 non-null    object             
 8   Product Placement  304 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), int64(2), object(5)
memory usage: 19.4+ KB


In [None]:
# Export again now that "Description" has been dropped; this overwrites the
# CSV written earlier under the same file name.
df_clean.to_csv("stokes_twins_videos.csv", index= False)