#The dataset consists of two different types of files: compressed CSV
#files containing video-level data and JSON files containing category
#metadata. Only the CSV files were concatenated into a single DataFrame,
#as required by the assignment. The JSON files were read separately and
#used later for category mapping and validation.


import pandas as pd
import numpy as np
import glob


In [7]:
#The dataset consists of two different types of files: compressed CSV
#files containing video-level data and JSON files containing category
#metadata. Only the CSV files were concatenated into a single DataFrame,
#as required by the assignment. The JSON files were read separately and
#used later for category mapping and validation.


import pandas as pd
import numpy as np
import glob


In [8]:
#This cell retrieves the list of all compressed CSV files containing
#YouTube trending videos data for different countries.
                            
csv_files = glob.glob("trendingYT/*videos.csv.zst")
csv_files


['trendingYT/USvideos.csv.zst',
 'trendingYT/FRvideos.csv.zst',
 'trendingYT/MXvideos.csv.zst',
 'trendingYT/INvideos.csv.zst',
 'trendingYT/KRvideos.csv.zst',
 'trendingYT/JPvideos.csv.zst',
 'trendingYT/DEvideos.csv.zst',
 'trendingYT/CAvideos.csv.zst',
 'trendingYT/RUvideos.csv.zst',
 'trendingYT/GBvideos.csv.zst']

In [9]:
#This cell reads a single compressed CSV file corresponding to the
#United States (`USvideos.csv.zst`) using Pandas. The purpose of this
#step is to verify that the `.csv.zst` format can be correctly read
#within the Jupyter Notebook environment and to inspect the structure
#of the dataset, including columns and sample rows.

df_test = pd.read_csv("trendingYT/USvideos.csv.zst", compression="zstd")
df_test.head()


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


In [10]:
#This cell reads all country-specific compressed CSV files containing
#YouTube trending videos data. For each file, the country code is
#extracted from the file name and added as a new column called `country`.
#All individual DataFrames are then concatenated into a single unified
#DataFrame named `data`. This allows subsequent analyses to be performed
#across all countries while preserving the origin of each record.

dfs = []

for file in csv_files:
    country = file.split("/")[-1][:2]
    
    df = pd.read_csv(
        file,
        compression="zstd",
        encoding="latin1"
    )
    
    df["country"] = country
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)


In [11]:
data.shape
data.head(5)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,US
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",US
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO â¶ \n\nSUBSCRIBE âº ...,US
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,US
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,US


In [12]:
data["country"].unique()


array(['US', 'FR', 'MX', 'IN', 'KR', 'JP', 'DE', 'CA', 'RU', 'GB'],
      dtype=object)

In [13]:
data["country"].value_counts()


country
US    40949
CA    40881
DE    40840
RU    40739
FR    40724
MX    40451
GB    38916
IN    37352
KR    34567
JP    20523
Name: count, dtype: int64

In [14]:
data.sample(10)[["video_id", "country"]]


Unnamed: 0,video_id,country
113304,MH6WWadfinY,MX
92734,ASRKNMTYFvY,MX
273220,2YnVLR2OnSc,CA
346951,B-t4bU3-lHA,GB
5000,9v_rtaye2yY,US
362833,1SnWTW6YXbE,GB
356334,aMGQgC5yV-o,GB
349984,vEhoc_zGeN0,GB
60710,gwn65SVDiLU,FR
313935,PzlUeMPBczo,RU


In [15]:
type(data)


pandas.core.frame.DataFrame

## Data Cleaning and Preprocessing

Before performing any analysis, the dataset was cleaned to ensure
consistency, correctness, and reliability of the results. The cleaning
process addressed missing values, inconsistent data types, formatting
issues in date and time columns, and potential division-by-zero errors
in engagement metrics.

## Data Cleaning and Preprocessing

Before performing any analysis, the dataset was cleaned to ensure
consistency, correctness, and reliability of the results. The cleaning
process addressed missing values, inconsistent data types, formatting
issues in date and time columns, and potential division-by-zero errors
in engagement metrics.

In [16]:
#This cell checks for missing values in each column of the dataset.
#Identifying missing values is a necessary first step before deciding
#how to handle them.
data.isna().sum()


video_id                      0
trending_date                 0
title                         0
channel_title                 0
category_id                   0
publish_time                  0
tags                          0
views                         0
likes                         0
dislikes                      0
comment_count                 0
thumbnail_link                0
comments_disabled             0
ratings_disabled              0
video_error_or_removed        0
description               19478
country                       0
dtype: int64

In [17]:
#Question 2
#Extract all videos that have no tag
videos_without_tags = data[
    (data["tags"].isna()) | (data["tags"] == "[none]")
]


In [18]:
videos_without_tags.shape


(37698, 17)

In [19]:
videos_without_tags.head()  


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country
42,NZFhMSgbKKM,17.14.11,Dennis Smith Jr. and LeBron James go back and ...,Ben Rohrbach,17,2017-11-13T15:11:00.000Z,[none],945,7,5,8,https://i.ytimg.com/vi/NZFhMSgbKKM/default.jpg,False,False,False,,US
97,xfmipNU4Odc,17.14.11,Edna's registered owner thought she was dead f...,Hope For Paws - Official Rescue Channel,15,2017-11-10T18:02:04.000Z,[none],284666,16396,81,949,https://i.ytimg.com/vi/xfmipNU4Odc/default.jpg,False,False,False,Please donate $5 and help us save more lives:\...,US
133,X7flefV8tec,17.14.11,"President Bill Clinton On Dictators, Democracy...",Team Coco,24,2017-11-09T02:37:49.000Z,[none],366180,4364,4448,1997,https://i.ytimg.com/vi/X7flefV8tec/default.jpg,False,False,False,#ConanNYC Highlight: President Clinton talks a...,US
136,5x1FAiIq_pQ,17.14.11,Alicia Keys - When You Were Gone,Alicia Keys,10,2017-11-09T15:49:21.000Z,[none],95944,1354,181,117,https://i.ytimg.com/vi/5x1FAiIq_pQ/default.jpg,False,False,False,Find out more in The Vault: http://bit.ly/AK_A...,US
175,4d07RXYLsJE,17.14.11,Sphaera - demonstrating interaction,Jenny Hanell,28,2017-11-04T20:48:16.000Z,[none],1827,3,0,2,https://i.ytimg.com/vi/4d07RXYLsJE/default.jpg,False,False,False,,US


In [20]:
#Question 3: For each channel, determine the total number of views
views_per_channel = (
    data
    .groupby("channel_title")["views"]
    .sum()
    .reset_index()
)


In [21]:
views_per_channel.shape


(37824, 2)

In [49]:
views_per_channel.sort_values("views", ascending=False).head(100)


Unnamed: 0,channel_title,views
4564,ChildishGambinoVEVO,11016766510
15536,Marvel Entertainment,10430605449
17726,NickyJamTV,9479859505
18466,Ozuna,8623329509
28412,ibighit,8205572221
...,...,...
3762,BuzzFeedVideo,1010981583
4792,Clash Royale,999776208
4471,Charlie Puth,991674812
2790,Bad Lip Reading,961487305


In [23]:
views_per_channel[
    views_per_channel["channel_title"] == "Marvel Entertainment"
]

Unnamed: 0,channel_title,views
15536,Marvel Entertainment,10430605449


In [24]:
#This cell identifies videos with disabled comments, disabled ratings,
#or videos that are unavailable due to errors or removal. These rows are
#stored in a separate DataFrame named `excluded` and removed from the
#main dataset to ensure that subsequent analyses are performed only on
#valid and fully interactive videos.

excluded = data[
    (data["comments_disabled"] == True) |
    (data["ratings_disabled"] == True) |
    (data["video_error_or_removed"] == True)
]

data = data.drop(excluded.index)

In [25]:
excluded.shape


(13657, 17)

In [26]:
data.shape


(362285, 17)

In [27]:
data[
    (data["comments_disabled"]) |
    (data["ratings_disabled"]) |
    (data["video_error_or_removed"])
].shape


(0, 17)

In [28]:
#A new column `like_ratio` was added to the dataset, representing the
#ratio between the number of likes and dislikes. To avoid division by
#zero, the ratio is computed only when the number of dislikes is greater
#than zero; otherwise, the value is set to NaN.

data["like_ratio"] = np.where(
    data["dislikes"] > 0,
    data["likes"] / data["dislikes"],
    np.nan
)


In [29]:
data.columns


Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'country', 'like_ratio'],
      dtype='object')

In [48]:
data[["likes", "dislikes", "like_ratio"]].head()


Unnamed: 0,likes,dislikes,like_ratio
0,57527,2966,19.395482
1,97185,6146,15.812724
2,146033,5339,27.352126
3,10172,666,15.273273
4,132235,1989,66.483157


In [31]:
#A total of 2,390 videos have undefined like-to-dislike ratios due to
#having zero dislikes. These cases were handled by assigning NaN values
#to avoid division by zero and misleading infinite ratios.

data["like_ratio"].isna().sum()


np.int64(2390)

In [32]:
#Question 6
#Cluster the publish time into 10-minute intervals (e.g. from 02:20 to 02:30)
#Step 1: publish_time ko datetime mein convert karo
data["publish_time"] = pd.to_datetime(data["publish_time"])
#Step 2: 10-minute interval create karo
data["publish_interval"] = data["publish_time"].dt.floor("10min")
#Interval ko readable banana (optional but nice)
data["publish_interval"] = data["publish_interval"].dt.strftime("%H:%M")

In [33]:
data[["publish_time", "publish_interval"]].head(20)


Unnamed: 0,publish_time,publish_interval
0,2017-11-13 17:13:01+00:00,17:10
1,2017-11-13 07:30:00+00:00,07:30
2,2017-11-12 19:05:24+00:00,19:00
3,2017-11-13 11:00:04+00:00,11:00
4,2017-11-12 18:01:41+00:00,18:00
5,2017-11-13 19:07:23+00:00,19:00
6,2017-11-12 05:37:17+00:00,05:30
7,2017-11-12 21:50:37+00:00,21:50
8,2017-11-13 14:00:23+00:00,14:00
9,2017-11-13 13:45:16+00:00,13:40


In [34]:
#Question 7
#For each interval, determine the number of videos, average number of likes and of dislikes

#This cell groups videos by their 10-minute publish intervals and
#computes the number of videos, along with the average number of likes
#and dislikes for each interval.

interval_stats = (
    data
    .groupby("publish_interval")
    .agg(
        video_count=("video_id", "count"),
        avg_likes=("likes", "mean"),
        avg_dislikes=("dislikes", "mean")
    )
    .reset_index()
)

In [35]:
interval_stats.head()


Unnamed: 0,publish_interval,video_count,avg_likes,avg_dislikes
0,00:00,2826,62819.249823,3902.800425
1,00:10,1424,20898.900281,1448.770365
2,00:20,1199,22120.714762,1109.57548
3,00:30,1557,38038.324342,981.208735
4,00:40,1217,43956.824979,1988.88332


In [36]:
interval_stats.sort_values("video_count", ascending=False).head()


Unnamed: 0,publish_interval,video_count,avg_likes,avg_dislikes
96,16:00,12366,57097.226589,2312.168688
102,17:00,12239,37159.159735,2085.544734
90,15:00,10492,88187.461971,2153.468262
84,14:00,8356,74172.2354,3420.726185
78,13:00,7304,49942.799151,1763.955367


In [37]:
#Question 8
#For each tag, determine the number of videos

#Step 1: [none] Remove tags 
tags_df = data[data["tags"] != "[none]"].copy()
#Step 2: Split the tags into list
tags_df["tags"] = tags_df["tags"].str.split("|")
#Step 3: Make a separate row for each tag
tags_df = tags_df.explode("tags")
#Step 4: Count videos per tag
tag_counts = (
    tags_df
    .groupby("tags")["video_id"]
    .count()
    .reset_index(name="video_count")
)


#Videos with no tags were excluded from this analysis to avoid
#miscounting non-informative tag values.

In [38]:
tag_counts.sort_values("video_count", ascending=False).head(100)

Unnamed: 0,tags,video_count
336367,"""funny""",14834
277331,"""comedy""",11900
12345,"""2018""",10567
443877,"""news""",5653
436001,"""music""",5544
...,...,...
663875,"""Ð½Ð¾Ð²Ð¾ÑÑÐ¸""",1580
544993,"""top""",1569
388715,"""kids""",1559
322868,"""fail""",1508


In [39]:
tag_counts.shape

(868445, 2)

In [40]:
#Question 9
#Find the tags with the largest number of videos

#This cell identifies the most frequently used tags by sorting the tag
#counts in descending order and selecting the top results.
top_tags = tag_counts.sort_values("video_count", ascending=False)
top_tags.head(15)


Unnamed: 0,tags,video_count
336367,"""funny""",14834
277331,"""comedy""",11900
12345,"""2018""",10567
443877,"""news""",5653
436001,"""music""",5544
560941,"""video""",5338
11561,"""2017""",5334
363385,"""humor""",4992
535015,"""television""",4099
490027,"""review""",4006


In [41]:
#Question 10
#For each (tag, country) pair, compute average ratio likes/dislikes


#To compute the average like-to-dislike ratio for each (tag, country)
#pair, videos without tags were removed and the tags column was split
#into individual tags. Each tag was expanded into its own row, allowing
#grouping by both tag and country. The average of the previously computed
#like_ratio was then calculated for each group.


import re

# Step 1: Remove rows without valid tags at video level
tags_country_df = data[
    (data["tags"].notna()) &
    (data["tags"] != "[none]")
].copy()

# Step 2: Split tags
tags_country_df["tags"] = tags_country_df["tags"].str.split("|")

# Step 3: Explode
tags_country_df = tags_country_df.explode("tags")

# Step 4: Strip whitespace
tags_country_df["tags"] = tags_country_df["tags"].str.strip()

# Step 5: Keep ONLY clean alphanumeric tags (no spaces, no symbols)
tags_country_df = tags_country_df[
    tags_country_df["tags"].str.match(r"^[A-Za-z0-9]+$")
]

# Step 6: Remove undefined like ratios
tags_country_df = tags_country_df[tags_country_df["like_ratio"].notna()]

# Step 7: Group by tag and country
tag_country_ratio = (
    tags_country_df
    .groupby(["tags", "country"])["like_ratio"]
    .mean()
    .reset_index(name="avg_like_ratio")
)

In [42]:
tag_country_ratio.head()



Unnamed: 0,tags,country,avg_like_ratio
0,0014catorce,JP,16.772325
1,01ka0,JP,40.688889
2,08282016NtflxUSCAN,CA,57.816643
3,08282016NtflxUSCAN,DE,46.431718
4,08282016NtflxUSCAN,GB,73.495884


In [43]:
tag_country_ratio.shape


(15562, 3)

In [44]:
#Question 11
#For each (trending_date, country) pair, find the video with the largest number of views

#For each combination of trending date and country, the video with the
#highest number of views was identified. This was achieved by grouping
#the data by trending_date and country, finding the index of the maximum
#number of views in each group, and selecting the corresponding rows.


# Step 1: Ensure views is numeric
data["views"] = pd.to_numeric(data["views"], errors="coerce")

# Step 2: Find max views per (trending_date, country)
idx = (
    data
    .groupby(["trending_date", "country"])["views"]
    .idxmax()
)

# Step 3: Select those rows
top_videos_per_day_country = data.loc[idx].reset_index(drop=True)

top_videos_per_day_country.head()


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country,like_ratio,publish_interval
0,6ZfuNTqbHE8,17.01.12,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,24,2017-11-29 13:26:24+00:00,"marvel|""comics""|""comic books""|""nerdy""|""geeky""|...",56367282,2157741,34078,303178,https://i.ytimg.com/vi/6ZfuNTqbHE8/default.jpg,False,False,False,There was an ideaâ¦ Avengers: Infinity War. I...,CA,63.317712,13:20
1,6ZfuNTqbHE8,17.01.12,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,24,2017-11-29 13:26:24+00:00,"marvel|""comics""|""comic books""|""nerdy""|""geeky""|...",56367282,2157737,34077,303178,https://i.ytimg.com/vi/6ZfuNTqbHE8/default.jpg,False,False,False,There was an ideaâ¦ Avengers: Infinity War. I...,DE,63.319453,13:20
2,3VbHg5fqBYw,17.01.12,Avengers: Infinity War Trailer Tease,Marvel Entertainment,24,2017-11-28 17:09:22+00:00,"marvel""|""comics""|""comic books""|""nerdy""|""geeky""...",7281189,180808,19955,21244,https://i.ytimg.com/vi/3VbHg5fqBYw/default.jpg,False,False,False,Thank you to the best fans in the universe! Ma...,FR,9.060787,17:00
3,TyHvyGVs42U,17.01.12,"Luis Fonsi, Demi Lovato - Ãchame La Culpa",LuisFonsiVEVO,10,2017-11-17 05:00:01+00:00,"Luis|""Fonsi""|""Demi""|""Lovato""|""Ãchame""|""La""|""C...",143408235,2686169,137938,144217,https://i.ytimg.com/vi/TyHvyGVs42U/default.jpg,False,False,False,âÃchame La Culpaâ disponible ya en todas ...,GB,19.473742,05:00
4,6ZfuNTqbHE8,17.01.12,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,24,2017-11-29 13:26:24+00:00,"marvel|""comics""|""comic books""|""nerdy""|""geeky""|...",56367282,2157733,34077,303178,https://i.ytimg.com/vi/6ZfuNTqbHE8/default.jpg,False,False,False,There was an ideaâ¦ Avengers: Infinity War. I...,IN,63.319336,13:20


In [45]:
top_videos_per_day_country.shape


(1967, 19)

In [99]:
top_videos_per_day_country[
    ["trending_date", "country", "title", "channel_title", "views"]
].head(10)

Unnamed: 0,trending_date,country,title,channel_title,views
0,17.01.12,CA,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,56367282
1,17.01.12,DE,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,56367282
2,17.01.12,FR,Avengers: Infinity War Trailer Tease,Marvel Entertainment,7281189
3,17.01.12,GB,"Luis Fonsi, Demi Lovato - Ãchame La Culpa",LuisFonsiVEVO,143408235
4,17.01.12,IN,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,56367282
5,17.01.12,KR,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,56370607
6,17.01.12,MX,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,56370607
7,17.01.12,RU,Avengers: Infinity War Trailer Tease,Marvel Entertainment,7281189
8,17.01.12,US,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,56367282
9,17.02.12,CA,Marvel Studios' Avengers: Infinity War Officia...,Marvel Entertainment,66637636


In [145]:
#Question 12
# Convert trending_date using the correct YouTube dataset format
data["trending_date"] = pd.to_datetime(
    data["trending_date"],
    format="%y.%d.%m",
    errors="coerce"
)

# Extract year, month, day
data["year"] = data["trending_date"].dt.year
data["month"] = data["trending_date"].dt.month
data["day"] = data["trending_date"].dt.day

# Verify
data[["trending_date", "year", "month", "day"]].head()


Unnamed: 0,trending_date,year,month,day
0,2017-11-14,2017,11,14
1,2017-11-14,2017,11,14
2,2017-11-14,2017,11,14
3,2017-11-14,2017,11,14
4,2017-11-14,2017,11,14


In [148]:
#Question 13
#For each (month, country) pair, find the video with the largest number of views

#For each combination of month and country, the video with the highest
#number of views was identified. This was done by grouping the dataset by
#month and country, finding the index of the maximum number of views in
#each group, and selecting the corresponding video records.
    
# Step 1: Ensure views is numeric
data["views"] = pd.to_numeric(data["views"], errors="coerce")

# Step 2: Find index of max views per (month, country)
idx = (
    data
    .groupby(["month", "country"])["views"]
    .idxmax()
)

# Step 3: Select those rows
top_videos_per_month_country = data.loc[idx].reset_index(drop=True)

# Check result
top_videos_per_month_country[
    ["month", "country", "title", "channel_title", "views"]
].head(20)


Unnamed: 0,month,country,title,channel_title,views
0,1,CA,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,43067983
1,1,DE,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,37728802
2,1,FR,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,37728802
3,1,GB,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,90598955
4,1,IN,"Taylor Swift - End Game ft. Ed Sheeran, Future",TaylorSwiftVEVO,42019590
5,1,KR,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,37728802
6,1,MX,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,31680160
7,1,RU,"Taylor Swift - End Game ft. Ed Sheeran, Future",TaylorSwiftVEVO,23198594
8,1,US,Bruno Mars - Finesse (Remix) [Feat. Cardi B] [...,Bruno Mars,57951412
9,2,CA,Drake - Godâs Plan,DrakeVEVO,47362934


In [149]:
top_videos_per_month_country.shape


(77, 22)

In [152]:
top_videos_per_month_country.sample(10)



Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,...,comments_disabled,ratings_disabled,video_error_or_removed,description,country,like_ratio,publish_interval,year,month,day
57,aJOTlE1K90k,2018-06-02,Maroon 5 - Girls Like You ft. Cardi B,Maroon5VEVO,10,2018-05-31 04:00:00+00:00,"Maroon|""Girls""|""Like""|""You""|""Interscope""|""Reco...",17303018,1207701,17464,...,False,False,False,Girls Like Youâ is out now.http://smarturl.i...,RU,69.153745,04:00,2018,6,2
71,FlsCjmMhFmw,2017-12-24,YouTube Rewind: The Shape of 2017 | #YouTubeRe...,YouTube Spotlight,24,2017-12-06 17:58:51+00:00,"Rewind|""Rewind 2017""|""youtube rewind 2017""|""#Y...",169884583,3312868,1753274,...,False,False,False,"YouTube Rewind 2017. Celebrating the videos, p...",GB,1.889532,17:50,2017,12,24
17,xpVfcZ0ZcFM,2018-02-19,Drake - Godâs Plan,DrakeVEVO,10,2018-02-17 05:00:01+00:00,"Drake new music|""Drake Gods Plan""|""Drake Godâ...",17059675,1431557,12850,...,False,False,False,Godâs Plan (Official Video)\n\nSong Availabl...,RU,111.405214,05:00,2018,2,19
69,FlsCjmMhFmw,2017-12-11,YouTube Rewind: The Shape of 2017 | #YouTubeRe...,YouTube Spotlight,24,2017-12-06 17:58:51+00:00,"Rewind|""Rewind 2017""|""youtube rewind 2017""|""#Y...",113876217,2811217,1470386,...,False,False,False,"YouTube Rewind 2017. Celebrating the videos, p...",DE,1.91189,17:50,2017,12,11
10,xpVfcZ0ZcFM,2018-02-22,Drake - Godâs Plan,DrakeVEVO,10,2018-02-17 05:00:01+00:00,"Drake new music|""Drake Gods Plan""|""Drake Godâ...",33591858,2152153,24525,...,False,False,False,Godâs Plan (Official Video)\n\nSong Availabl...,DE,87.753435,05:00,2018,2,22
63,xmU0s2QtaEY,2017-11-29,Swag Se Swagat Song | Tiger Zinda Hai | Salman...,YRF,10,2017-11-21 07:03:38+00:00,"Salman Khan|""Katrina Kaif""|""Tiger Zinda Hai So...",40953758,679940,69161,...,False,False,False,âºWatch the Tiger Zinda Hai Official Trailer:...,IN,9.831263,07:00,2017,11,29
73,FlsCjmMhFmw,2017-12-11,YouTube Rewind: The Shape of 2017 | #YouTubeRe...,YouTube Spotlight,24,2017-12-06 17:58:51+00:00,"Rewind|""Rewind 2017""|""youtube rewind 2017""|""#Y...",113876217,2811223,1470388,...,False,False,False,"YouTube Rewind 2017. Celebrating the videos, p...",KR,1.911892,17:50,2017,12,11
9,xpVfcZ0ZcFM,2018-02-24,Drake - Godâs Plan,DrakeVEVO,10,2018-02-17 05:00:01+00:00,"Drake new music|""Drake Gods Plan""|""Drake Godâ...",47362934,2469057,31843,...,False,False,False,Godâs Plan (Official Video)\n\nSong Availabl...,CA,77.538454,05:00,2018,2,24
54,TIE92mUvSsw,2018-06-12,Dhadak | Official Trailer | Janhvi & Ishaan | ...,Dharma Productions,1,2018-06-11 06:50:41+00:00,"Dharma|""Dharma Productions""|""Dhadak""|""Janhvi K...",15969920,308382,36675,...,False,False,False,"When two worlds collide, they become oneDhadak...",JP,8.408507,06:50,2018,6,12
68,FlsCjmMhFmw,2017-12-13,YouTube Rewind: The Shape of 2017 | #YouTubeRe...,YouTube Spotlight,24,2017-12-06 17:58:51+00:00,"Rewind|""Rewind 2017""|""youtube rewind 2017""|""#Y...",137843120,3014479,1602383,...,False,False,False,"YouTube Rewind 2017. Celebrating the videos, p...",CA,1.881247,17:50,2017,12,13


In [153]:
#Question 14
#Read all JSON files with the video categories


import json
import glob

# Step 1: Get all JSON category files
json_files = glob.glob("trendingYT/*_category_id.json")

category_dfs = []

# Step 2: Read each JSON file
for file in json_files:
    country = file.split("/")[-1][:2]
    
    with open(file, "r", encoding="utf-8") as f:
        data_json = json.load(f)
    
    # Step 3: Extract categories
    categories = data_json["items"]
    
    df_cat = pd.json_normalize(categories)
    
    # Step 4: Keep useful columns
    df_cat = df_cat[["id", "snippet.title"]]
    df_cat.columns = ["category_id", "category_name"]
    
    # Step 5: Add country
    df_cat["country"] = country
    
    category_dfs.append(df_cat)

# Step 6: Combine all categories
categories_df = pd.concat(category_dfs, ignore_index=True)

# Check result
categories_df.head()


Unnamed: 0,category_id,category_name,country
0,1,Film & Animation,IN
1,2,Autos & Vehicles,IN
2,10,Music,IN
3,15,Pets & Animals,IN
4,17,Sports,IN


In [154]:
categories_df.shape

(311, 3)

In [155]:
categories_df["country"].unique()


array(['IN', 'RU', 'JP', 'FR', 'GB', 'KR', 'MX', 'CA', 'US', 'DE'],
      dtype=object)

In [157]:
#Question 15
#For each country, determine how many videos have a category that is not assignable

#Video records were matched with category information using the
#category_id and country. Videos whose category could not be matched to
#any category in the corresponding JSON file were considered
#unassignable. The number of such videos was then counted for each
#country.

# Step 1: Ensure category_id is same type
data["category_id"] = data["category_id"].astype(str)
categories_df["category_id"] = categories_df["category_id"].astype(str)

# Step 2: Left merge videos with categories
merged = data.merge(
    categories_df,
    on=["category_id", "country"],
    how="left"
)

# Step 3: Videos with unassignable categories
unassignable = merged[merged["category_name"].isna()]

# Step 4: Count per country
unassignable_count = (
    unassignable
    .groupby("country")
    .size()
    .reset_index(name="unassignable_videos")
)

unassignable_count


Unnamed: 0,country,unassignable_videos
0,CA,69
1,DE,228
2,FR,85
3,GB,90
4,IN,41
5,JP,18
6,KR,280
7,MX,149
8,RU,1301
