In [1]:
# project: p10
# submitter: ashik
# partner: None
# hours: 6

In [2]:
import os
import csv
import json

from collections import namedtuple
from json import JSONDecodeError


In [3]:
# Q1: What are the names of the files present in the data directory?

def list_files_in(pathname):
    """
    Return the alphabetically sorted names of the entries found directly inside pathname.

    Entries whose name starts with a "." are excluded from the result.

    pathname may be relative to the current working directory (e.g. "data")
    or absolute; an empty string lists the current working directory.
    Any error raised by os.listdir (for example FileNotFoundError when the
    directory does not exist) is propagated to the caller.
    """
    # os.path.join keeps a relative pathname anchored at "." but lets an
    # absolute pathname take over, which plain "./" + pathname did not.
    directory = os.path.join(".", pathname)

    return sorted(name for name in os.listdir(directory) if not name.startswith("."))

list_files_in("data")


['channel_ids1.json',
 'channel_ids2.json',
 'channel_ids3.json',
 'channel_ids4.json',
 'channel_ids5.json',
 'comment_data1.csv',
 'comment_data2.csv',
 'comment_data3.csv',
 'comment_data4.csv',
 'comment_data5.csv',
 'video_data.csv',
 'video_ids.json']

In [4]:
# Q2: What are the paths of all the files in the data directory?

def list_paths_in(pathname):
    """
    Build the path of every file that list_files_in reports for the pathname directory.

    Each file name is combined with pathname through os.path.join.
    Returns the list of resulting paths, in the same order as the file names.
    """
    paths = []

    for file_name in list_files_in(pathname):
        paths.append(os.path.join(pathname, file_name))

    return paths

# Paths of the files inside the "data" directory; later cells filter this list.
data_paths_list = list_paths_in("data")

data_paths_list


['data/channel_ids1.json',
 'data/channel_ids2.json',
 'data/channel_ids3.json',
 'data/channel_ids4.json',
 'data/channel_ids5.json',
 'data/comment_data1.csv',
 'data/comment_data2.csv',
 'data/comment_data3.csv',
 'data/comment_data4.csv',
 'data/comment_data5.csv',
 'data/video_data.csv',
 'data/video_ids.json']

In [5]:
# Q3: What are the paths of all the files whose name contains comment_data in our data directory?

# Match against the file name only (os.path.basename), so that a directory
# component containing "comment_data" cannot pull in unrelated files.
comment_paths = [path for path in data_paths_list if "comment_data" in os.path.basename(path)]

comment_paths


['data/comment_data1.csv',
 'data/comment_data2.csv',
 'data/comment_data3.csv',
 'data/comment_data4.csv',
 'data/comment_data5.csv']

In [6]:
# Q4: What are the paths of all the files whose name contains channel_ids in our data directory?

# Match against the file name only (os.path.basename), so that a directory
# component containing "channel_ids" cannot pull in unrelated files.
channel_paths = [path for path in data_paths_list if "channel_ids" in os.path.basename(path)]

channel_paths


['data/channel_ids1.json',
 'data/channel_ids2.json',
 'data/channel_ids3.json',
 'data/channel_ids4.json',
 'data/channel_ids5.json']

In [7]:
# Using the read_json function from lab10

def read_json(path):
    """
    Open the file at path as UTF-8 text and return the data parsed from its JSON content.
    """
    with open(path, encoding="utf-8") as json_file:
        parsed = json.load(json_file)

    return parsed


In [8]:
# Using the get_mapping function from lab10

def get_mapping(pathname):
    """
    Load the JSON file located at pathname and return the decoded data.

    When the content cannot be decoded (json.JSONDecodeError), an empty
    dictionary is returned instead of raising.
    """
    try:
        return read_json(pathname)
    except JSONDecodeError:
        return {}


In [9]:
# Q5: What is the name of the channel with the ID UCNye-wNBqNL5ZzHSJj3l8Bg?

# Load the mapping stored in the first channel file and look up one channel ID in it.
mapping = get_mapping(os.path.join("data", "channel_ids1.json"))

mapping['UCNye-wNBqNL5ZzHSJj3l8Bg']


'Al Jazeera English'

In [10]:
# Q6: How many channels do we have in the entire dataset?

# Merge the mapping of every channel file into a single dictionary; a key that
# appears in several files keeps the value from the last file processed.
channel_dict = {}

for file in channel_paths:
    mapping = get_mapping(file)
    channel_dict.update(mapping)

num_of_channels = len(channel_dict)

num_of_channels


372

In [11]:
# Comment is a namedtuple describing a single comment.
# video_id, author_id and published_at are strings; comment_length and likes are ints.

Comment = namedtuple(
    "Comment",
    ["video_id", "comment_length", "author_id", "likes", "published_at"],
)


In [12]:
# Using process_csv function for accessing data

def process_csv(filename):
    """
    Read the CSV file at filename and return all of its rows.

    The file is decoded as UTF-8. Every row is returned as a list of strings;
    the header row, when there is one, is simply the first element of the result.
    """
    # newline="" leaves newline handling to the csv module, as its
    # documentation requires, so line breaks embedded in quoted fields are
    # preserved. The with-block closes the file even if reading raises.
    with open(filename, encoding="utf-8", newline="") as csv_file:
        return list(csv.reader(csv_file))

def get_comment_data(comment_file):
    """
    Parse the comment CSV file at comment_file into a dictionary.

    Returns a dict mapping each comment ID to a Comment namedtuple holding the
    row's video_id, comment_length (int), author_id, likes (int) and published_at.

    A row is skipped when any of its fields is an empty string, when
    comment_length or likes cannot be converted to int, or when the row is too
    short to contain one of the required columns.
    """
    rows = process_csv(comment_file)
    header = rows[0]
    data_rows = rows[1:]

    # Resolve each column position once, by name, from the header row.
    id_col = header.index("comment_id")
    video_col = header.index("video_id")
    length_col = header.index("comment_length")
    author_col = header.index("author_id")
    likes_col = header.index("likes")
    published_col = header.index("published_at")

    comment_dict = {}

    for row in data_rows:
        # Rows with a missing value anywhere are discarded outright.
        if "" in row:
            continue

        try:
            comment_id = row[id_col]
            comment_dict[comment_id] = Comment(
                row[video_col],
                int(row[length_col]),
                row[author_col],
                int(row[likes_col]),
                row[published_col],
            )
        except (ValueError, IndexError):
            # Non-numeric length/likes or a truncated row: ignore the row.
            continue

    return comment_dict


In [13]:
# Q7: What is the Comment object with comment ID UgygOezB4Mvd5o6FgAt4AaABAg?

# Build the path with os.path.join rather than hardcoding the "/" separator.
comment1 = get_comment_data(os.path.join("data", "comment_data1.csv"))

comment1['UgygOezB4Mvd5o6FgAt4AaABAg']


Comment(video_id='udNXMAflbU8', comment_length=175, author_id='UCHkk7x38KWgqjQOHqsQwf0Q', likes=47, published_at='2021-10-10 17:48:38')

In [14]:
# Q8: What is the length of the comment with ID UgztIaGfqFoiGvbOdfp4AaABAg?

# Merge the comments of every comment file into one dict keyed by comment ID.
comments = {}

for file in comment_paths:
    comments.update(get_comment_data(file))
    
comments['UgztIaGfqFoiGvbOdfp4AaABAg'].comment_length


67

In [15]:
# Q9: What percentage of comments are at most 140 characters long?

# Count the comments whose length is at most 140 and express that count
# as a percentage of all the comments.
total_comments = len(comments)
comments_atmost_140 = sum(1 for entry in comments.values() if entry.comment_length <= 140)

percent = (comments_atmost_140 / total_comments) * 100

percent


86.86953042956443

In [16]:
# Q10: What is the author ID of the comment that has the highest number of likes?

# max() returns the first comment that reaches the highest like count;
# default=None covers an empty comments dictionary.
most_liked_comment = max(comments.values(), key=lambda entry: entry.likes, default=None)

if most_liked_comment is None:
    highest_likes = None
    most_liked_author_id = None
else:
    highest_likes = most_liked_comment.likes
    most_liked_author_id = most_liked_comment.author_id

most_liked_author_id


'UCIPPMRA040LQr5QPyJEbmXA'

In [17]:
# Q11: What is the most popular hour for publishing comments?

# The hour is the two characters located 8 to 6 positions from the end of
# published_at, i.e. the "HH" of a trailing "HH:MM:SS".
hours_list = [int(entry.published_at[-8:-6]) for entry in comments.values()]
unique_hours = set(hours_list)

# max() keeps the first hour that reaches the highest count; default=None
# covers the case where there are no comments at all.
most_popular_hour = max(unique_hours, key=hours_list.count, default=None)

if most_popular_hour is None:
    most_comments_per_hour = None
else:
    most_comments_per_hour = hours_list.count(most_popular_hour)

most_popular_hour


19

In [18]:
# comment_buckets maps each video ID to the list of IDs of the comments
# that were posted on that video.

comment_buckets = {}

for comment_id, entry in comments.items():
    # setdefault creates the empty bucket the first time a video ID is seen.
    comment_buckets.setdefault(entry.video_id, []).append(comment_id)


In [19]:
# Q12: How many comments does the video with ID A8rrr_w8rfk have?

len(comment_buckets['A8rrr_w8rfk'])


606

In [20]:
# Using the get_videos function from lab10

def get_videos(data_file, video_mapping_file):
    """
    Build a dictionary describing every usable video listed in data_file.

    data_file is the path of the video CSV file and video_mapping_file the path
    of the JSON file that maps video IDs to titles. The function also reads two
    module-level dictionaries: channel_dict (channel ID -> channel name) and
    comment_buckets (video ID -> list of comment IDs).

    Returns a dict mapping each video ID to a dict with the keys 'title',
    'channel_name', 'published_at', 'duration', 'category', 'views' (int),
    'tags' (list obtained by splitting on "|"), 'likes', 'dislikes',
    'ratings_enabled' and 'comments'. When likes or dislikes is empty in the
    CSV, ratings_enabled is False and both likes and dislikes are None;
    otherwise both are ints and ratings_enabled is True.

    A video is skipped when a KeyError occurs, i.e. when its ID has no title in
    the mapping, its channel ID is not in channel_dict, or it has no entry in
    comment_buckets. Other errors are not handled here: a non-numeric views,
    likes or dislikes value raises ValueError and a row that is too short
    raises IndexError.
    """
    data = process_csv(data_file)

    header = data[0]
    all_videos = data[1:]

    video_mapping = get_mapping(video_mapping_file)

    # Column positions are the same for every row, so resolve them once
    # instead of calling header.index() for each column on each video.
    video_id_idx = header.index('video_id')
    channel_id_idx = header.index('channel_id')
    published_at_idx = header.index('published_at')
    duration_idx = header.index('duration')
    category_idx = header.index('category')
    views_idx = header.index('views')
    tags_idx = header.index('tags')
    likes_idx = header.index('likes')
    dislikes_idx = header.index('dislikes')

    videos_dict = dict()

    for video in all_videos:
        try:
            key = video[video_id_idx]

            title = video_mapping[key]
            channel_name = channel_dict[video[channel_id_idx]]
            published_at = video[published_at_idx]
            duration = video[duration_idx]
            category = video[category_idx]
            views = int(video[views_idx])
            tags = video[tags_idx].split("|")
            likes = video[likes_idx]
            dislikes = video[dislikes_idx]
            # Named comment_ids so the module-level comments dict is not shadowed.
            comment_ids = comment_buckets[key]

            # Ratings count as enabled only when both counters are present.
            if likes != "" and dislikes != "":
                ratings_enabled = True
                likes = int(likes)
                dislikes = int(dislikes)

            else:
                ratings_enabled = False
                likes = None
                dislikes = None

            value = {'title': title,
                     'channel_name': channel_name,
                     'published_at': published_at,
                     'duration': duration,
                     'category': category,
                     'views': views,
                     'tags': tags,
                     'likes': likes,
                     'dislikes': dislikes,
                     'ratings_enabled': ratings_enabled,
                     'comments': comment_ids
                    }

            videos_dict[key] = value

        except KeyError:
            # Missing title, channel name or comment bucket: leave the video out.
            continue

    return videos_dict

# Build the videos dictionary (video ID -> details) from the video CSV file and the video ID JSON file.
videos = get_videos(os.path.join('data','video_data.csv'), os.path.join('data','video_ids.json'))

# Sanity check: both sample comment IDs must be listed under video fkMW60W180E.
sample = ['UgzgwN2JXxjTN4mR5954AaABAg.9TPxukUd20g9TQLnJi3RFU', 'UgzvogxMg82Kj0aW84x4AaABAg']
for s in sample:
    assert s in videos['fkMW60W180E']['comments']


In [21]:
# Q13: What is the video with ID fkMW60W180E?

videos['fkMW60W180E']


{'title': 'SWAWS | Totally Accurate Battlegrounds',
 'channel_name': 'TheRussianBadger',
 'published_at': '2021-10-12 19:01:41',
 'duration': '00:18:46',
 'category': 'Gaming',
 'views': 3172185,
 'tags': ['tot',
  'totally accurate battlegrounds',
  'tabg',
  'totally accurate battle simulator',
  'totally accurate battlegrounds gameplay',
  'totally accurate battle grounds',
  'tabg gameplay',
  'tabg game',
  'tabs',
  'totally accurate',
  'totally accurate battle royale',
  'battle royale',
  'tabg funny',
  'fortnite',
  'battlegrounds',
  'tabs battle royale',
  'new battle royale',
  'pubg',
  'totally accurate battlegrounds funny',
  'swaws',
  'swaws meme',
  'swaws russian badger',
  'tabg update',
  'tabg win',
  'tabg br',
  'free to play pc games',
  'free to play'],
 'likes': 210951,
 'dislikes': 1824,
 'ratings_enabled': True,
 'comments': ['UgzgwN2JXxjTN4mR5954AaABAg.9TPxukUd20g9TQLnJi3RFU',
  'Ugw8z3sbkSQQpgINHdl4AaABAg',
  'UgyjZAl0XlNg-gOZ0jR4AaABAg.9TPyCmI6pbl9TQAX

In [22]:
# Q14: What is the title of the video with ID gF69voHU_ys?

videos['gF69voHU_ys']['title']


'A Mukbanger Ate 1 Gallon Pickles. This Is What Happened To Her Brain.'

In [23]:
# Q15: Among the videos with more than 1 million views, what is the title of the video with the highest likes to views ratio?

highest_likes_to_views_ratio = None
highest_likes_to_views_title = None

for details in videos.values():
    # Videos without ratings have no numeric likes, and only videos above
    # one million views take part in the comparison.
    if details['ratings_enabled'] != True or details['views'] <= 1000000:
        continue

    likes_to_views_ratio = details['likes'] / details['views']

    if highest_likes_to_views_ratio is None or likes_to_views_ratio > highest_likes_to_views_ratio:
        highest_likes_to_views_ratio = likes_to_views_ratio
        highest_likes_to_views_title = details['title']

highest_likes_to_views_title


'[In the SOOP BTS ver. Season 2] Official Teaser 2'

In [24]:
# Q16: What is the author ID of the most liked comment under the video titled 'Giving Away My Beard For Charity!'?

given_title = 'Giving Away My Beard For Charity!'

most_likes_for_comment = None
author_id_of_most_liked_comment = None

for details in videos.values():
    # Only videos carrying the requested title are inspected.
    if details['title'] != given_title:
        continue

    for comment_id in details['comments']:
        entry = comments[comment_id]

        # Strict comparison: the first comment reaching the maximum is kept.
        if most_likes_for_comment is None or entry.likes > most_likes_for_comment:
            most_likes_for_comment = entry.likes
            author_id_of_most_liked_comment = entry.author_id

author_id_of_most_liked_comment


'UCRvcDpyxCVmlLI8Q2TjRpmg'

In [25]:
# Implementing bucketize function similar to P9

def bucketize(attribute, videos=videos):
    """
    Group video IDs by the value each video has for attribute.

    attribute: key of the per-video dictionaries to group by.
    videos: dict mapping video IDs to per-video dictionaries; the default is
        the module-level videos object bound when this function is defined.

    Returns a dict mapping each attribute value to the list of video IDs that
    have it. For the attributes "tags" and "comments" the value is iterated
    and every element becomes its own bucket key, so a video ID is appended
    once per element.
    """
    multi_valued = ("tags", "comments")
    buckets = {}

    for video_id, details in videos.items():
        value = details[attribute]

        # Wrap a single value in a list so both cases share the same insertion code.
        bucket_keys = value if attribute in multi_valued else [value]

        for bucket_key in bucket_keys:
            buckets.setdefault(bucket_key, []).append(video_id)

    return buckets

# Sanity check: the 'Pets & Animals' bucket must hold exactly these two video IDs, in this order.
category_dict = bucketize('category')
assert category_dict['Pets & Animals'] == ['Hz_DslzN2IA', 'AwvyrO_yM4c']


In [26]:
# Q17: Which video titles were produced by the "Corridor Crew" channel?

given_channel = 'Corridor Crew'

# Group the video IDs by channel name, then pick the requested channel's bucket.
channel_buckets = bucketize('channel_name')

given_channel_video_ids = channel_buckets[given_channel]

given_channel_video_titles = []

for video_id in given_channel_video_ids:
    given_channel_video_titles.append(videos[video_id]['title'])

given_channel_video_titles


['VFX Artists React to SHANG-CHi Bad & Great CGi',
 'VFX Artists React to Bad & Great CGi 54 (ft. SETH ROGEN)']

In [27]:
# Q18: What are the top 5 channels that have the most total comments on their videos?

# Total number of comments over all of a channel's videos, per channel name.
# The sum is computed inline rather than in a variable named total_comments,
# which the Q9 cell already uses for the overall number of comments.
channel_total_comments = {
    channel: sum(len(videos[video_id]['comments']) for video_id in video_ids)
    for channel, video_ids in channel_buckets.items()
}

# sorted() is stable, also with reverse=True, so channels with equal totals
# keep their original relative order.
sorted_total_comments = sorted(channel_total_comments.items(), key = lambda s:s[-1], reverse = True)
top_5_channel_names = [k for (k,v) in sorted_total_comments[:5]]

top_5_channel_names


['NFL', 'Minecraft', 'Saturday Night Live', 'SSSniperWolf', 'PBC ON FOX']

In [30]:
# Q19: List all the unique video titles which have Minecraft tags.

given_tag = 'minecraft'

tag_buckets = bucketize('tags')

# Case-insensitive substring match: any tag containing "minecraft" qualifies.
minecraft_tags = [tag for tag in tag_buckets if given_tag in tag.lower()]

# Titles of every video carrying one of those tags (duplicates still included).
minecraft_titles = [videos[video]['title'] for tag in minecraft_tags for video in tag_buckets[tag]]

# dict.fromkeys drops duplicates while keeping first-seen order, so the
# result is the same on every run; list(set(...)) gives strings an order
# that can change from one kernel session to the next.
minecraft_videos_list = list(dict.fromkeys(minecraft_titles))

minecraft_videos_list


['Minecraft but all Mobs are Dragons',
 'Minecraft Mobs if they were Parents',
 'Hermitcraft 8 Episode 14:  BETRAYING BOATEM',
 'I Survived 69 Hours Using Only Minecraft Tools',
 'I Built an AUTOMATIC Mob Loot Farm in Minecraft Hardcore! (#22)',
 '"Eternal" - A Minecraft Music Video ♪',
 'Jumping SQUID GAME CARS Across ENTIRE MAP In GTA 5! (Mods)',
 'Realistic Water in Minecraft...',
 'Minecraft Mobs if they were Superheroes',
 'Realistic Golem Transforming',
 'Playing SQUID GAME in Minecraft!',
 'Minecraft Manhunt but I have CUSTOM COMMANDS',
 'What Mob Should Be Added To Minecraft?',
 'She Wants Us DEAD! 💀 Dark Deception Chapter 4 (FGTeeV Telekinesis Gameplay)',
 'I Won SQUID GAME In GTA 5! (Mods)',
 'I Survived 1,900 Days in HARDCORE Minecraft...',
 'I made a custom mod in minecraft with your ideas',
 'Minecraft Mobs when you log off...',
 'I Survived 100 Days as a FOX in HARDCORE Minecraft!',
 "2b2t's Obsidian War is Getting Ridiculous",
 "Minecraft, But There's Custom End City Ite

In [29]:
# Q20: List the titles of the 5 shortest videos.

# Sort (title, duration) pairs, one per video, instead of a title -> duration
# dict: a dict would merge videos that share a title into a single entry.
# Durations are compared as strings; this matches time order assuming they are
# zero-padded 'HH:MM:SS' values — TODO confirm for the whole dataset.
sorted_title_duration = sorted(
    ((details['title'], details['duration']) for details in videos.values()),
    key = lambda s:s[-1],
)

shortest_5_video_titles = [k for (k,v) in sorted_title_duration[:5]]

shortest_5_video_titles


['When someone walks in while you using the bathroom😭',
 'Smart Appliances, Gadgets For Every Home 🤩 Versatile Utensils, Kitchen, Makeup, Beauty #Shorts',
 'Italians reacting to ketchup on pasta 😱🇮🇹 #shorts',
 'My hidden talent #shorts',
 'Knot']