In [3]:
# project: p11
# submitter: ashik
# partner: None
# hours: 6

In [37]:
# import statements

import os
import csv
import json
import copy
import pandas
import matplotlib

from collections import namedtuple
from datetime import datetime as dt #this is for a time_delta only
from questions import view_plot_data, verify_bar, verify_scatter


In [79]:
%matplotlib inline

In [80]:
## From lab-p9

def plot_dict(d, label="Please Label Me!!!"):
    """
    Draw the entries of `d` as a black bar chart, one bar per key, with the
    bars ordered by sorted key and the y-axis labelled with `label`.
    """
    series = pandas.Series(d)
    ordered = series.sort_index()
    axes = ordered.plot.bar(color="black", fontsize=16)
    axes.set_ylabel(label, fontsize=16)

    
## From lab-p10

def process_csv(filename):
    """
    Read the UTF-8 encoded CSV file at `filename` and return its contents.

    Returns a list of rows, each row being a list of strings. The header row,
    if the file has one, is simply the first element of the returned list.
    """
    # `with` closes the handle even if parsing raises part-way through.
    # newline="" hands line-ending handling to the csv module, which is what
    # the csv documentation requires so that newlines embedded inside quoted
    # fields are kept intact.
    with open(filename, encoding="utf-8", newline="") as csv_file:
        return list(csv.reader(csv_file))


## From lab-p10

def read_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return the decoded value."""
    with open(path, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
    

## From lab-p10

def list_files_in(pathname):
    """
    Return the alphabetically sorted names of the entries found at the
    (relative) path `pathname`, leaving out any name that begins with ".".
    """
    visible_names = (
        name
        for name in os.listdir("./" + pathname)
        if not name.startswith(".")
    )
    return sorted(visible_names)


## From lab-p10

def list_paths_in(pathname):
    """
    Return one relative path per entry reported by list_files_in(pathname),
    each built by joining `pathname` and the entry name with os.path.join.
    The order of the result follows the order returned by list_files_in.
    """
    relative_paths = []

    for name in list_files_in(pathname):
        relative_paths.append(os.path.join(pathname, name))

    return relative_paths


## From p10 (Q2)
## Use `list_paths_in` to list all paths in the data directory
## Save this to a variable called all_paths

# Relative paths of the entries of the "data" directory (dot-prefixed names excluded).
all_paths = list_paths_in("data")


## From p10 (Q3)
## `comment_paths`: the paths whose text contains "comment_data"

comment_paths = [
    candidate for candidate in all_paths
    if "comment_data" in candidate
]


## From p10 (Q4)
## `channel_paths`: the paths whose text contains "channel_ids"

channel_paths = [
    candidate for candidate in all_paths
    if "channel_ids" in candidate
]


# Using the get_mapping function from lab10

def get_mapping(pathname):
    """
    Load and return the JSON data stored at `pathname`.
    When the file's contents cannot be decoded (json.JSONDecodeError),
    an empty dictionary is returned instead.
    """
    try:
        return read_json(pathname)
    except json.JSONDecodeError:
        # Undecodable file: fall back to an empty mapping.
        return {}


## From p10 (Q6)
## Code to populate `channel_dict` 

# Single mapping merged from every file in `channel_paths`; get_videos looks
# entries up by a row's channel_id and stores the result as the channel name.
channel_dict = {}

# dict.update means a later file wins when the same key appears twice.
# A file that fails JSON decoding contributes nothing, because get_mapping
# returns an empty dictionary for it.
for file in channel_paths:
    channel_dict.update(get_mapping(file))

    
# Creating a namedtuple to represent a Comment that has the following attributes...
# video_id, author_id, and published_at are strings and comment_length and likes are ints. 

# Field names given as one whitespace-separated string; order defines the positional layout.
Comment = namedtuple("Comment", "video_id comment_length author_id likes published_at")


## From p10 (Q7)

def get_comment_data(comment_file):
    """
    Parse the comment CSV at `comment_file` into a dictionary that maps each
    comment_id to a Comment namedtuple.

    A row is skipped when any of its fields is an empty string, when its
    comment_length or likes value cannot be converted to int, or when it is
    too short to contain every needed column.
    """
    rows = process_csv(comment_file)
    header, body = rows[0], rows[1:]

    # Locate each needed column once, by name.
    id_col = header.index("comment_id")
    video_col = header.index("video_id")
    length_col = header.index("comment_length")
    author_col = header.index("author_id")
    likes_col = header.index("likes")
    published_col = header.index("published_at")

    parsed = {}

    for row in body:
        # Any blank field disqualifies the whole row.
        if "" in row:
            continue

        try:
            comment_id = row[id_col]
            record = Comment(
                row[video_col],
                int(row[length_col]),
                row[author_col],
                int(row[likes_col]),
                row[published_col],
            )
        except (ValueError, IndexError):
            # Non-numeric length/likes or a truncated row: drop it.
            continue

        parsed[comment_id] = record

    return parsed


## From p10 (Q8)
## Code to populate `comments` 

# All parsed comments from every file in `comment_paths`, keyed by comment id.
comments = {}

for file in comment_paths:
    comments.update(get_comment_data(file))


## From p10 (Q12)
## `comment_buckets`: video id -> list of the comment ids posted under that video

comment_buckets = {}

for comment in comments:
    video_id = comments[comment].video_id
    # setdefault creates the list the first time a video id is encountered.
    comment_buckets.setdefault(video_id, []).append(comment)


## From p10

def get_videos(data_file, video_mapping_file):
    """
    Build a dictionary mapping each video ID to a dictionary of that video's details.

    Parameters:
        data_file: path to the video CSV. Its header row must contain the
            columns video_id, channel_id, published_at, duration, category,
            views, tags, likes and dislikes.
        video_mapping_file: path to a JSON file that is indexed by video ID
            to obtain the video's title.

    Returns:
        A dict of video ID -> dict with the keys 'title', 'channel_name',
        'published_at', 'duration', 'category', 'views' (int), 'tags' (list
        obtained by splitting on "|"), 'likes', 'dislikes', 'ratings_enabled'
        and 'comments' (the list stored in `comment_buckets` for that video).
        When either the likes or dislikes field is empty, 'ratings_enabled'
        is False and both 'likes' and 'dislikes' are None; otherwise both are
        converted to int.

    Rows are skipped (KeyError) when the video ID is missing from the title
    mapping, the channel ID is missing from `channel_dict`, or the video ID
    has no entry in `comment_buckets`.

    Raises:
        ValueError: if a required column is absent from the header, or if
            views / likes / dislikes contain text that is not an integer.
    """
    data = process_csv(data_file)

    header = data[0]
    all_videos = data[1:]

    video_mapping = get_mapping(video_mapping_file)

    # Column positions are identical for every row, so resolve them once here
    # rather than rescanning the header for each field of each row.
    video_id_idx = header.index('video_id')
    channel_id_idx = header.index('channel_id')
    published_at_idx = header.index('published_at')
    duration_idx = header.index('duration')
    category_idx = header.index('category')
    views_idx = header.index('views')
    tags_idx = header.index('tags')
    likes_idx = header.index('likes')
    dislikes_idx = header.index('dislikes')

    videos_dict = {}

    for video in all_videos:
        try:
            key = video[video_id_idx]

            title = video_mapping[key]
            channel_name = channel_dict[video[channel_id_idx]]
            published_at = video[published_at_idx]
            duration = video[duration_idx]
            category = video[category_idx]
            views = int(video[views_idx])
            tags = video[tags_idx].split("|")
            likes = video[likes_idx]
            dislikes = video[dislikes_idx]
            # Named comment_ids so the module-level `comments` dict is not shadowed.
            comment_ids = comment_buckets[key]

        except KeyError:
            # Missing title, channel name, or comment bucket: skip this row.
            continue

        # Ratings count as enabled only when both counts are present.
        if likes != "" and dislikes != "":
            ratings_enabled = True
            likes = int(likes)
            dislikes = int(dislikes)
        else:
            ratings_enabled = False
            likes = None
            dislikes = None

        videos_dict[key] = {
            'title': title,
            'channel_name': channel_name,
            'published_at': published_at,
            'duration': duration,
            'category': category,
            'views': views,
            'tags': tags,
            'likes': likes,
            'dislikes': dislikes,
            'ratings_enabled': ratings_enabled,
            'comments': comment_ids,
        }

    return videos_dict


## From p10 (Q13): create a dictionary named `videos` by calling `get_videos`
## on the video data CSV and the video ID mapping JSON.
## The `videos` variable is used to answer the questions that follow.

# Build the video dictionary from the CSV of video rows and the JSON file
# that get_videos indexes by video ID to obtain titles.
videos = get_videos(os.path.join('data','video_data.csv'), os.path.join('data','video_ids.json'))

# Sanity check: both sample comment ids must be listed under video 'fkMW60W180E'.
sample = ['UgzgwN2JXxjTN4mR5954AaABAg.9TPxukUd20g9TQLnJi3RFU', 'UgzvogxMg82Kj0aW84x4AaABAg']
for s in sample:
    assert s in videos['fkMW60W180E']['comments']
    

## From p10 (Q17)
## Make sure to remove default argument to videos. If you want to retain it,
## you will have to define get_videos function before bucketize and make sure
## you call get_videos and store the return value into the variable videos

# Implementing bucketize function similar to P9

def bucketize(attribute, videos = videos):
    """
    Group video ids by the value each video holds for `attribute`.

    Input: an attribute string and a dict of videos (video id -> details dict).
    Return: a dictionary mapping each attribute value to the list of video ids
    that have it. For the list-valued attributes "tags" and "comments", a
    video id is appended once for every item in its list.
    """
    multi_valued = ("tags", "comments")
    buckets = {}

    for video_id in videos:
        value = videos[video_id][attribute]

        # Treat a single value as a one-item collection so both cases share one loop.
        bucket_keys = value if attribute in multi_valued else [value]

        for bucket_key in bucket_keys:
            buckets.setdefault(bucket_key, []).append(video_id)

    return buckets

# Group video ids by category using the default `videos` dictionary.
category_dict = bucketize('category')
# Sanity check on one bucket; the comparison is order-sensitive because the values are lists.
assert category_dict['Pets & Animals'] == ['Hz_DslzN2IA', 'AwvyrO_yM4c']


In [1]:
# Function requirement

def process_duration(duration_str):
    """
    Convert a duration string laid out as 'HH:MM:SS' into the lower bound,
    in minutes, of the 5-minute bucket it falls in (e.g. '00:28:21' -> 25).

    The hour, minute and second parts are read from fixed character
    positions 0-1, 3-4 and 6-7 of the string.
    """
    hours = int(duration_str[:2])
    minutes = int(duration_str[3:5])
    seconds = int(duration_str[6:8])

    total_seconds = hours * 3600 + minutes * 60 + seconds

    # 300 seconds per bucket; multiplying by 5 expresses the bucket start in minutes.
    bucket_number = total_seconds // 300
    return bucket_number * 5

# Example calls. The results are not stored; in a notebook cell only the
# value of the final expression is displayed as the cell output.
process_duration('00:04:12') #returns 0
process_duration('00:28:21') #returns 25

25

In [2]:
#Q1: Plot the distribution of video durations as a histogram.



In [3]:
#Q1.1 Verify your plot using verify_bar



In [None]:
#Q2: Plot the correlation between duration of video (x-axis) and average views (y-axis) as a histogram.


In [None]:
#Q2.1 Verify your plot using verify_bar



In [None]:
#Q3: Plot the correlation between the hour at which the video is published (x-axis) and the average number of views (y-axis) across all videos in that hour as a histogram.


In [None]:
#Q3.1 Verify your plot using verify_bar



In [None]:
#Q4: Plot the correlation between the views (x-axis) and likes (y-axis) on a video.


In [None]:
#Q4.1 Verify your plot using verify_scatter


In [None]:
#Q5: Plot the correlation between the views (x-axis) and likes (y-axis) on a video with the outliers removed.


In [None]:
#Q5.1 Verify your plot using verify_scatter


In [None]:
#Q6: Plot the correlation between the views (x-axis) and audience engagement (y-axis) on a video.


In [None]:
#Q6.1 Verify your plot using verify_scatter

In [None]:
#Q7: Plot the correlation between the views (x-axis) and audience engagement (y-axis) on a video with the outliers removed.


In [None]:
#Q7.1 Verify your plot using verify_scatter


In [None]:
#Q8: Plot the correlation between the length of a comment text (x-axis) and the number of likes on the comment (y-axis).


In [None]:
#Q8.1 Verify your plot using verify_scatter


In [None]:
#Q9: Plot the correlation between the length of a comment text (x-axis) and the number of likes on the comment (y-axis) with the outliers removed.



In [None]:
#Q9.1 Verify your plot using verify_scatter


In [None]:
#Q10: What is the length of the first comment under the video "If I lose a boss fight, the video ends 2 (Genshin Impact)"?



In [None]:
#Q11: What is the average number of likes for the first 5 comments under the video 'Facebook, Instagram and WhatsApp down in global outage'?



In [None]:
#Q12: Plot the correlation between when a comment is published relative to video publish time (x-axis) and the number of likes on the comment (y-axis).


In [None]:
#Q12.1 Verify your plot using verify_scatter


In [None]:
#Q13: Plot the correlation between when a comment is published relative to video publish time (x-axis) and the length of the comment (y-axis).


In [None]:
#Q13.1 Verify your plot using verify_scatter


In [None]:
#Q14: What are the paths of the files in the special directory of the broken_file directory?


In [None]:
#Q15: What are the paths of the files in the rest directory of the non_english directory of the broken_file directory?



In [None]:
#Q16: What are the paths of the files in the L_to_Q directory of the english_uppercase directory of the broken_file directory?



In [None]:
#Q17: What are the paths of the files in the broken_file directory?


In [None]:
#Q18: How many channel mappings are in the files in the broken_file directory?


In [None]:
#Q19: What is the name of the channel with the channel ID UCwtzSiUayocxaOHLnHe90Hw?


In [None]:
#Q20: What are the names of the top 5 channels from the broken_file with the most likes on their comments?
