# No Copyright Sounds YouTube Data Manipulation

### Import Modules

In [1]:
import json
import re
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Load/Filter Data

The code below loads the data from a CSV file, removes duplicate titles, and keeps only the rows that are YouTube videos whose title contains 'NCS Release'. It then sorts the rows by publishing time and adds columns for the artist, the song, the video URL and a 1-based index.

In [2]:
def get_data(loc):
    """Load the exported channel data and return one tidy row per NCS release.

    The CSV is de-duplicated on ``title`` (keeping the last occurrence), then
    reduced to rows whose title contains "NCS Release" and whose ``kind.1`` is
    ``youtube#video``. The remaining rows are sorted by ``publishTime`` and
    extended with derived columns.

    Parameters
    ----------
    loc : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``. The data must provide the
        columns ``title``, ``kind.1``, ``publishTime``, ``channelTitle``,
        ``channelId.1``, ``videoId`` and ``description``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``index`` (1-based position after sorting),
        ``channelTitle``, ``channelId.1``, ``publishTime``, ``title``,
        ``videoId``, ``description``, ``videoURL``, ``artist``, ``song``.
        ``song`` is missing for titles that do not match
        "<artist> - <song> [NCS Release]".

    Raises
    ------
    KeyError
        If one of the required columns is absent from the CSV.
    """
    # Load csv file from location specified
    raw = pd.read_csv(loc, index_col=None)

    # Remove duplicate titles
    deduped = raw.drop_duplicates(subset='title', keep='last')

    # Filtering for any actual releases using the title and kind of video.
    # na=False: a row with a missing title is simply "not a release"; without it
    # the mask contains NaN and cannot be used for boolean filtering.
    is_release = deduped['title'].str.contains("NCS Release", na=False)
    is_video = deduped['kind.1'] == 'youtube#video'

    # Select required columns, arrange by the publishing time, and renumber the
    # rows. The result is a fresh frame, so the derived columns below are not
    # written onto a slice of the loaded data.
    wanted_columns = ['channelTitle', 'channelId.1', 'publishTime',
                      'title', 'videoId', 'description']
    releases = (
        deduped.loc[is_release & is_video]
        .sort_values('publishTime')[wanted_columns]
        .reset_index(drop=True)
    )

    titles = releases['title']
    releases = releases.assign(
        # Create a column for the URL
        videoURL='https://www.youtube.com/watch?v=' + releases['videoId'],
        # Create a column for the artist (everything before the first ' -')
        artist=titles.str.split(' -').str.get(0),
        # Create a column for the song (text between ' - ' and ' [NCS Release]')
        song=titles.str.extract(r'^.+? - (.+?) \[NCS Release\]$', expand=False),
    )

    # Add a 1-based index column as the first column
    releases.insert(0, 'index', list(range(1, len(releases) + 1)))

    return releases

### Extract YouTube Descriptions

The code below takes a YouTube video URL, fetches the video's description, and returns only the part of the description that the channel asks you to post if you use the music. If that part cannot be found, it returns an empty string.

In [3]:
def extract_description(video_url, timeout=30):
    """Fetch a YouTube watch page and return the credit text to post with the music.

    The page source is searched for the ``shortDescription`` field, its value
    is unescaped, and the text between "put this in your description:" and the
    following "- - - ..." separator line is returned.

    Parameters
    ----------
    video_url : str
        URL of the YouTube video page.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        caller indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The required credit text, or "" when the page has no
        ``shortDescription`` field or the description has no such section.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failures, timeouts).
    """
    # This gets the response from the URL
    response = requests.get(video_url, timeout=timeout)
    # Uses BeautifulSoup to parse the url response
    page_source = str(BeautifulSoup(response.content, 'html.parser'))

    # Non-greedy so the capture stops at the first '","isCrawlable' after the
    # field; a greedy match could run into later parts of the page.
    found = re.search(r'(?<=shortDescription":").*?(?=","isCrawlable)', page_source)
    if found is None:
        return ""

    raw_description = found.group(0)
    try:
        # The captured text is the inside of a JSON string literal, so decode
        # every escape (\n, \", \u0026, ...) rather than only newlines.
        description = json.loads('"' + raw_description + '"')
    except ValueError:
        # Not valid JSON string content: fall back to unescaping newlines only
        description = raw_description.replace('\\n', '\n')

    # The pattern for the string we are aiming to pull
    desc_pattern = r'put this in your description:\n\n(.*?)\n\n- - - - - - - - - - -'
    section = re.search(desc_pattern, description, re.DOTALL)
    return section.group(1) if section else ""

### Using the Functions

In [4]:
# Build the filtered release table from the exported channel CSV
channel_data = get_data(loc="youtubedata.csv")

In [None]:
# Scrape the required credit text for every release (one HTTP request per video URL)
desc_required_list = [
    extract_description(video_url) for video_url in channel_data['videoURL']
]

# Store the scraped text next to the release it belongs to
channel_data['desc_required'] = desc_required_list

In [None]:
# Preview the first few scraped descriptions instead of dumping the whole list
desc_required_list[:5]

In [None]:
from pytube import YouTube

# Link of the video to be downloaded.
# Replace with the YouTube video link you want to download.
video_link = "https://www.youtube.com/watch?v=cb4gBO4TjSI"

try:
    video = YouTube(video_link)
    # Filtering the audio. File extension can be mp4/webm.
    # You can see all the available streams with print(video.streams).
    audio = video.streams.filter(only_audio=True, file_extension='mp4').first()
    if audio is None:
        # .first() returns None when no stream matches the filter
        print("No audio-only mp4 stream available for", video_link)
    else:
        # Saves into the current working directory; pass output_path="..." to
        # audio.download() to store the file somewhere else.
        audio.download()
        print('Download Completed!')

except Exception as exc:
    # Best effort: keep the notebook running, but report the real cause rather
    # than labelling every failure a connection error. KeyboardInterrupt is no
    # longer swallowed, so the cell can still be interrupted.
    print("Download failed:", repr(exc))
