In [1]:
# Check we have the required packages

import subprocess
import pkg_resources
import sys

def install_package(package_name):
    """Run ``pip install`` for a single package with the current interpreter.

    Args:
        package_name: Requirement string passed to pip as one argument.

    Returns:
        None. A success or failure line is printed; a non-zero pip exit
        status is reported, not raised.
    """
    # Invoke pip as a module of sys.executable so the install goes to the
    # interpreter that is running this code.
    pip_command = [sys.executable, "-m", "pip", "install", package_name]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        print(f"Failed to install {package_name}.")
    else:
        print(f"{package_name} installed successfully.")

def check_and_install_packages(packages):
    """Ensure every named distribution is installed, installing any that are missing.

    Installation status is looked up with the standard-library
    ``importlib.metadata`` instead of the deprecated ``pkg_resources`` API, so
    the check itself cannot fail with a ``pkg_resources.VersionConflict``.

    Args:
        packages: Iterable of distribution names (e.g. ``"nltk"``).

    Returns:
        None. One status line is printed per package, and
        ``install_package`` is called for each package that is not found.
    """
    # Local import keeps this function self-contained (requires Python 3.8+).
    import importlib.metadata

    for package in packages:
        try:
            # Looking up the installed version only reads metadata; the
            # package itself is not imported.
            importlib.metadata.version(package)
        except importlib.metadata.PackageNotFoundError:
            # No installed distribution with this name: install it now.
            print(f"{package} not found. Installing...")
            install_package(package)
        else:
            print(f"{package} is already installed.")

# Distributions this notebook depends on; each one is verified and installed
# if it is missing.
packages_to_check = [
    "pytube",
    "moviepy",
    "SpeechRecognition",
    "spacy",
    "nltk",
    "nrclex",
    "textblob",
    "translate",
]

check_and_install_packages(packages_to_check)


pytube is already installed.


  import pkg_resources


moviepy is already installed.
SpeechRecognition is already installed.
spacy is already installed.
nltk is already installed.
nrclex is already installed.
textblob is already installed.
translate is already installed.


In [2]:
# Standard library imports
import json
import logging
import os
import time
from datetime import datetime
from pathlib import Path
from threading import Semaphore, Lock

# Standard-library imports for concurrency and multiprocessing tasks
import concurrent.futures
import multiprocessing
import threading
from multiprocessing import Process, Manager

# NLTK and Spacy for natural language processing
from nltk import download
import spacy

# Libraries for video and audio processing
from moviepy.editor import VideoFileClip
import speech_recognition as sr

# Libraries for web content and translation
from pytube import YouTube
from translate import Translator

# Library for emotion analysis
from nrclex import NRCLex

# TextBlob for sentiment analysis
from textblob import TextBlob

# Fetch the NLTK 'punkt' data package and load spaCy's small English model into `nlp`.
# NOTE(review): 'en_core_web_sm' is not in the package check from the first cell and
# presumably has to be installed separately before spacy.load can find it — confirm.
download('punkt')
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/dt_cloud_computing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Daniel Timms

In [3]:
# Let's import our helper functions from video_processing.py
from video_processing import *

[nltk_data] Downloading package punkt to
[nltk_data]     /home/dt_cloud_computing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Let's create one output directory per pipeline stage

dirs = ["video_output", "audio_output", "text_output",
        "translated_text", "emotion_analysis", "sentiment_analysis"]

# A plain loop is used because mkdir is called only for its side effect;
# exist_ok=True makes re-running this cell harmless.
for directory in dirs:
    Path(directory).mkdir(parents=True, exist_ok=True)


## Task 1

Manually retrieve 10-15 random video URLs from YouTube.
- Save the URLs in a text file called `video_urls.txt`, where each URL should be stored on a separate line.
- Consider YouTube videos that are 2-3 minutes in duration.

In [5]:
# The 11 YouTube video URLs we want to analyse (one list entry per video)
urls = [
    "https://www.youtube.com/watch?v=lXJpVz_ig2s",
    "https://www.youtube.com/watch?v=_Wk9T_G-u4o",
    "https://www.youtube.com/watch?v=LtScY2guZpo",
    "https://www.youtube.com/watch?v=CqcEW-jyDmo",
    "https://www.youtube.com/watch?v=9D-nGIEq6Ms",
    "https://www.youtube.com/watch?v=xFS7wthXIGg",
    "https://www.youtube.com/watch?v=UQeyU0YcPKY",
    "https://www.youtube.com/watch?v=_YPScrckx28",
    "https://www.youtube.com/watch?v=aeHqYLgZP84",
    "https://www.youtube.com/watch?v=NYINnu_SWHk",
    "https://www.youtube.com/watch?v=I1FKT8yHk4k"
]

In [6]:
# Text file that will hold the URL list
file_path = 'video_urls.txt'

# Save the URLs one per line ('w' mode replaces any existing file)
with open(file_path, 'w') as url_file:
    url_file.writelines(url + '\n' for url in urls)

print(f"URLs have been saved to {file_path}")

URLs have been saved to video_urls.txt


## Task 2

Develop a Python script to read the URLs.
 - Assuming you have the text file named video_urls.txt containing the URLs of YouTube videos,
 - load it in Python and extract the URLs using your preferred data structure.

In [7]:
# Text file holding one video URL per line
file_path = 'video_urls.txt'

# Read the URLs, trimming surrounding whitespace and skipping blank lines so
# that an empty line in a hand-edited file does not become an empty URL
with open(file_path, 'r') as file:
    urls = [line.strip() for line in file if line.strip()]

print("URLs have been read")

URLs have been read


## Task 3

Develop a Python script to download the videos using their URLs.
- Test your solution by downloading the files serially.
- Use parallel programming such as multiprocessing or threading to handle downloads. Your decision will determine the best strategy.
- For testing reasons, ensure the script can download up to 5 videos simultaneously to avoid YouTube blocks.
- You are advised to use threads and semaphores to control the downloads.
- Compare serial and parallel executions for your video download script.
- Discuss the time and space complexity of your video download scripts.

In [8]:
# Directory the downloaded videos will be saved into (reused by the later download cells)
output_path = 'video_output/'

# Smoke test: run the downloader on the first URL only
youtube_downloader(urls[0], output_path)

# List the output directory; after this single download it should hold that one video's file
folder_contents = os.listdir(output_path)
print(folder_contents)


['MSc Health Data Science.mp4']


Now we can try to download all our videos serially and time how long it takes to complete. We'll record how long the entire task takes and check that we have downloaded the videos.

In [9]:
# Start the timer
start = time.perf_counter()

# Download the videos one after another. A plain loop is used because the
# call is made only for its side effect.
for url in urls:
    youtube_downloader(url, output_path)

end = time.perf_counter()

# Print the elapsed wall-clock time
print(f'downloading the video serially: {end-start} second(s)')

print('')

# List what is now in the output directory (expected: one file per URL)
folder_contents = os.listdir(output_path)
for item in folder_contents:
    print(item)

print('')

# Number of entries found in the output directory
print(f'count of videos downloaded and saved: {len(folder_contents)}')

downloading the video serially: 17.216301873999328 second(s)

What is Big Data Analytics.mp4
Day in the Life Data Scientist.mp4
Support Vector Machine (SVM) in 2 minutes.mp4
Big Data Analytics for beginners.mp4
What is Data Science - A day in the Life of a Data Scientist by IBM 4.mp4
MSc Health Data Science.mp4
Bill Squadron – How big data analytics continues to change pro sports.mp4
Meet Claire Monteleoni Editor in Chief of Environmental Data Science.mp4
Job Outlook for Data Science  UMBC.mp4
Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.mp4
What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.mp4

count of videos downloaded and saved: 11


That looks good. We can now attempt to download the files using a parallel method.


In [10]:
# Setup for concurrency control: a manager-backed semaphore initialised to the
# maximum number of simultaneous downloads
max_concurrent_downloads = 5
manager = Manager()
semaphore = manager.Semaphore(max_concurrent_downloads)

# Run the multiprocessing and threading downloaders back to back and compare times.
# NOTE(review): both helpers come from video_processing.py and are not visible here —
# presumably each one acquires the semaphore per download and reports its own timing; confirm.
download_videos_multiprocessing(urls, output_path, semaphore)
download_videos_threading(urls, output_path, semaphore)

2024-06-23 18:30:36.449 - Starting download: /watch?v=lXJpVz_ig2s
2024-06-23 18:30:36.459 - Starting download: /watch?v=_Wk9T_G-u4o
2024-06-23 18:30:36.472 - Starting download: /watch?v=LtScY2guZpo
2024-06-23 18:30:36.484 - Starting download: /watch?v=CqcEW-jyDmo
2024-06-23 18:30:36.496 - Starting download: /watch?v=9D-nGIEq6Ms
2024-06-23 18:30:37.380 - Finished download: /watch?v=_Wk9T_G-u4o
2024-06-23 18:30:37.383 - Starting download: /watch?v=xFS7wthXIGg
2024-06-23 18:30:37.590 - Finished download: /watch?v=lXJpVz_ig2s
2024-06-23 18:30:37.593 - Starting download: /watch?v=UQeyU0YcPKY
2024-06-23 18:30:37.650 - Finished download: /watch?v=LtScY2guZpo
2024-06-23 18:30:37.653 - Starting download: /watch?v=_YPScrckx28
2024-06-23 18:30:37.717 - Finished download: /watch?v=9D-nGIEq6Ms
2024-06-23 18:30:37.720 - Starting download: /watch?v=aeHqYLgZP84
2024-06-23 18:30:37.729 - Finished download: /watch?v=CqcEW-jyDmo
2024-06-23 18:30:37.731 - Starting download: /watch?v=NYINnu_SWHk
2024-06-23

## Task 4

Develop a Python script to keep a log for each download.
- After downloading each video, create a logger to record which video was downloaded by which process or thread.
- Save the log entries to the same file, e.g., `download_log.txt`.
- For this script, you have to use `threads` and a `mutex`.
- The entries could be in the following format:

```bash
"Timestamp": 12:23, 21 May 2024, "URL":"http://www.youtube.com/1234", "Download":True
"Timestamp": 12:25, 21 May 2024, "URL":"http://www.youtube.com/1235", "Download":True
```

In [11]:
# Semaphore limiting the number of simultaneous downloads to 5, plus the
# mutex handed to the logging helper
manager = Manager()
download_semaphore = manager.Semaphore(5)
thread_lock = threading.Lock()

# Download the videos and write the log. Use the semaphore created in this
# cell so the cell does not depend on a `semaphore` variable left over from
# an earlier cell.
run_download_video_and_log(urls, output_path, download_semaphore, thread_lock)

18:30, 23 Jun 2024 - Thread 140695874615040 - Finished download: /watch?v=_Wk9T_G-u4o
18:30, 23 Jun 2024 - Thread 140695950116608 - Finished download: /watch?v=CqcEW-jyDmo
18:30, 23 Jun 2024 - Thread 140696283477760 - Finished download: /watch?v=xFS7wthXIGg
18:30, 23 Jun 2024 - Thread 140695866222336 - Finished download: /watch?v=lXJpVz_ig2s
18:30, 23 Jun 2024 - Thread 140695992063744 - Finished download: /watch?v=LtScY2guZpo
18:30, 23 Jun 2024 - Thread 140696328836864 - Finished download: /watch?v=9D-nGIEq6Ms
18:30, 23 Jun 2024 - Thread 140696241530624 - Finished download: /watch?v=UQeyU0YcPKY
18:30, 23 Jun 2024 - Thread 140696008849152 - Finished download: /watch?v=NYINnu_SWHk
18:30, 23 Jun 2024 - Thread 140696157636352 - Finished download: /watch?v=aeHqYLgZP84
18:30, 23 Jun 2024 - Thread 140696199583488 - Finished download: /watch?v=_YPScrckx28
18:30, 23 Jun 2024 - Thread 140696000456448 - Finished download: /watch?v=I1FKT8yHk4k
......................................................

## Task 5

Develop Python scripts to perform various video analysis tasks.
- After downloading a video, perform the following tasks.
- It is preferable to develop a separate script for each functionality.
- The five analysis subtasks that you have to develop include the following:
    - Extract audio from a video file.
    - Transcribe audio to text.
    - Perform the sentiment analysis on a video's content, extracting its polarity and sensitivity.
    - Translate the text into another language, e.g. Spanish.
    - Extract the emotions of a text.
- Each output task should store its results in a dedicated folder designated for each video, using the video title. Feel free to organise your folder structure as you prefer.
- You can use any library, including `moviepy` for loading video and `speech_recognition` or `textblob` for sentiment analysis.
- To implement the analysis subtasks, you must use at least one of the following libraries: `multiprocessing`, `threading`, or `asyncio`.
- You must compare serial, multiprocessing, threading, and concurrency for at least one of the subtasks, such as the extracting audio functionality. You do not have to do it for the rest of the subtasks.

In [12]:
# Directory holding the downloaded videos, and the directory the audio is written to
video_output_path = 'video_output/'
audio_output_path = 'audio_output/'

# Extract audio from a video file
# NOTE(review): run_extract_audio lives in video_processing.py (not visible here); the
# captured output suggests it writes one .wav per video and prints the elapsed time — confirm.
run_extract_audio(video_output_path, audio_output_path)

MoviePy - Writing audio in audio_output/Support Vector Machine (SVM) in 2 minutes.wav
MoviePy - Writing audio in audio_output/What is Big Data Analytics.wavMoviePy - Writing audio in audio_output/Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.wav

MoviePy - Writing audio in audio_output/Big Data Analytics for beginners.wav
MoviePy - Writing audio in audio_output/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.wavMoviePy - Writing audio in audio_output/MSc Health Data Science.wavMoviePy - Writing audio in audio_output/Meet Claire Monteleoni Editor in Chief of Environmental Data Science.wav


MoviePy - Writing audio in audio_output/What is Data Science - A day in the Life of a Data Scientist by IBM 4.wavMoviePy - Writing audio in audio_output/Bill Squadron – How big data analytics continues to change pro sports.wav

MoviePy - Writing audio in audio_output/Day in the Life Data Scientist.wav
MoviePy 

chunk:  34%|█████████████████▌                                 | 1177/3426 [00:00<00:00, 2814.90it/s, now=None]

MoviePy - Done.
Extracted audio to audio_output/Job Outlook for Data Science  UMBC.wav


                                                                                                               

MoviePy - Done.
Extracted audio to audio_output/Meet Claire Monteleoni Editor in Chief of Environmental Data Science.wav


chunk:  69%|███████████████████████████████████▏               | 2366/3426 [00:00<00:00, 2959.31it/s, now=None]

MoviePy - Done.
Extracted audio to audio_output/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.wav


                                                                                                               

MoviePy - Done.
Extracted audio to audio_output/Support Vector Machine (SVM) in 2 minutes.wav


                                                                                                               

MoviePy - Done.


                                                                                                               

Extracted audio to audio_output/What is Big Data Analytics.wav
MoviePy - Done.
Extracted audio to audio_output/Big Data Analytics for beginners.wav


                                                                                                               

MoviePy - Done.


                                                                                                               

MoviePy - Done.Extracted audio to audio_output/Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.wav

Extracted audio to audio_output/Day in the Life Data Scientist.wav


                                                                                                               

MoviePy - Done.
Extracted audio to audio_output/Bill Squadron – How big data analytics continues to change pro sports.wav


                                                                                                               

MoviePy - Done.
Extracted audio to audio_output/MSc Health Data Science.wav


                                                                                                               

MoviePy - Done.
Extracted audio to audio_output/What is Data Science - A day in the Life of a Data Scientist by IBM 4.wav
.................................................................
*  Extracting audio from videos took 1.95 second(s)
.................................................................


In [13]:
# Audio files are read from audio_output/ and the transcripts are written to text_output/
audio_output_path = 'audio_output/'
text_output_path  = 'text_output/'

# Transcribe audio to text
# NOTE(review): the captured output suggests one .txt transcript per .wav file — confirm
# against run_transcribe_audio in video_processing.py.
run_transcribe_audio(audio_output_path, text_output_path)

Transcribed 'audio_output/Job Outlook for Data Science  UMBC.wav' to 'text_output/Job Outlook for Data Science  UMBC.txt'
Transcribed 'audio_output/Meet Claire Monteleoni Editor in Chief of Environmental Data Science.wav' to 'text_output/Meet Claire Monteleoni Editor in Chief of Environmental Data Science.txt'
Transcribed 'audio_output/Big Data Analytics for beginners.wav' to 'text_output/Big Data Analytics for beginners.txt'
Transcribed 'audio_output/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.wav' to 'text_output/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.txt'
Transcribed 'audio_output/Day in the Life Data Scientist.wav' to 'text_output/Day in the Life Data Scientist.txt'
Transcribed 'audio_output/Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.wav' to 'text_output/Big Data Analytics For Business  What is Big Data Analytics  Big Data Trainin

In [14]:
# Transcripts are read from text_output/ and results are written under sentiment_analysis/
# NOTE(review): the name json_output_path suggests JSON result files — confirm in video_processing.py
text_input_path  = 'text_output/'
json_output_path = 'sentiment_analysis/'

# Perform the sentiment analysis on a video's content, extracting its polarity and sensitivity
# NOTE(review): "sensitivity" is the task's wording; TextBlob names its second score
# "subjectivity" — confirm which value run_analyze_sentiment actually stores.
run_analyze_sentiment(text_input_path, json_output_path)

.................................................................
*  Transcribing text from audio took 0.27 second(s)
.................................................................


In [15]:
# Transcripts are read from text_output/ and the translations are written to translated_text/
input_directory  = 'text_output/'
output_directory = 'translated_text/'

# Translate the text into another language, e.g. Spanish
# NOTE(review): no target language is passed here; the captured output reports Spanish,
# so it is presumably fixed inside run_translate_file — confirm.
run_translate_file(input_directory, output_directory)

Translated 'Job Outlook for Data Science  UMBC.txt' to Spanish and saved to 'translated_text/Job Outlook for Data Science  UMBC.txt'
Translated 'What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.txt' to Spanish and saved to 'translated_text/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.txt'
Translated 'Big Data Analytics for beginners.txt' to Spanish and saved to 'translated_text/Big Data Analytics for beginners.txt'
Translated 'What is Big Data Analytics.txt' to Spanish and saved to 'translated_text/What is Big Data Analytics.txt'
Translated 'Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.txt' to Spanish and saved to 'translated_text/Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.txt'
Translated 'Meet Claire Monteleoni Editor in Chief of Environmental Data Science.txt' to Spanish and saved to 'translated_t

In [16]:
# Transcripts are read from text_output/ and results are written to emotion_analysis/
# (this rebinds input_directory/output_directory from the translation cell)
input_directory  = 'text_output/' 
output_directory = 'emotion_analysis/'

# Extract the emotions of a text
# NOTE(review): the captured output suggests one "<title>_emotions.json" file per
# transcript — confirm against run_analyze_emotion in video_processing.py.
run_analyze_emotion(input_directory, output_directory)

Processed text_output/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka.txt: Emotions saved to emotion_analysis/What is Big Data  Big Data in 2 Minutes  Introduction to Big Data  Big Data Training  Edureka_emotions.json
Processed text_output/Job Outlook for Data Science  UMBC.txt: Emotions saved to emotion_analysis/Job Outlook for Data Science  UMBC_emotions.json
Processed text_output/Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn.txt: Emotions saved to emotion_analysis/Big Data Analytics For Business  What is Big Data Analytics  Big Data Training  Simplilearn_emotions.json
Processed text_output/What is Big Data Analytics.txt: Emotions saved to emotion_analysis/What is Big Data Analytics_emotions.json
Processed text_output/Meet Claire Monteleoni Editor in Chief of Environmental Data Science.txt: Emotions saved to emotion_analysis/Meet Claire Monteleoni Editor in Chief of Environmental Data Science_