# URL Detector

In [13]:
import os
import cv2
import numpy as np

# Preprocessing
from PIL import Image, ImageEnhance, ImageOps, ImageDraw, ImageFont
from tensorflow.keras.preprocessing import image

# CNN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model

# OCR
import pytesseract
import re

# Info
from datetime import timedelta
from tqdm import tqdm
from termcolor import colored

# Define necessary functions

<h3>split_video()</h3>

Takes in a video and splits it into frames.

<b>Parameters:</b>

- file(str): Path to video file to split
- increment(int): The number of frames the function should skip between each capture

In [2]:
def split_video(file, increment):
    
    position = 0
    current_iteration = 0
    
    if file.endswith('.mp4'):
        # Load video
        video_path = os.path.join(input_path, file)
        capture = cv2.VideoCapture(video_path)

        # Get the total number of frames in the video + Create progress bar
        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        pbar = tqdm(total=total_frames // increment, desc=f'[1/3]Processing video: {file}', position=position)

        # Loop through every nth frame in the video
        frameNum = 0

        while True:
            # Skip frames that do not need to be read
            if frameNum % increment != 0:
                ret = capture.grab()
                if not ret:
                    break
                frameNum += 1
                continue

            # Read the next frame
            ret, frame = capture.read()

            # Exit if reached last frame
            if not ret:
                break

            # Save to backgrounds directory
            milis = (capture.get(cv2.CAP_PROP_POS_MSEC)) * 0.001
            timestamp = str(timedelta(seconds=milis)).replace(':', '-')
            
            frame_path = f'{os.path.join(process_path, timestamp)}.jpg'
            cv2.imwrite(frame_path, frame)

            # Increment frame
            frameNum += 1
            pbar.update(1)
            pbar.set_postfix(frame=frameNum)

        # Release capture
        capture.release()
        pbar.close()

<h3>clear()</h3>

The clear function deletes every file in the process directory. This is to be used after the URL is already extracted to free space, as the frames are no longer needed.

In [3]:
def clear():
    """
    Remove all temporary files in the specified directory.
    """
    for file in os.listdir(process_path):
        os.remove(os.path.join(process_path, file))

<h3> preprocess_images() </h3>
    
preprocesses images for cnn use

<b>Parameters:</b>

- _dir(str): Path to input directory

In [4]:
# Load and preprocess all images in dir
def preprocess_images(_dir):
    image_files = os.listdir(_dir) 
    images = []
    for image_file in image_files:
        img = image.load_img( os.path.join(_dir, image_file) , target_size=(224, 224))
        img_array = image.img_to_array(img) 
        img_array = np.expand_dims(img_array, axis=0)
        img_array = tf.keras.applications.mobilenet.preprocess_input(img_array)
        images.append(img_array)
    return np.vstack(images)

<h3> predict_url() </h3>

Uses pytesseract to predict the text and bounding boxes of the url contained in the image if any.  

<b>Parameters:</b>

- img(str): Path to image file
- spread(float): Number of pixels to add to bounding box coordinates for displaying URL text

In [5]:
def predict_url(img, spread):
    img = Image.open(img)
    
    ## Process the Image
    # Greyscale
    img = img.convert('L')

    # Threshold
    img = img.point( lambda p: 255 if p > 180 else 0)

    ## Predict URL
    text = pytesseract.image_to_string(img)
    # Predict bbox
    word_box = pytesseract.image_to_boxes(img).split('\n')
    
    # Remove non URL text
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    urls = url_pattern.findall(text)
    
    ## Bbox stuff
    
    # Find word
    if len(urls) > 0:
        url = urls[0]
        url_box = None

        score = 0
        url_array = []
        for box in word_box:
            if score != len(url):
                if box.split(' ')[0] == url[score]:
                    score += 1
                    url_array.append(box.split(' '))
                else:
                    score = 0
                    url_array = []

        # Find bounding box of full url
        h = img.height

        x1, y1, x2, y2 = int(url_array[0][1]) - spread, h - int(url_array[0][2]) - spread, int(url_array[-1][3]) + spread, h - int(url_array[-1][4]) + spread

        return [urls, (x1, y1, x2, y2)]
    else: return['null','null']

<h3> grab_urls() </h3>

Extracts URLs from overlayed frames of a video file and displays the URLs on the frames.  

<b>Parameters:</b>
- video (str): path to video file  
- increment (int): number of skipped frames between extracted frames (default: 60)  
- spread (float): number of pixels to add to bounding box coordinates for displaying URL text (default: 35)  
- cleanup (bool): whether to remove temporary files created by function (default: True)  
- uniqueonly (bool): whether to remove redundant entries (default: True)  
- preview (bool): whether to display preview images with URLs and bounding boxes (default: False)  
- output_path (str): path to save preview images (default: 'output') 

In [6]:
def grab_urls(video, increment=60, spread=35, cleanup=True, uniqueonly=True, preview=False, output_path='output'):
    
    assert os.path.isfile(os.path.join(input_path, video))
    
    # Split video into frames
    split_video(video, increment)
    
    # Preprocess images for CNN use
    preprocessed_images = preprocess_images(process_path)
    
    # Make predictions
    print('[2/3]Predicting overlayed frames...')
    predictions = model.predict(preprocessed_images)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Identify predicted overlayed frames
    directory = os.listdir(process_path)
    overlayed = [directory[i] for i, label in enumerate(predicted_labels) if label == 1]
    
    # Extract URLs
    urls = []
    bboxes = []
    times = []
    nulls = []
    
    print('[3/3]Predicting URLs...')
    for img in overlayed:
        # Predict URL
        URL = predict_url(os.path.join(process_path, img), spread=spread)
        if URL[0] != 'null':
            if (URL[0] not in urls) or not uniqueonly:
                urls.append(URL[0])
                bboxes.append(URL[1])
                times.append(img.replace('-', ':').replace('.jpg', ''))
        else:
            nulls.append(img.replace('-', ':').replace('.jpg', ''))
    
    # Display information
    
    #times = [timestamp.replace('-', ':').replace('.jpg', '') for timestamp in overlayed]
    times_str = ' , '.join(times)
    
    print(f'\n\nFound {len(times)} overlayed frames at timestamps: {times_str}')
    for url, bbox in zip(urls, bboxes):
        print(url, bbox)
        
    # Display image
    if preview:
        font = ImageFont.truetype('arial.ttf', 27)
        for img, url, bbox in zip(times, urls, bboxes):
            prev = Image.open(os.path.join(process_path, f"{img.replace(':', '-')}.jpg"  )).copy()
            draw = ImageDraw.Draw(prev)
            draw.rectangle(bbox, outline="red", width=5)
            draw.text((bbox[0] + spread, bbox[1] - 35), str(url), font=font, fill='red')
            prev.save(os.path.join(output_path, f"{video.replace('.mp4', '')}  {img.replace(':', '-')}.jpg"))
    
    # Remove temporary files if cleanup is enabled
    if cleanup:
        clear()
        
    return [times,urls,bboxes, nulls, '']

    

In [7]:
def print_json(_json_):
    print('\n█════════════════════════════════════════════════█ JSON OUTPUT █════════════════════════════════════════════════█\n')
    for filename, data in _json_.items():
        print('---------------------\n',filename,'\n---------------------\n')

        # Iterate over the keys in the data dictionary for the current file
        for key, values in data.items():
            print(key,':')

            if key == 'nulls':
                print(colored(' , '.join(values), 'red'),'\n') if  len(values) > 0 else print(colored('none','green'),'\n') 

            else:
                print('   URL: ', ' , '.join(values['url']) )
                print('   Coords: ', values['coords'] )
                print()
    

# Make Predictions

In [14]:
import requests
import json

from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

from reportlab.platypus import Image as RLImage

from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

from bs4 import BeautifulSoup

In [17]:
model = load_model(os.path.join('resources','URL_Grabber_model_v2.h5'))

In [10]:
# Make sure needed directories actually exists

if os.path.isdir('input') is False:
    os.mkdir('input')
if os.path.isdir('process') is False:
    os.mkdir('process')
if os.path.isdir('output') is False:
    os.mkdir('output')

input_path = 'input'
process_path = 'process'

In [18]:
# Create dict
keyeddata = {}
for video in os.listdir(input_path):

    urldata = grab_urls(video, preview=True)
    
    # Create entry for video + add nulls
    keyeddata[video] = {}
    keyeddata[video]['nulls'] = urldata[3]
    
    # Add info for each respective timestamp
    for i in range(len(urldata[0])):
        keyeddata[video][urldata[0][i]] = dict(zip(['url','coords'],[urldata[1][i],urldata[2][i]]))

# Save JSON
with open("output/url data.json", "w") as output:
    json.dump(keyeddata, output)
    
# Print it out in a way humans can read without sprouting a brain tumor
print_json(keyeddata)

[1/3]Processing video: 1665515252406.mp4: 76it [00:06, 11.17it/s, frame=4501]                                          


[2/3]Predicting overlayed frames...
[3/3]Predicting URLs...


Found 3 overlayed frames at timestamps: 0:01:12 , 0:01:20 , 0:01:28
['https://json-schema.org'] (-1, 632, 362, 688)
['https://cuelang.org'] (-1, 632, 307, 689)
['https://marshmallow.readthedocs.io/en/stable'] (-1, 632, 643, 689)


[1/3]Processing video: 1671575434239.mp4: 44it [00:03, 12.06it/s, frame=2581]                                          


[2/3]Predicting overlayed frames...
[3/3]Predicting URLs...


Found 1 overlayed frames at timestamps: 0:01:20
['https://github.com'] (0, 592, 301, 649)

█════════════════════════════════════════════════█ JSON OUTPUT █════════════════════════════════════════════════█

---------------------
 1665515252406.mp4 
---------------------

nulls :
[31m0:00:50 , 0:00:52 , 0:01:18 , 0:01:24 , 0:01:26 , 0:01:42 , 0:01:44 , 0:02:14 , 0:02:16[0m 

0:01:12 :
   URL:  https://json-schema.org
   Coords:  (-1, 632, 362, 688)

0:01:20 :
   URL:  https://cuelang.org
   Coords:  (-1, 632, 307, 689)

0:01:28 :
   URL:  https://marshmallow.readthedocs.io/en/stable
   Coords:  (-1, 632, 643, 689)

---------------------
 1671575434239.mp4 
---------------------

nulls :
[32mnone[0m 

0:01:20 :
   URL:  https://github.com
   Coords:  (0, 592, 301, 649)



# Inspect URLs

In [11]:
def check_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return True
        else:
            return False
    except:
        return False

In [12]:
def get_title_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.title.string.strip()
    simplified_title = title.split(':')[0].split(' - ')[0].split(' | ')[0].split(' – ')[0]
    return simplified_title

In [13]:
with open('output/url data.json', 'r') as data:
    json_data = json.load(data)

In [14]:
working = []
broken = []
for filename, data in json_data.items():
    #print(f'{filename}:')
    for key, values in data.items():
        if key != 'nulls':
            for url in values['url']:
                status = check_url(url)
                #print(f"   {colored('works!', 'green') if check_url(url) else colored('broken', 'red')}  {url}")
                
                (working if status else broken).append(url)
                
print('found a total of', colored(len(working), 'green'),'working, and', colored(len(broken), 'red'),'broken:\n\n',broken)

found a total of [32m4[0m working, and [31m0[0m broken:

 []


# Create PDF

In [57]:
# Define font paths
folder = os.path.join('resources','fonts')
fontFiles = [{'SSP-regular': os.path.join(folder, 'SourceSansPro-Light.ttf')},
             {'SSP-bold': os.path.join(folder, 'SourceSansPro-Semibold.ttf')},
             {'SSP-italic': os.path.join(folder, 'SourceSansPro-Lightit.ttf')},
             {'SSP-bolditalic': os.path.join(folder, 'SourceSansPro-Semiboldit.ttf')}]

# Register fonts
for fontFile in fontFiles:
    (fontName, filePath), = fontFile.items()
    print(fontName, filePath)
    pdfmetrics.registerFont(TTFont(fontName, filePath))
    
# Create styles
styles = {
    'default': ParagraphStyle(
        'default',
        fontName='SSP-regular',
        fontSize=12,
        leading=14,
    )
}

# Create tags for easy use
tags = {
    'b': "<font name='SSP-bold'>",
    'i': "<font name='SSP-italic'>",
    'bi': "<font name='SSP-bolditalic'>",
}


SSP-regular resources\fonts\SourceSansPro-Light.ttf
SSP-bold resources\fonts\SourceSansPro-Semibold.ttf
SSP-italic resources\fonts\SourceSansPro-Lightit.ttf
SSP-bolditalic resources\fonts\SourceSansPro-Semiboldit.ttf


In [77]:
doc = SimpleDocTemplate("output/links.pdf", pagesize=letter, leftMargin=45)

DOCUMENT = []

# add the image to the document
header_image = Image("resources/images/demo header.png", width=letter[0], height=letter[1]*0.16 )
header_image._offs_y = 80
header_image._offs_x = 10
DOCUMENT.append(header_image)
DOCUMENT.append(Spacer(0, -60))

# create a hyperlink
for filename, data in json_data.items():
    for key, values in data.items():
        if key != 'nulls':
            url = values['url'][0]
            
            text = f"{tags['b']}{get_title_from_url(url)} –</font> <a href='{url}'> {url} </a> <br /><br />"
            DOCUMENT.append(Paragraph(text, styles['default']))

doc.build(DOCUMENT)