In [49]:
import re
import urllib
import urllib.request

import pandas as pd

# Source CSV of the YouTube URLs to parse, fetched over HTTPS
youtube_urls_csv = 'https://s3.amazonaws.com/far-right/fourchan/youtube_urls.csv'
youtube_urls_df = pd.read_csv(youtube_urls_csv)

# Preview the first few rows to confirm the load worked
youtube_urls_df.head(5)

Unnamed: 0,url
0,https://www.youtube.com/watch?v=x4iBBfEHNaE
1,https://www.youtube.com/watch?v=uOiHtFVn2JA
2,https://youtu.be/C-GMn7vMZds
3,https://www.youtube.com/watch?v=fDEDQFj9sFk
4,https://youtu.be/k1IaFA2eTPI


In [50]:
# Look at a random sample to see which URL shapes occur beyond the first rows.
# A fixed seed keeps the displayed rows identical across Restart & Run All.
youtube_urls_df.sample(n=20, random_state=42)

Unnamed: 0,url
566,https://www.youtube.com/watch?v=khUuLP-XqWI
30861,https://youtu.be/Gn64WPzw6_I
15711,https://youtu.be/_r_32ZRIP4E?t=1m50s
2304,https://www.youtube.com/watch?v=ECAvNSho7gY
35638,https://www.youtube.com/watch?v=BhmBtPegx5Y
12477,https://www.youtube.com/watch?v=fvqQve1dpOk
25980,https://youtu.be/a4fcRoFqoyM
34386,https://www.youtube.com/watch?v=f4CGwSqrGq8
29025,https://www.youtube.com/watch?v=LoaB4gwQEpk
37037,https://www.youtube.com/watch?v=IPGPsgDTW6k


In [73]:
# Regex pattern to grab the URLs with video IDs
# Generally, these are 11-character strings of alphanumeric characters with _ and - mixed in.
# Written as a raw string so the regex escapes (\?, \w, \/ ...) are not read as
# (invalid) Python string escapes. It must always be used with re.VERBOSE.
id_pattern = r'''
    watch\?v=([\w\-]{11})               # Typical URL
    |\/v\/([\w\-]{11})                  # Typical URL Variant
    |youtu\.be\/([\w\-]{11})            # Shortened URL
    |\&amp;v=([\w\-]{11})               # Encoded URL
    |embed\/([\w\-]{11})                # Embedded URL
    |watch\%3Fv\%3D([\w\-]{11})         # Really nasty referral URL
    |savieo\.com\/youtube\/([\w\-]{11}) # Seems to be a site for linking to videos
'''

# Gotta be able to grab share URLs from the rejects
shared_url_pattern = r'shared\?ci=(.+)'

# Store failures to rummage through...
failed_urls = []


def _first_video_id(text):
    """Return the ID captured by whichever alternative of id_pattern matched, else None."""
    match = re.search(id_pattern, text, re.VERBOSE)
    if match is None:
        return None
    # Each alternative owns exactly one capture group, so a successful match
    # leaves exactly one group non-None; its position depends on the alternative.
    return next(group for group in match.groups() if group is not None)


def grab_youtube_id(url, timeout=10):
    """
    Given a URL, attempt to parse out a YouTube video ID.
    Falls back to attempt to resolve a shared video URL if the URL
    matches the pattern for it.

    Parameters
    ----------
    url : str
        The URL to parse. Non-string values (e.g. None or NaN from a
        DataFrame column) yield None and are not recorded as failures.
    timeout : float, optional
        Seconds to wait when resolving a share URL over the network.

    Returns
    -------
    str or None
        The 11-character video ID, or None if none could be extracted.

    Side effects
    ------------
    Every string URL that yields no ID is appended to the module-level
    `failed_urls` list. Share URLs trigger an HTTP request.
    """
    if not isinstance(url, str):
        # Missing values are not URLs; there is nothing to parse or to retry.
        return None

    video_id = _first_video_id(url)
    if video_id is not None:
        return video_id

    # Try to resolve a share URL to get the final video location
    if re.search(shared_url_pattern, url) is not None:
        try:
            # The context manager closes the HTTP response once the final
            # (post-redirect) URL has been read.
            with urllib.request.urlopen(url, timeout=timeout) as resolved_share:
                final_url = resolved_share.geturl()
        except Exception:
            # Best effort on scraped, possibly malformed URLs: any network or
            # URL error counts as a failed parse instead of aborting the whole
            # run. KeyboardInterrupt/SystemExit still propagate.
            final_url = None

        if final_url is not None:
            # Grab the actual video id from the resolved share link
            video_id = _first_video_id(final_url)
            if video_id is not None:
                return video_id

    # Append the rejects to the reject list!
    failed_urls.append(url)
    return None
        
def attempt_url_reconstruct(video_id):
    """
    Build the canonical watch URL for a video ID.

    Returns None when video_id is None, so rows without an extracted
    ID stay empty instead of raising.
    """
    if video_id is None:
        return None
    return 'https://www.youtube.com/watch?v=' + video_id

# Extract an ID per URL, then rebuild a canonical URL from each ID
extracted_ids = youtube_urls_df['url'].apply(grab_youtube_id)
youtube_urls_df['video_id'] = extracted_ids
youtube_urls_df['reconstructed_url'] = extracted_ids.apply(attempt_url_reconstruct)

# Report how many URLs could not be parsed D:
failed_count = len(failed_urls)
print(failed_count)

753


In [74]:
# Spot-check the extracted IDs and rebuilt URLs on a random sample.
# A fixed seed keeps the displayed rows identical across Restart & Run All.
youtube_urls_df.sample(n=200, random_state=42)

Unnamed: 0,url,video_id,reconstructed_url
3188,https://www.youtube.com/watch?v=FvX9OCwB5-o,FvX9OCwB5-o,https://www.youtube.com/watch?v=FvX9OCwB5-o
5015,https://www.youtube.com/watch?v=y9etfFM6nBE,y9etfFM6nBE,https://www.youtube.com/watch?v=y9etfFM6nBE
2827,https://www.youtube.com/watch?v=ArM7Jfcqr9s,ArM7Jfcqr9s,https://www.youtube.com/watch?v=ArM7Jfcqr9s
4326,https://www.youtube.com/watch?v=A9Wppr9d6aA,A9Wppr9d6aA,https://www.youtube.com/watch?v=A9Wppr9d6aA
21578,https://www.youtube.com/watch?v=8uqisTk3WT8,8uqisTk3WT8,https://www.youtube.com/watch?v=8uqisTk3WT8
16850,https://www.youtube.com/watch?v=94-K3odk0-4,94-K3odk0-4,https://www.youtube.com/watch?v=94-K3odk0-4
35742,https://www.youtube.com/watch?v=wMykYSQaG_c,wMykYSQaG_c,https://www.youtube.com/watch?v=wMykYSQaG_c
5690,http://m.youtube.com/watch?v=8yDarQW7UZc,8yDarQW7UZc,https://www.youtube.com/watch?v=8yDarQW7UZc
23580,https://www.youtube.com/watch?v=DcqglZv4kBE,DcqglZv4kBE,https://www.youtube.com/watch?v=DcqglZv4kBE
32849,https://www.youtube.com/channel/UCSJL4mrS3z_nn...,,


In [75]:
# Hide the common non-video URL families (user, channel, results, playlist)
# so the remaining rejects can be inspected for anything salvageable that
# would justify updating the master regex above
exclude_common = r'youtube\.com\/user|youtube\.com\/channel|youtube\.com\/results|youtube\.com\/playlist'
exclude_pattern = re.compile(exclude_common)
[rejected for rejected in failed_urls if exclude_pattern.search(rejected) is None]

['http://www.youtube.com/fringeelements',
 'https://rg3.github.io/youtube-dl/download.html',
 'https://www.youtube.com/watch?v=E5pMSCv',
 'https://board.freedomainradio.com/topic/41159-youtube-the-truth-about-israel-and-palestine/',
 'https://www.youtube.com/watch?v=0eatpiEp7',
 'https://www.youtube',
 'http://www.politico.eu/article/ismail-ilgun-vlogging-the-vote-controversial-dutch-youtuber-roils-elections/',
 'https://socialblade.com/youtube/user/stefbot/monthly',
 'youtube.com/c/sp00nexe',
 'https://socialblade.com/youtube/user/jontronshow/',
 'https://www.theguardian.com/commentisfree/2017/mar/16/youtube-google-hate-speech-moderator',
 'https://www.youtube.com/harmfulopinions',
 'https://socialblade.com/youtube/user/jontronshow',
 'www.youtube.com/watch?v=958EyS…',
 'https://www.youtube.com/watch?v=2OlS0HXs',
 'http://www.youtubemultiplier.com/5363fd3738c46--its-happening-.php',
 'https://youtu.be/MD6oDnm43H',
 'http://www.americanfreedomlawcenter.org/press-release/federal-governm

In [76]:
# Keep only the rows with no missing values, i.e. those where an ID was
# extracted and a URL rebuilt
extracted_and_reconstructed = youtube_urls_df.dropna()

# Compare the size before and after dropping the useless URLs
total_url_count = len(youtube_urls_df)
extracted_video_count = len(extracted_and_reconstructed)
print('Total URLs: {}'.format(total_url_count))
print('Number of Extracted Videos: {}'.format(extracted_video_count))

Total URLs: 40745
Number of Extracted Videos: 39992


In [77]:
# Display the frame with friendlier column names and string index labels.
# NOTE(review): the renamed frame is only displayed, never assigned, so
# `extracted_and_reconstructed` itself keeps the `url` / `video_id` names.
# Confirm whether the exported CSV was meant to carry the new names.
column_renames = {'url': 'original_url', 'video_id': 'youtube_id'}
extracted_and_reconstructed.rename(index=str, columns=column_renames)

Unnamed: 0,original_url,youtube_id,reconstructed_url
0,https://www.youtube.com/watch?v=x4iBBfEHNaE,x4iBBfEHNaE,https://www.youtube.com/watch?v=x4iBBfEHNaE
1,https://www.youtube.com/watch?v=uOiHtFVn2JA,uOiHtFVn2JA,https://www.youtube.com/watch?v=uOiHtFVn2JA
2,https://youtu.be/C-GMn7vMZds,C-GMn7vMZds,https://www.youtube.com/watch?v=C-GMn7vMZds
3,https://www.youtube.com/watch?v=fDEDQFj9sFk,fDEDQFj9sFk,https://www.youtube.com/watch?v=fDEDQFj9sFk
4,https://youtu.be/k1IaFA2eTPI,k1IaFA2eTPI,https://www.youtube.com/watch?v=k1IaFA2eTPI
5,https://www.youtube.com/watch?v=7liwQ8orNsY,7liwQ8orNsY,https://www.youtube.com/watch?v=7liwQ8orNsY
6,https://www.youtube.com/watch?v=XOibIxl3dLo,XOibIxl3dLo,https://www.youtube.com/watch?v=XOibIxl3dLo
7,https://www.youtube.com/watch?v=-SoIVS_gCZI,-SoIVS_gCZI,https://www.youtube.com/watch?v=-SoIVS_gCZI
8,https://www.youtube.com/watch?v=xKOQdcVoRys,xKOQdcVoRys,https://www.youtube.com/watch?v=xKOQdcVoRys
9,https://www.youtube.com/watch?v=fsF7enQY8uI,fsF7enQY8uI,https://www.youtube.com/watch?v=fsF7enQY8uI


In [78]:
# Write the successfully parsed rows to a CSV in the working directory
output_csv_path = 'parsed_urls.csv'
extracted_and_reconstructed.to_csv(output_csv_path)