In [1]:
import pandas as pd
import numpy as np
import re
import time
from urllib.parse import urlparse

In [2]:
def extract_urls(s):
    """Find all URLs in a string and return them in a list."""
    if pd.isnull(s):
        return []
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    urls = re.findall(url_pattern, s)
    return [url.replace("[", "").replace("]", "").replace("(", "").replace(")", "") for url in urls]


In [3]:
def get_domain(url):
    """Get the domain of a URL"""
    parsed_uri = urlparse(url)
    domain = '{uri.netloc}'.format(uri=parsed_uri)
    return domain

In [4]:
tic = time.perf_counter()

data = pd.read_csv('extracted_RC_2022-11.csv0.csv')
sites = pd.read_excel('news_sites.xlsx')
sites_set = set(sites['domain'])

# Drop NaN and reset index
data = data.dropna(subset = ['body']).reset_index(drop=True)

data['urls'] = data['body'].apply(extract_urls)

urls_data_list = []

for i, row in data.iterrows():
    urls = row['urls']
    if urls:  # only process rows where URLs are found
        domains = list(map(get_domain, urls))
        for url, domain in zip(urls, domains):
            if domain in sites_set:
                urls_data_list.append({'author': row['author'],'author_fullname': row['author_fullname']
                                       ,'id':row['id'],'parent_id':row['parent_id'],'body':row['body']
                                       ,'score':row['score'],'subreddit':row['subreddit']
                                       ,'subreddit_id':row['subreddit_id']
                                       ,'url': url, 'domain': domain})

# Create new DataFrame for URLs and domains
urls_data = pd.DataFrame(urls_data_list)

# Merge new DataFrame with the sites DataFrame to get the domain properties
urls_data = pd.merge(urls_data, sites, on='domain', how='left')
toc = time.perf_counter()
print(f"code run in {toc - tic:0.4f} seconds")

urls_data

code run in 43.3629 seconds


Unnamed: 0,author,author_fullname,id,parent_id,body,score,subreddit,subreddit_id,url,domain,fake,lowcred,reputable,satire
0,AutoModerator,t2_6l4z3,iukc3a4,t3_yit2sh,"**For your safety, we recommend you to decline...",1,Puberty,t5_2xw8u,https://reddit.com/r/Puberty/comments/so2ho8/h...,reddit.com,0,0,1,0
1,AutoModerator,t2_6l4z3,iukc5kp,t3_yit3d7,"**Hello, is this thing on? Join our official D...",1,TeensMeetTeens,t5_20ywkr,https://reddit.com/message/compose?to=/r/Teens...,reddit.com,0,0,1,0
2,AutoModerator,t2_6l4z3,iukc6qq,t3_yit3l0,Join the [discord server](https://discord.gg/w...,1,AceAttorneyCirclejerk,t5_39d8y,https://reddit.com/r/AceAttorneyCirclejerk/com...,reddit.com,0,0,1,0
3,AutoModerator,t2_6l4z3,iukc78f,t3_yit3of,Welcome back to r\/RandomActsOfBlowJob!\n\n[Ot...,1,RandomActsOfBlowJob,t5_2tpfa,https://reddit.com/r/RandomActsOfBlowJob/searc...,reddit.com,0,0,1,0
4,AutoModerator,t2_6l4z3,iukc78f,t3_yit3of,Welcome back to r\/RandomActsOfBlowJob!\n\n[Ot...,1,RandomActsOfBlowJob,t5_2tpfa,https://reddit.com/r/RandomActsOfBlowJob/searc...,reddit.com,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4555,BubbaTee,t2_5hb1w,iuncmsq,t1_iun2tfu,Covid has disproportionately killed black Amer...,-1,nba,t5_2qo4s,https://abcnews.go.com/Politics/coronavirus-di...,abcnews.go.com,0,0,1,0
4556,WikiSummarizerBot,t2_bci24ojc,iuncoc4,t1_iuncmoi,**[1828 United States presidential election in...,1,YAPms,t5_21oyw3,https://reddit.com/message/compose?to=WikiSumm...,reddit.com,0,0,1,0
4557,drainmond,t2_2mzg9ppc,iuncoo5,t1_iukdzau,Check out: [https://medium.com/@mjbleong/87e06...,2,creativewriting,t5_2r69u,https://medium.com/@mjbleong/87e061e18bafhttps...,medium.com,0,0,1,0
4558,Azsunyx,t2_63lot,iuncoug,t1_iunbiz9,He had 18 years of recovery post stroke. Cogni...,1,PoliticalHumor,t5_2qm21,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,www.ncbi.nlm.nih.gov,0,0,1,0
