<a href="https://colab.research.google.com/github/Dawudis/ComCrawl-Meta-Data-Extraction/blob/main/Dawud's_(2_20_22)_using_comcrawl_package_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Comcrawling the Articles**

In [None]:
!pip install comcrawl

In [None]:
import pandas as pd
from comcrawl import IndexClient

In [None]:
#each entry names one Common Crawl index (crawl id); the available ids are
#listed on Common Crawl's Index Server
#"2020-10" selects CC-MAIN-2020-10, whose captures date from February 2020
crawl_ids = ["2020-10"]
client = IndexClient(crawl_ids)

#url pattern of the website you wish to crawl
site_pattern = "communityimpact.com/*"
client.search(site_pattern)

In [None]:
#load the search results stored on the client into a DataFrame
#(one row per result, one column per field - see the preview below)
df = pd.DataFrame(client.results)

In [None]:
#preview the first rows to see which columns the index returned
df.head()

Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,redirect,languages,encoding
0,"com,communityimpact)/",20200222222042,http://communityimpact.com/,text/html,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,584,1709926,crawl-data/CC-MAIN-2020-10/segments/1581875145...,https://communityimpact.com/,,
1,"com,communityimpact)/",20200222222042,https://communityimpact.com/,text/html,text/html,200,Q6NM7LCPD74GTA7QV7I6LPCZEYPFWPVW,19371,332083353,crawl-data/CC-MAIN-2020-10/segments/1581875145...,,eng,UTF-8
2,"com,communityimpact)/",20200222222042,https://www.communityimpact.com/,text/html,text/html,301,H3MHF6R2ACBXXMAIVWOSAGCQ4LVFPJ47,748,29853390,crawl-data/CC-MAIN-2020-10/segments/1581875145...,http://communityimpact.com/,,
3,"com,communityimpact)/2013/02/13/peoples-signat...",20200225023848,http://communityimpact.com/2013/02/13/peoples-...,text/html,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,611,1852266,crawl-data/CC-MAIN-2020-10/segments/1581875146...,https://communityimpact.com/2013/02/13/peoples...,,
4,"com,communityimpact)/2013/02/13/peoples-signat...",20200225023848,https://communityimpact.com/2013/02/13/peoples...,text/html,text/html,404,6FBU53S3XORGYAY2F6Z7WDNYGVXE4NFZ,18269,16292257,crawl-data/CC-MAIN-2020-10/segments/1581875146...,,,


# **Dataset Pre-Processing**

In [None]:
#ComCrawl gives lots of information within each row, but for this project, we only need the url column
#rebinding df (instead of inplace=True) together with errors='ignore' keeps this cell safe to re-run:
#columns that are already gone, or that this crawl never returned, are simply skipped
unused_columns = ['urlkey','timestamp','mime','mime-detected','status','digest','length','offset','filename','languages','encoding','redirect']
df = df.drop(columns=unused_columns, errors='ignore')

In [None]:
#taking a closer look, there are many duplicate urls that differ only in http vs https
#so we keep only the rows whose url uses the https scheme
#startswith checks the scheme itself; contains('https') would also keep an http url
#that merely mentions "https" somewhere in its path or query string
#na=False treats a missing url as "not https", so that row is dropped
data = df[df['url'].str.startswith('https://', na=False)]

In [None]:
#drop any extra duplicates
#.copy() makes data1 an independent frame, so adding the titles/descriptions columns
#to it later does not raise pandas' SettingWithCopyWarning
data1 = data.drop_duplicates().copy()

In [None]:
#preview the cleaned urls; the index keeps the original row labels,
#so gaps appear where rows were filtered out
data1.head()

Unnamed: 0,url
1,https://communityimpact.com/
2,https://www.communityimpact.com/
4,https://communityimpact.com/2013/02/13/peoples...
6,https://communityimpact.com/2013/04/30/luxury-...
8,https://communityimpact.com/2014/08/20/houston...


# **Meta-Data Extraction**

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [None]:
#code to get meta-titles
#each page is downloaded and its Open Graph title (<meta property="og:title">) is read;
#pages without one get the placeholder "No meta title given"
titles = []

for page_url in data1['url']:
  try:
    #timeout so a single unresponsive page cannot hang the whole loop
    reqs = requests.get(page_url, timeout=30)
  except requests.RequestException:
    #network failure: record the placeholder so titles stays aligned with data1['url']
    titles.append("No meta title given")
    continue
  soup = BeautifulSoup(reqs.text, 'html.parser')
  title = soup.find("meta", property="og:title")
  #.get() returns None instead of raising KeyError when the tag has no content attribute
  content = title.get("content") if title else None
  titles.append(content if content is not None else "No meta title given")

In [None]:
#make a column 'titles' with the extracted titles
#titles was built by looping over data1['url'], so it has one entry per row, in row order
data1['titles'] = titles

In [None]:
#code to get meta-descriptions
#each page is downloaded and its Open Graph description (<meta property="og:description">) is read;
#pages without one get the placeholder "No meta description given"
#NOTE(review): this downloads every page a second time (the meta-titles loop already
#fetched them) - consider extracting both values from a single request per url
descriptions = []

for page_url in data1['url']:
  try:
    #timeout so a single unresponsive page cannot hang the whole loop
    reqs = requests.get(page_url, timeout=30)
  except requests.RequestException:
    #network failure: record the placeholder so descriptions stays aligned with data1['url']
    descriptions.append("No meta description given")
    continue
  soup = BeautifulSoup(reqs.text, 'html.parser')
  description = soup.find("meta", property="og:description")
  #.get() returns None instead of raising KeyError when the tag has no content attribute
  content = description.get("content") if description else None
  descriptions.append(content if content is not None else "No meta description given")

In [None]:
#make a column 'descriptions' with the extracted descriptions
#descriptions was built by looping over data1['url'], so it has one entry per row, in row order
data1['descriptions'] = descriptions

In [None]:
#preview the urls together with the extracted titles and descriptions
data1.head()

Unnamed: 0,url,titles,descriptions
1,https://communityimpact.com/,No meta title given,No meta description given
2,https://www.communityimpact.com/,No meta title given,No meta description given
4,https://communityimpact.com/2013/02/13/peoples...,People's Signature Flooring,People's Signature Flooring
6,https://communityimpact.com/2013/04/30/luxury-...,Luxury car auction set for resort in Montgomery,Luxury car auction set for resort in Montgomery
8,https://communityimpact.com/2014/08/20/houston...,Houston battles human trafficking,Houston battles human trafficking
