forked from vitorfs/bootcamp
-
Notifications
You must be signed in to change notification settings - Fork 3
/
metadatareader.py
executable file
·125 lines (109 loc) · 4.71 KB
/
metadatareader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re
import subprocess
from subprocess import TimeoutExpired
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin
class Metadata:
    """Link-preview information describing a single web page.

    Every field defaults to an empty string at class level, so a freshly
    created instance is a valid "no metadata found" result.
    """

    url = ""
    type = ""  # https://ogp.me/#types
    title = ""
    description = ""
    image = ""

    def __str__(self):
        """Return a compact, human-readable dump of all fields.

        ``str.format`` is used instead of ``+`` concatenation so a field
        holding a non-string value (e.g. ``None``) does not raise
        ``TypeError``.
        """
        return (
            "{{url: {0.url}, type: {0.type}, title: {0.title}, "
            "description: {0.description}, image: {0.image}}}"
        ).format(self)
class Metadatareader:
    """Build link-preview metadata for URLs found in free text.

    All methods are static; the class is only a namespace. Pages are
    fetched by running the ``curl`` executable, so ``curl`` must be on the
    ``PATH`` (otherwise ``subprocess`` raises ``FileNotFoundError``).
    """

    # URL matcher: an explicit http/https/ftp scheme or a bare "word." prefix,
    # followed by a path that may contain balanced parentheses and must not
    # end in sentence punctuation. All groups are non-capturing so that
    # ``findall`` yields whole matches.
    # The trailing-parenthesis alternative previously read ``(?:\(?:`` (an
    # optional "(" followed by a literal ":"); it now mirrors the first
    # alternative.
    _URL_PATTERN = re.compile(
        r"(?:(?:https?|ftp):\/\/|\b(?:[a-z\d]+\.))(?:(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))?"
    )

    @staticmethod
    def get_metadata_from_url_in_text(text):
        """Return metadata for the first URL found in ``text``.

        :param text: free text that may contain URLs.
        :return: a ``Metadata`` instance; all fields are empty when ``text``
            contains no URL.
        """
        urls = Metadatareader.get_urls_from_text(text)
        if urls:
            return Metadatareader.get_url_metadata(urls[0])
        return Metadata()

    @staticmethod
    def get_urls_from_text(text):
        """Return every URL-like substring of ``text`` in order of appearance.

        :param text: the text to scan.
        :return: list of matched strings (empty when nothing matches).
        """
        return Metadatareader._URL_PATTERN.findall(text)

    @staticmethod
    def get_url_metadata(url):
        """Fetch ``url`` and extract its preview metadata.

        Redirects are resolved first, then the page is downloaded and parsed.
        Open Graph ``<meta>`` tags (https://ogp.me/) take priority; missing
        fields fall back to the ``<title>`` element, the first ``<img>`` that
        has a non-empty ``src``, and the visible text of ``<body>``.

        :param url: the address to inspect.
        :return: a populated ``Metadata`` instance.
        """
        url = Metadatareader.get_final_url(url)
        url_content = Metadatareader.get_url_content(url)
        soup = BeautifulSoup(url_content, "html.parser")

        metadata = Metadata()
        metadata.url = url
        metadata.type = "website"

        # Prioritize the Open Graph Protocol. Each lookup returns the current
        # value unchanged unless this tag carries the wanted property, so the
        # last matching tag wins.
        for meta in soup.find_all("meta"):
            metadata.type = Metadatareader.get_meta_property(meta, "og:type", metadata.type)
            metadata.title = Metadatareader.get_meta_property(meta, "og:title", metadata.title)
            metadata.description = Metadatareader.get_meta_property(meta, "og:description", metadata.description)
            metadata.image = Metadatareader.get_meta_property(meta, "og:image", metadata.image)

        if metadata.image:
            # og:image may be relative; make it absolute.
            metadata.image = urljoin(url, metadata.image)

        if not metadata.title and soup.title:
            # Fall back to the page title.
            metadata.title = soup.title.text

        if not metadata.image:
            # Fall back to the first <img> that actually has a source.
            # An <img> without ``src`` is skipped: joining ``None`` onto the
            # base would wrongly yield the page URL itself.
            for img in soup.find_all("img"):
                src = img.get("src")
                if src:
                    metadata.image = urljoin(url, src)
                    break

        if not metadata.description and soup.body:
            # Fall back to the visible body text, leaving out scripts,
            # styles and HTML comments.
            metadata.description = "".join(
                text
                for text in soup.body.find_all(string=True)
                if text.parent.name not in ("script", "style")
                and not isinstance(text, Comment)
            )

        if metadata.description:
            # Turn line breaks and tabs into spaces, then collapse runs of
            # spaces and trim the ends.
            metadata.description = re.sub(r"[\n\r\t]", " ", metadata.description)
            metadata.description = re.sub(r" +", " ", metadata.description)
            metadata.description = metadata.description.strip()

        return metadata

    @staticmethod
    def get_final_url(url, timeout=5):
        """Return the address ``url`` ends up at after following redirects.

        Runs ``curl -Ls -I`` (follow redirects, silent, headers only) and
        walks the ``Location`` headers of every hop.

        :param url: the starting address.
        :param timeout: seconds to wait for curl before killing it; whatever
            output was produced up to that point is still used.
        :return: the last redirect target, or ``url`` itself when no
            ``Location`` header was received.
        """
        raw_headers = Metadatareader._run_curl(
            [
                "-Ls",  # follow redirects, silently
                "-I",  # headers only, don't download the body
                url,
            ],
            timeout,
        )
        return Metadatareader._resolve_final_url(url, raw_headers)

    @staticmethod
    def get_url_content(url, timeout=5):
        """Download ``url`` with curl and return the raw output bytes.

        NOTE(review): ``-i`` makes curl include the response headers in the
        output ahead of the HTML, and ``-k`` disables TLS certificate
        verification. Both are kept as in the original invocation; confirm
        they are intended.

        :param url: the address to download.
        :param timeout: seconds to wait for curl before killing it; partial
            output is returned in that case.
        :return: curl's standard output as ``bytes``.
        """
        return Metadatareader._run_curl(
            [
                "-i",
                "-k",  # ignore ssl certificate requisite
                "-L",  # follow redirects
                url,
            ],
            timeout,
        )

    @staticmethod
    def get_meta_property(meta, property_name, default_value=""):
        """Return the ``content`` of ``meta`` if it carries ``property_name``.

        :param meta: a parsed tag exposing an ``attrs`` mapping.
        :param property_name: the wanted ``property`` value, e.g. ``og:title``.
        :param default_value: returned when the tag has a different (or no)
            ``property``, or when it has no ``content`` attribute.
        :return: the tag's ``content`` value or ``default_value``.
        """
        if meta.attrs.get("property") == property_name:
            # A matching tag without ``content`` must not raise KeyError.
            return meta.attrs.get("content", default_value)
        return default_value

    @staticmethod
    def _run_curl(arguments, timeout):
        """Run ``curl`` with ``arguments`` and return its stdout bytes.

        When ``timeout`` seconds elapse the process is killed and the output
        collected so far is returned.
        """
        with subprocess.Popen(
            ["curl"] + list(arguments),
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        ) as proc:
            try:
                out, _ = proc.communicate(timeout=timeout)
            except TimeoutExpired:
                proc.kill()
                out, _ = proc.communicate()
        return out

    @staticmethod
    def _resolve_final_url(url, raw_headers):
        """Follow the ``Location`` headers in ``raw_headers`` starting at ``url``.

        ``raw_headers`` holds the concatenated response headers of every hop.
        Header names are compared case-insensitively (HTTP/2 header names are
        lowercase) and each target is joined onto the previous address so
        relative redirects resolve correctly. The last target wins.
        """
        final_url = url
        for line in raw_headers.decode("utf-8", errors="replace").splitlines():
            name, separator, value = line.partition(":")
            if separator and name.strip().lower() == "location":
                target = value.strip()
                if target:
                    final_url = urljoin(final_url, target)
        return final_url