-
Notifications
You must be signed in to change notification settings - Fork 0
/
InvestigatorOPP.py
151 lines (115 loc) · 6.43 KB
/
InvestigatorOPP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from bs4 import BeautifulSoup
from bs4.element import AttributeValueWithCharsetSubstitution
import requests
import lxml
import openpyxl
from bs4 import BeautifulSoup
from bs4.element import AttributeValueWithCharsetSubstitution
##-------------------------------------------------------------------------------------------------------
class website():
    """A searchable website plus the recipe for scraping its results.

    Each instance knows how to turn a search term into a results page URL
    and how to extract [headline, link] pairs from that page.

    scrapingType selects one of two strategies:
      - "basic":  find_all() by tag + CSS class, then recover the href
                  from the element's raw text.
      - "select": self.tag is a CSS selector template containing the
                  placeholder "~°Ñ°~", substituted with indices 0..9.
    """
    def __init__(self, name, searchLink, link, tag, chosenClass, scrapingType):
        self.name = name                   # display name (spreadsheet column header)
        self.searchLink = searchLink       # search URL template; "~°Ñ°~" marks the query slot
        self.link = link                   # site base URL, prepended to relative hrefs
        self.tag = tag                     # HTML tag ("basic") or CSS selector template ("select")
        self.chosenClass = chosenClass     # CSS class filter used by the "basic" strategy
        self.scrapingStyle = scrapingType  # "basic" or "select"
    def checkIfHasLink(self, argument):
        """Return `argument` if it already is an absolute URL, otherwise
        prefix it with the site's base link.

        Meant to normalise hrefs that are site-relative paths or IDs.
        """
        # Single-slash prefixes kept deliberately lenient, matching the
        # original behaviour for slightly malformed URLs.
        if argument.startswith(("http:/", "https:/")):
            return argument
        return self.link + argument
    def scraping(self, argument):
        """Search the site for `argument` and return a list of
        [headline, link] pairs.

        self.searchLink is restored before returning so the instance can
        be reused for subsequent searches.
        """
        # Saved so the URL template survives the placeholder substitution.
        originalSearchLink = self.searchLink
        # "~°Ñ°~" marks where the search term belongs in the URL template.
        self.searchLink = self.searchLink.replace("~°Ñ°~", argument)
        r = requests.get(self.searchLink)
        soup = BeautifulSoup(r.content, "lxml")
        contentO = []
        if self.scrapingStyle == "basic":
            # .get("href") / ["href"] were unreliable on these pages, so the
            # href is recovered from the element's raw text instead.
            allHtml = soup.find_all(f"{self.tag}", class_=f"{self.chosenClass}")
            for element in allHtml:
                head = element.get_text().strip()
                parts = str(element).split('"')
                # The href value sits one token after the ' href=' fragment.
                try:
                    indexNumber = parts.index(" href=") + 1
                except ValueError:
                    # No link could be located; stop collecting results.
                    break
                link = self.checkIfHasLink(parts[indexNumber])
                # Each result is a [headline, link] pair.
                contentO.append([head, link])
        elif self.scrapingStyle == "select":
            # CSS-selector strategy: substitute result indices 0..9 into the
            # selector template and collect every match.
            for i in range(10):
                cssSelector = self.tag.replace("~°Ñ°~", f"{i}")
                for element in soup.select(cssSelector):
                    contentO.append([element.get_text().strip(),
                                     self.checkIfHasLink(element.get("href"))])
        # Restore the template for the next call.
        self.searchLink = originalSearchLink
        return contentO
#-------------------------------------------------------------------------------------------------------
# Site catalogue. Arguments: name, search URL template ("~°Ñ°~" is the query
# placeholder), base link for relative hrefs, tag or CSS selector template,
# CSS class filter (only used by the "basic" strategy), scraping strategy.
scholar = website( "Scholar", "https://scholar.google.com/scholar?hl=es&as_sdt=0%2C5&q=~°Ñ°~&btnG=", "",
"h3", "gs_rt", "basic")
researchgate = website("ResearchGate","https://www.researchgate.net/search/publication?q=~°Ñ°~", "https://www.researchgate.net/", "div",
"nova-legacy-e-text nova-legacy-e-text--size-l nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit nova-legacy-v-publication-item__title",
"basic")
basesearch = website("Bielefeld-Academic-Search-Engine", "https://www.base-search.net/Search/Results?lookfor=~°Ñ°~&name=&oaboost=1&newsearch=1&refid=dcbasen",
"https://www.base-search.net/" ,"a" ,"bold", "basic")
# The "select" sites use the placeholder inside an nth-child() selector,
# iterated over result indices by website.scraping().
pubmed = website("Pubmed", "https://pubmed.ncbi.nlm.nih.gov/?term=~°Ñ°~", "https://pubmed.ncbi.nlm.nih.gov/",
"article.full-docsum:nth-child(~°Ñ°~) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1)", "", "select")
elsevier = website("Elsevier", "https://www.elsevier.com/search-results?query=~°Ñ°~", "https://www.elsevier.com/",
"article.search-result:nth-child(~°Ñ°~) > header:nth-child(1) > h2:nth-child(1) > a", "", "select")
libgen = website("Libgen", "https://libgen.is/scimag/?q=~°Ñ°~",
"https://libgen.is", ".catalog > tbody:nth-child(2) > tr:nth-child(~°Ñ°~) > td:nth-child(2) > p:nth-child(1) > a:nth-child(1)", "", "select")
# Order determines spreadsheet column order and progress percentage.
listOfPages = [scholar, researchgate, pubmed, elsevier, libgen, basesearch]
#-------------------------------------------------------------------------------------------------------
def progressBar(page):
    """Print a progress banner showing how far through listOfPages the
    scraper currently is.

    `page` is the website object being processed; its index within
    listOfPages determines the displayed percentage.
    """
    position = listOfPages.index(page)
    # Derive the step from the list length instead of the old hard-coded
    # 16.7, so the percentage stays correct if sites are added or removed.
    percent = round(position * (100 / len(listOfPages)), 2)
    print(f"""
------------------Progress------------------
:::::::::::::::::::::{percent}%:::::::::::::::::
--------------------------------------------
""")
#-------------------------------------------------------------------------------------------------------
def writing(argument):
    """Scrape every site in listOfPages for `argument` and save the
    results to INVESTIGATOR.xlsx.

    Layout: column 1 numbers the result slots (one result every 3 rows);
    each site gets its own column, row 1 holding the site name and each
    result occupying a headline row followed by a link row.
    """
    maxResults = 15  # result slots numbered down column 1 (3 rows each; was hard-coded range(1, 46))
    # Create the workbook and grab its default sheet.
    my_wb = openpyxl.Workbook()
    my_sheet = my_wb.active
    # Number the result slots: every third row gets its (integer) index.
    for n in range(1, (maxResults * 3) + 1):
        if n % 3 == 0:
            my_sheet.cell(row=n, column=1).value = n // 3  # int index, not the old float n/3
    columnCounter = 2
    for webpage in listOfPages:
        # Row 1 of each column holds the site's name.
        my_sheet.cell(row=1, column=columnCounter).value = webpage.name
        results = webpage.scraping(argument)
        rowCounter = 3
        for head, link in results:
            my_sheet.cell(row=rowCounter, column=columnCounter).value = head
            my_sheet.cell(row=(rowCounter + 1), column=columnCounter).value = link
            rowCounter += 3
        columnCounter += 1
        progressBar(webpage)
    my_wb.save("INVESTIGATOR.xlsx")
    # Final banner; "Progress" (was the leftover Spanish "Progreso") to
    # match progressBar's wording.
    print("""
------------------Progress------------------
:::::::::::::::::::::100%:::::::::::::::::::::
--------------------------------------------
""")
#-------------------------------------------------------------------------------------------------------
def initialization():
    """Prompt the user for a search term and run the full scrape-and-save
    pipeline via writing()."""
    inp = input("Enter your search term: ")
    writing(inp)
# Only auto-run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    initialization()