-
Notifications
You must be signed in to change notification settings - Fork 16
/
link_checker_report.py
235 lines (189 loc) · 8.76 KB
/
link_checker_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import pandas as pd
import argparse
from portality import models
from portality.crosswalks.journal_form import JournalFormXWalk
from datetime import datetime
import os
"""
This script is the second step in generating a csv file that reports the validity of the urls in each journal.
First step is to execute 'journal_urls' script.
Execute this by passing required input files to generate a report of the urls along with journals information.
Steps to generate the report csv file:
1. Run journal_urls.py -> python portality/scripts/journal_urls.py
2. The above script will generate the doaj_journals_links.csv file at the location mentioned in the config for
'STORE_LOCAL_DIR'. Example location 'local_store/main/doaj_journals_links.csv'
3. The journal_urls.py script also generates the HTML files at the location from where the script is executed
4. Input the html files to the link checker tool (https://www.drlinkcheck.com/) by copying the html files to the server and giving their location to the
link checker tool
5. Run the link check on the tool and export the csv file to a local location
6. Run link_checker_report.py by passing the file locations as parameters.
ex: python portality/scripts/link_checker_report.py --file <links-doaj-link-check-test-2023-05-11_13-31-59.csv>
--journal_csv_file <local_store/main/doaj_journals_links.csv>
Provide the absolute paths for the files
7. Once the above script is run, final report csv file will be generated
"""
def log(msg):
    """
    Print a message to stdout, prefixed with the current UTC time in square brackets
    :param msg: text to print after the timestamp
    :return:
    """
    # NOTE: the timestamp uses a two-digit year (%y), e.g. 23-05-11T13:31:59Z
    stamp = datetime.utcnow().strftime("%y-%m-%dT%H:%M:%SZ")
    print("[{x}] {y}".format(x=stamp, y=msg))
def write_results(df, filename='multi_result.csv'):
    """
    Write results to a csv file, ordered by journal title
    :param df: DataFrame object holding the report rows
    :param filename: Output file name
    :return:
    """
    # Sort by journal title when that column is present. A frame without it (for example a
    # completely empty DataFrame) is written as-is rather than failing with a KeyError.
    if 'Journal title' in df.columns:
        df = df.sort_values(by='Journal title')
    # index=False: the DataFrame index is not written to the file
    df.to_csv(filename, index=False)
    print("Result CSV file has been written.")
def _get_link_type(link, journal):
    """
    Work out which journal form fields contain the given link
    :param link: the url to look for
    :param journal: journal object, converted to form data with JournalFormXWalk.obj2form
    :return: list of form field names: fields whose value equals the link come first, followed by fields whose
        value only starts with the link
    """
    form = JournalFormXWalk.obj2form(journal)

    locations = []  # fields whose value is exactly the link
    subs = []       # fields whose value is a longer string beginning with the link
    for k, v in form.items():
        if v is None:
            continue

        # plain string field
        if isinstance(v, str):
            if v == link:
                locations.append(k)
            elif v.startswith(link):
                subs.append(k)
            continue

        # anything that is neither a string nor a list cannot hold a link
        if not isinstance(v, list):
            continue

        # list field containing the link itself
        if link in v:
            locations.append(k)
            continue

        for e in v:
            if isinstance(e, dict):
                # list of sub-forms: report the sub-field name rather than the list field name
                for sk, sv in e.items():
                    if not isinstance(sv, str):
                        continue
                    if link == sv:
                        locations.append(sk)
                    elif sv.startswith(link):
                        subs.append(sk)
            elif isinstance(e, str) and e.startswith(link):
                # non-string list elements are skipped, matching the string checks on the other branches;
                # one prefix match is enough to record this field, so stop scanning the list
                subs.append(k)
                break

    return locations + subs
def fetch_matching_rows(journal_url_index, report_values):
    """
    Look up the journal that a checked url belongs to and build a one-row report entry for it
    :param journal_url_index: dict of url -> (journal title, added on date, last updated date, journal id,
        publisher, country of publisher)
    :param report_values: dict with the keys url, broken_check, redirect_url and redirect_type
    :return: single-row DataFrame describing the url, or an empty DataFrame if the url is not in the index
    """
    url = report_values["url"]
    journal_data = journal_url_index.get(url)
    if journal_data is None:
        return pd.DataFrame()

    report_row = pd.DataFrame(
        data=[list(journal_data)],
        columns=['Journal title', 'Added on Date', 'Last updated Date', "Journal ID", "Publisher",
                 "Country of publisher"]
    )
    jid = report_row["Journal ID"].values[0]

    # Find the form field(s) holding the url; these stay blank when the journal cannot be pulled
    # or the url is not found in any field
    types = []
    journal = models.Journal.pull(jid)
    if journal is not None:
        types = _get_link_type(url, journal)

    if len(types) > 0:
        primary_type = types[0]
        question_link = "https://doaj.org/admin/journal/" + jid + "#question-" + primary_type
    else:
        primary_type = ""
        question_link = ""

    # Extra report columns, added in this order after the journal columns
    extra_columns = {
        "DOAJ Form": "https://doaj.org/admin/journal/" + jid,
        "Form Field": question_link,
        'Url': url,
        'Type': primary_type,
        "Also present in": ", ".join(types),
        'BrokenCheck': report_values["broken_check"],
        'RedirectUrl': report_values["redirect_url"],
        'RedirectType': report_values["redirect_type"],
    }
    for column, value in extra_columns.items():
        report_row[column] = value
    return report_row
def _index_journals(df):
jidx = {}
for index, row in df.iterrows():
for cell in row:
# FIXME: assumes each URL only appears once
if isinstance(cell, str) and cell.startswith("http"):
# make an index of the URL to the journal title, added date, updated date and journal id
jidx[cell] = (row[0], row[50], row[51], row[54], row[9], row[10])
return jidx
def check_links(df, journal_url_index):
    """
    Go through the link checker export and collect a report entry for every link that needs attention
    :param df: DataFrame object of the csv file which is exported from link checker tool; the Url, BrokenCheck,
        RedirectUrl and RedirectType columns are read
    :param journal_url_index: url -> journal details lookup, passed on to fetch_matching_rows
    :return: DataFrame object of the results (empty if nothing matched)
    """
    total = len(df)
    matched = []
    for position, record in df.iterrows():
        status = record["BrokenCheck"]
        redirect = record["RedirectUrl"]
        has_redirect = isinstance(redirect, str)

        # link is fine and does not redirect: nothing to report
        if status == "OK" and not has_redirect:
            continue
        # redirects pointing at doaj.org are skipped
        if has_redirect and redirect.startswith("https://doaj.org"):
            continue

        log("checking row {x}/{y}: {a} {b}".format(x=position, y=total, a=status, b=redirect))

        found = fetch_matching_rows(journal_url_index, {
            'url': record["Url"],
            'broken_check': status,
            'redirect_url': redirect,
            'redirect_type': record["RedirectType"]
        })
        if not found.empty:
            matched.append(found)

    if not matched:
        return pd.DataFrame()
    return pd.concat(matched)
def generate_report(csv_files, journal_csv_file):
    """
    Generate a report in a format that is useful to analyze from the csv files exported from link checker tool
    :param csv_files: list of csv files exported from link checker tool
    :param journal_csv_file: journal csv file generated by the journal_urls.py script
    :return:
    """
    journal_df = pd.read_csv(journal_csv_file)
    log("Read journal file")
    journal_url_index = _index_journals(journal_df)
    log("Indexed journal urls")

    # Start from an empty frame carrying the journal columns, so those columns exist (and come first)
    # in the output even when no link matches
    frames = [pd.DataFrame(columns=['Journal title', 'Added on Date', 'Last updated Date', "Journal ID", "Publisher",
                                    "Country of publisher"])]
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        log("Checking file {x}".format(x=csv_file))
        frames.append(check_links(df, journal_url_index))
    log("All links checked")

    # Concatenate once at the end instead of re-copying the growing frame on every iteration
    write_results(pd.concat(frames))
if __name__ == "__main__":
    # Command line entry point: collect the link checker export file(s) and build the report from them
    parser = argparse.ArgumentParser()

    # Add arguments
    parser.add_argument('--file', help='Specify csv file location downloaded from link checker tool.')
    parser.add_argument("--dir", help="Directory where dr link checker report files are found")
    parser.add_argument("--prefix", help="Dr Link Checker file prefixes, if specifying the --dir option")
    # required: without it the report cannot be built (previously this surfaced as an error inside pandas)
    parser.add_argument('--journal_csv_file', required=True,
                        help='Specify the journal csv file location generated by journal_urls.py'
                             ' script')

    # Parse command-line arguments
    args = parser.parse_args()

    # Fail early with a usage message rather than passing a None file name further down
    if not (args.dir and args.prefix) and not args.file:
        parser.error("specify either --file, or both --dir and --prefix")

    log("start")

    files = []
    if args.dir and args.prefix:
        # --dir with --prefix takes precedence over --file: use every entry in the directory whose
        # name starts with the prefix
        for o in os.listdir(args.dir):
            if o.startswith(args.prefix):
                files.append(os.path.join(args.dir, o))
    else:
        files.append(args.file)

    log("Checking files: {x}".format(x=", ".join(files)))
    generate_report(files, args.journal_csv_file)
    log("end")