From cc4a6fc68394c357db589388e50b6a2a7e1433d5 Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Wed, 15 Jan 2014 20:49:01 +0000 Subject: [PATCH 1/7] Added ruleset merger script Merger script added. TODO: fix it to work with the correct tree --- utils/merger.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100755 utils/merger.py diff --git a/utils/merger.py b/utils/merger.py new file mode 100755 index 000000000000..57f8e4ad0075 --- /dev/null +++ b/utils/merger.py @@ -0,0 +1,63 @@ +#! /usr/bin/env python3.3 + +# Copyright 2014 Claudio Moretti +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# +# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py +# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff +# + +import csv +import xml.etree.ElementTree as etree + +# Variables and constants +sitesList = [] + +# Functions +def ruleLookup(target): + try: # list.index(value) throus an exception for a "not found", so if it throws it, it's not found + sitesList.index(target) + return 1 + except: + return 0 + +# Handles reading the Alexa Top 1M and pushing all sites in a list +sitesReader = csv.reader(open('top-1m.csv'), delimiter=',', quotechar='"') +for row in sitesReader: + try: + # Since some Alexa sites are not FQDNs, split where there's a "/" and keep ony the first part + siteFQDN = sitesList.append(row[1].split("/",1)[0]) + + except csv.Error as e: + sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) + +# TODO: Somebody needs to write a function that generates a diff from the STABLE and UNSTABLE branch +# I'll go manually with `git diff --name-status master..remotes/origin/stable src/chrome/content/rules` and call the file 
"newRules.diff" +rulesList = open('newRules.diff', 'r') +for line in rulesList: + try: + # Split into "file mode in commit + file path" + ruleFile = line.split() + found = 0 + # If file mode is "A" (add) + if ruleFile[0] == "A": #If file was "added", parse + ruleText = etree.parse(ruleFile[1]) + for target in ruleText.findall('target'): + FQDN = target.get('host') # URL of the website + if ruleLookup(FQDN) == 1: # Look it up in the sitesList + found = 1 + break + # If found, print it + if found == 1: + print("FOUND: ", ruleFile[1]) + # else ignore + # There are some problems with file name encoding. So, for now, just print an error and pass + except FileNotFoundError: # Won't happen before line.split() is invoked + print("File not found:", ruleFile[1]) + pass + + From 77ae03ec875005e8c8a1e7753a407dc1c9350749 Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Wed, 15 Jan 2014 20:57:42 +0000 Subject: [PATCH 2/7] Changed name to make it clearer --- utils/merger.py | 63 ------------------------------------------------- 1 file changed, 63 deletions(-) delete mode 100755 utils/merger.py diff --git a/utils/merger.py b/utils/merger.py deleted file mode 100755 index 57f8e4ad0075..000000000000 --- a/utils/merger.py +++ /dev/null @@ -1,63 +0,0 @@ -#! /usr/bin/env python3.3 - -# Copyright 2014 Claudio Moretti -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. 
- -# -# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py -# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff -# - -import csv -import xml.etree.ElementTree as etree - -# Variables and constants -sitesList = [] - -# Functions -def ruleLookup(target): - try: # list.index(value) throus an exception for a "not found", so if it throws it, it's not found - sitesList.index(target) - return 1 - except: - return 0 - -# Handles reading the Alexa Top 1M and pushing all sites in a list -sitesReader = csv.reader(open('top-1m.csv'), delimiter=',', quotechar='"') -for row in sitesReader: - try: - # Since some Alexa sites are not FQDNs, split where there's a "/" and keep ony the first part - siteFQDN = sitesList.append(row[1].split("/",1)[0]) - - except csv.Error as e: - sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) - -# TODO: Somebody needs to write a function that generates a diff from the STABLE and UNSTABLE branch -# I'll go manually with `git diff --name-status master..remotes/origin/stable src/chrome/content/rules` and call the file "newRules.diff" -rulesList = open('newRules.diff', 'r') -for line in rulesList: - try: - # Split into "file mode in commit + file path" - ruleFile = line.split() - found = 0 - # If file mode is "A" (add) - if ruleFile[0] == "A": #If file was "added", parse - ruleText = etree.parse(ruleFile[1]) - for target in ruleText.findall('target'): - FQDN = target.get('host') # URL of the website - if ruleLookup(FQDN) == 1: # Look it up in the sitesList - found = 1 - break - # If found, print it - if found == 1: - print("FOUND: ", ruleFile[1]) - # else ignore - # There are some problems with file name encoding. 
So, for now, just print an error and pass - except FileNotFoundError: # Won't happen before line.split() is invoked - print("File not found:", ruleFile[1]) - pass - - From 14fb7690d0dc78cc087055543d942a9a5300657d Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Wed, 15 Jan 2014 22:14:16 +0000 Subject: [PATCH 3/7] Fixup commit For some reason, the file was not committed correctly. Re-doing the commit- --- utils/alexa-ruleset-checker.py | 63 ++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100755 utils/alexa-ruleset-checker.py diff --git a/utils/alexa-ruleset-checker.py b/utils/alexa-ruleset-checker.py new file mode 100755 index 000000000000..57f8e4ad0075 --- /dev/null +++ b/utils/alexa-ruleset-checker.py @@ -0,0 +1,63 @@ +#! /usr/bin/env python3.3 + +# Copyright 2014 Claudio Moretti +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# +# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py +# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff +# + +import csv +import xml.etree.ElementTree as etree + +# Variables and constants +sitesList = [] + +# Functions +def ruleLookup(target): + try: # list.index(value) throus an exception for a "not found", so if it throws it, it's not found + sitesList.index(target) + return 1 + except: + return 0 + +# Handles reading the Alexa Top 1M and pushing all sites in a list +sitesReader = csv.reader(open('top-1m.csv'), delimiter=',', quotechar='"') +for row in sitesReader: + try: + # Since some Alexa sites are not FQDNs, split where there's a "/" and keep ony the first part + siteFQDN = sitesList.append(row[1].split("/",1)[0]) + + except csv.Error as e: + sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) + +# TODO: Somebody needs to write a function that generates a diff from the STABLE and UNSTABLE branch +# I'll go manually with `git diff --name-status master..remotes/origin/stable src/chrome/content/rules` and call the file "newRules.diff" +rulesList = open('newRules.diff', 'r') +for line in rulesList: + try: + # Split into "file mode in commit + file path" + ruleFile = line.split() + found = 0 + # If file mode is "A" (add) + if ruleFile[0] == "A": #If file was "added", parse + ruleText = etree.parse(ruleFile[1]) + for target in ruleText.findall('target'): + FQDN = target.get('host') # URL of the website + if ruleLookup(FQDN) == 1: # Look it up in the sitesList + found = 1 + break + # If found, print it + if found == 1: + print("FOUND: ", ruleFile[1]) + # else ignore + # There are some problems with file name encoding. 
So, for now, just print an error and pass + except FileNotFoundError: # Won't happen before line.split() is invoked + print("File not found:", ruleFile[1]) + pass + + From 7936fcdc1b59cd9800a7552e149f3adc8fcacc9c Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Sat, 18 Jan 2014 01:32:59 +0000 Subject: [PATCH 4/7] Added 'git diff' generation Used Python subprocess.call() to automatically generate the git diff and save it in /tmp/ with a random filename --- utils/alexa-ruleset-checker.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/utils/alexa-ruleset-checker.py b/utils/alexa-ruleset-checker.py index 57f8e4ad0075..df58197cf1df 100755 --- a/utils/alexa-ruleset-checker.py +++ b/utils/alexa-ruleset-checker.py @@ -13,9 +13,12 @@ import csv import xml.etree.ElementTree as etree +import subprocess +import random # Variables and constants sitesList = [] +tmpRulesFileName = "/tmp/rulesDiff-" + format(random.randrange(1,65535)) # Feel free to enlarge if needed # Functions def ruleLookup(target): @@ -25,6 +28,9 @@ def ruleLookup(target): except: return 0 +# Fetch the Alexa Top 1M + + # Handles reading the Alexa Top 1M and pushing all sites in a list sitesReader = csv.reader(open('top-1m.csv'), delimiter=',', quotechar='"') for row in sitesReader: @@ -35,16 +41,22 @@ def ruleLookup(target): except csv.Error as e: sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) -# TODO: Somebody needs to write a function that generates a diff from the STABLE and UNSTABLE branch -# I'll go manually with `git diff --name-status master..remotes/origin/stable src/chrome/content/rules` and call the file "newRules.diff" -rulesList = open('newRules.diff', 'r') +# `git diff` the master revision against stable, rules folder only +try: + tmpRulesFile = open(tmpRulesFileName,"w") + subprocess.call(['git', 'diff', '--name-status', 'master..remotes/origin/stable', '../src/chrome/content/rules'], stdout=tmpRulesFile) + tmpRulesFile.close() 
+except OSError as e: + sys.exit('An OSError exception was raised: %s' % (e)) + +rulesList = open(tmpRulesFileName, 'r') for line in rulesList: try: # Split into "file mode in commit + file path" ruleFile = line.split() found = 0 # If file mode is "A" (add) - if ruleFile[0] == "A": #If file was "added", parse + if ruleFile[0] == "A": # If file was "added", parse ruleText = etree.parse(ruleFile[1]) for target in ruleText.findall('target'): FQDN = target.get('host') # URL of the website @@ -60,4 +72,5 @@ def ruleLookup(target): print("File not found:", ruleFile[1]) pass - +# Close the rules file +rulesList.close() From cf80a67410035030228e3bf56b376888b3492c55 Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Sat, 18 Jan 2014 17:16:10 +0000 Subject: [PATCH 5/7] Alexa ruleset checker working The ruleset checker seems to be working: it downloads and unzips the Alexa Top1M and automatically generates the git diff. Comparing the two seems to be working as well. Manually checked some rules: they seem to have been correctly identified as in the Top 1M and not in stable --- utils/alexa-ruleset-checker.py | 88 ++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/utils/alexa-ruleset-checker.py b/utils/alexa-ruleset-checker.py index df58197cf1df..26e959e4f673 100755 --- a/utils/alexa-ruleset-checker.py +++ b/utils/alexa-ruleset-checker.py @@ -1,25 +1,61 @@ #! /usr/bin/env python3.3 -# Copyright 2014 Claudio Moretti +# Copyright 2014 Claudio Moretti # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. 
# -# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py -# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff +# This little piece of software works by downloading the Alexa Top 1M website list, which freely available, +# then it uses `git diff` to generate a list of XML ruleset files that are in the master branch but not in stable. +# Finally, it compares the two and prints the file name and path of every ruleset file that +# a) is in master but not in stable and +# b) has a target in the Alexa Top1M list # +import sys import csv import xml.etree.ElementTree as etree import subprocess import random +import urllib.request +import urllib.error +import zipfile +import os +import time # Variables and constants sitesList = [] + +# Temporary file containing the `git diff` between master and stable tmpRulesFileName = "/tmp/rulesDiff-" + format(random.randrange(1,65535)) # Feel free to enlarge if needed +# URL of the Alexa Top1M +alexaTop1MURL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" +# alexaTop1MURL = "http://127.0.0.1/top-1m.csv.zip" + +# Temporary file name, to aboid conflicts +tmpAlexaFileName = "/tmp/alexa-top1M-" + format(random.randrange(1,65535)) + ".csv" + +# Logfile. 
Records the same output as the script (FOUND and "File not found" messages) +logFileName = "/tmp/alexa-ruleset-log-" + format(random.randrange(1,65535)) + ".log" + +# Filename of the CSV file contained in the Alexa zipfile +tmpAlexaZipFileContents = 'top-1m.csv' + +# Absolute path of the git repo (the folder containing src/) +# Remember to change this accordingly to your system, if you ever move the script +# +# By default, it refers to the parent directory of the one containing the script +# because the script was put in utils/ +# +# __NEEDS A TRAILING SLASH__ +# +# gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) +gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) + "/" + + # Functions def ruleLookup(target): try: # list.index(value) throus an exception for a "not found", so if it throws it, it's not found @@ -28,28 +64,55 @@ def ruleLookup(target): except: return 0 -# Fetch the Alexa Top 1M +# Fetch the Alexa Top 1M - http://stackoverflow.com/questions/1517616/stream-large-binary-files-with-urllib2-to-file +try: + print("Retrieving Alexa Top1M from", alexaTop1MURL) + tmpAlexaZipFileName, headers = urllib.request.urlretrieve(alexaTop1MURL) + print("File downloaded and stored in %s" % tmpAlexaZipFileName) +except urllib.error.URLError as e: + print("Failed to download Alexa Top 1M") + sys.exit('Error message: %s' % e) + +# Now unzip it +try: + # Extract in /tmp/ + print("Start extracting %s" % tmpAlexaZipFileName) + tmpAlexaZipFile = zipfile.ZipFile(tmpAlexaZipFileName,'r') + tmpAlexaZipFile.extractall('/tmp/') +except zipfile.BadZipfile: + sys.exit("The zip file %s is corrupted.",tmpAlexaZipFileName) +try: + # Rename the file to match the file with the random in it + os.rename('/tmp/' + tmpAlexaZipFileContents,tmpAlexaFileName) + print("Alexa Top1M retrieved and stored in %s" % tmpAlexaFileName) +except OSError as e: + print("Failed to rename /tmp/top-1M.csv to %s." 
% (tmpAlexaFileName)) + sys.exit('Error message: %s' % (e)) # Handles reading the Alexa Top 1M and pushing all sites in a list -sitesReader = csv.reader(open('top-1m.csv'), delimiter=',', quotechar='"') +sitesReader = csv.reader(open(tmpAlexaFileName), delimiter=',', quotechar='"') for row in sitesReader: try: # Since some Alexa sites are not FQDNs, split where there's a "/" and keep ony the first part siteFQDN = sitesList.append(row[1].split("/",1)[0]) - except csv.Error as e: - sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) + sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e)) # `git diff` the master revision against stable, rules folder only try: + print("Create git diff between master and stable in %s" % tmpRulesFileName) tmpRulesFile = open(tmpRulesFileName,"w") - subprocess.call(['git', 'diff', '--name-status', 'master..remotes/origin/stable', '../src/chrome/content/rules'], stdout=tmpRulesFile) + #subprocess.call(['git', 'diff', '--name-status', 'master..remotes/origin/stable', '../src/chrome/content/rules'], stdout=tmpRulesFile) + subprocess.call(['git', 'diff', '--name-status', 'remotes/origin/stable..master', '../src/chrome/content/rules'], stdout=tmpRulesFile) tmpRulesFile.close() except OSError as e: sys.exit('An OSError exception was raised: %s' % (e)) rulesList = open(tmpRulesFileName, 'r') +logFile = open(logFileName,'w') +logFile.write("Log file generated on %s.\nPaths are relative to the root directory of the git repo.\n\n" % time.strftime("%Y-%m-%d %H:%M:%S")) + for line in rulesList: try: # Split into "file mode in commit + file path" @@ -57,7 +120,7 @@ def ruleLookup(target): found = 0 # If file mode is "A" (add) if ruleFile[0] == "A": # If file was "added", parse - ruleText = etree.parse(ruleFile[1]) + ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../' IF YOU MOVE THE SCRIPT for target in ruleText.findall('target'): FQDN = target.get('host') # URL of the 
website if ruleLookup(FQDN) == 1: # Look it up in the sitesList @@ -66,11 +129,16 @@ def ruleLookup(target): # If found, print it if found == 1: print("FOUND: ", ruleFile[1]) + logFile.write("FOUND: %s\n" % ruleFile[1]) # else ignore # There are some problems with file name encoding. So, for now, just print an error and pass - except FileNotFoundError: # Won't happen before line.split() is invoked + except FileNotFoundError as e: # Won't happen before line.split() is invoked print("File not found:", ruleFile[1]) +# logFile.write ("File not found: %s\n" % ruleFile[1]) + logFile.write("%s\n" % e) pass # Close the rules file rulesList.close() +# And the log file +logFile.close() From 6f56e58ef392f25fc5f38806fabfef6e84833cb0 Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Sat, 1 Feb 2014 14:54:54 +0000 Subject: [PATCH 6/7] Added rule limit and recognition of edited rules Rule limit was implemented by comparing sitesReader.line_num against maxSitesNumber and breaking the loop when it's hit. Implemented recognition of edited rules via the "M" flag of git diff. Output was tidied up a bit to account for the different wording (used tabulation) --- utils/alexa-ruleset-checker.py | 46 ++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/utils/alexa-ruleset-checker.py b/utils/alexa-ruleset-checker.py index 26e959e4f673..022df708ef0b 100755 --- a/utils/alexa-ruleset-checker.py +++ b/utils/alexa-ruleset-checker.py @@ -38,7 +38,7 @@ # Temporary file name, to aboid conflicts tmpAlexaFileName = "/tmp/alexa-top1M-" + format(random.randrange(1,65535)) + ".csv" -# Logfile. 
Records the same output as the script logFileName = "/tmp/alexa-ruleset-log-" + format(random.randrange(1,65535)) + ".log" # Filename of the CSV file contained in the Alexa zipfile @@ -55,6 +55,9 @@ # gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) + "/" +# Maximum number of websites to use in the Alexa Top 1M (i.e. it's no longer 1M but maxSitesNumber) +# Set to -1 for 'unlimited' +maxSitesNumber = 1000 # Functions def ruleLookup(target): @@ -96,6 +99,9 @@ def ruleLookup(target): try: # Since some Alexa sites are not FQDNs, split where there's a "/" and keep ony the first part siteFQDN = sitesList.append(row[1].split("/",1)[0]) + # print("Line %s: %s" % (sitesReader.line_num, sitesList[len(sitesList) - 1])) # Outputs the current line + if sitesReader.line_num == maxSitesNumber: + break except csv.Error as e: sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e)) @@ -113,23 +119,38 @@ def ruleLookup(target): logFile = open(logFileName,'w') logFile.write("Log file generated on %s.\nPaths are relative to the root directory of the git repo.\n\n" % time.strftime("%Y-%m-%d %H:%M:%S")) +# Let's keep track of how many rules were added and how many were modified +# Must be declared here or won't be available at the end of the loop +countAddedRules = 0 +countEditedRules = 0 + +# Start parsing the list for line in rulesList: try: # Split into "file mode in commit + file path" ruleFile = line.split() found = 0 - # If file mode is "A" (add) - if ruleFile[0] == "A": # If file was "added", parse - ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../' IF YOU MOVE THE SCRIPT + # If file mode is "A" (add) or "M" (edited) + if ruleFile[0] == "A" or ruleFile[0] == "M": # If file was added or edited between stable and master, parse + ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../') IF YOU 
MOVE THE SCRIPT - XXX: Obsolete warning? for target in ruleText.findall('target'): FQDN = target.get('host') # URL of the website if ruleLookup(FQDN) == 1: # Look it up in the sitesList - found = 1 - break - # If found, print it - if found == 1: - print("FOUND: ", ruleFile[1]) - logFile.write("FOUND: %s\n" % ruleFile[1]) + # Message different according to file mode + if ruleFile[0] == "A": # New + found = "NEW" + countAddedRules = countAddedRules + 1 + break + elif ruleFile[0] == "M": # Edited + found = "EDITED" + countEditedRules = countEditedRules + 1 + break + + # If found, print it TABULATED + if found != 0: + print("%s:\t%s" % (found, ruleFile[1])) + logFile.write("%s:\t%s" % (found, ruleFile[1])) + # else ignore # There are some problems with file name encoding. So, for now, just print an error and pass except FileNotFoundError as e: # Won't happen before line.split() is invoked @@ -138,6 +159,11 @@ def ruleLookup(target): logFile.write("%s\n" % e) pass +# Print our simple statistics +print("\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (maxSitesNumber, countAddedRules, countEditedRules)) +logFile.write("\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (maxSitesNumber, countAddedRules, countEditedRules)) +print("\n\nLog file can be found at %s" % logFileName) + # Close the rules file rulesList.close() # And the log file From 421be4be46e040201c09f378579d72778ee947e2 Mon Sep 17 00:00:00 2001 From: Claudio Moretti Date: Tue, 11 Feb 2014 23:33:59 +0000 Subject: [PATCH 7/7] Edited to raise IOError for Python < 3.3.4 In Python < 3.3.4 the FileNotFound exception is not present. 
As some filenames have weird encodings, and it's too problematic to fix the code to consider those, the FileNotFound error approach has been mirrored to work with Python < 3.3.4 --- utils/alexa-ruleset-checker.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/utils/alexa-ruleset-checker.py b/utils/alexa-ruleset-checker.py index 022df708ef0b..de5374b1aa0b 100755 --- a/utils/alexa-ruleset-checker.py +++ b/utils/alexa-ruleset-checker.py @@ -32,8 +32,8 @@ tmpRulesFileName = "/tmp/rulesDiff-" + format(random.randrange(1,65535)) # Feel free to enlarge if needed # URL of the Alexa Top1M -alexaTop1MURL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" -# alexaTop1MURL = "http://127.0.0.1/top-1m.csv.zip" +# alexaTop1MURL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" +alexaTop1MURL = "http://127.0.0.1/top-1m.csv.zip" # Temporary file name, to aboid conflicts tmpAlexaFileName = "/tmp/alexa-top1M-" + format(random.randrange(1,65535)) + ".csv" @@ -132,7 +132,8 @@ def ruleLookup(target): found = 0 # If file mode is "A" (add) or "M" (edited) if ruleFile[0] == "A" or ruleFile[0] == "M": # If file was added or edited between stable and master, parse - ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../') IF YOU MOVE THE SCRIPT - XXX: Obsolete warning? + ruleFileObject= open(gitRepositoryPath + ruleFile[1]) + ruleText = etree.parse(ruleFileObject) # ADJUST FILE PATH (here is '../') IF YOU MOVE THE SCRIPT - XXX: Obsolete warning? 
for target in ruleText.findall('target'): FQDN = target.get('host') # URL of the website if ruleLookup(FQDN) == 1: # Look it up in the sitesList @@ -158,6 +159,13 @@ def ruleLookup(target): # logFile.write ("File not found: %s\n" % ruleFile[1]) logFile.write("%s\n" % e) pass + except IOError as ioe: #Treated same as FileNotFoundError + print("File not found:", ruleFile[1]) +# logFile.write ("File not found: %s\n" % ruleFile[1]) + logFile.write("%s\n" % ioe) + pass + + + # Print our simple statistics print("\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (maxSitesNumber, countAddedRules, countEditedRules))