Skip to content

Commit

Permalink
Merge pull request #36 from OussamaBeng/fix-versions-for-unmodified-files
Browse files Browse the repository at this point in the history

Fix the problem of versions for unmodified files
  • Loading branch information
bretfourbe committed Apr 10, 2024
2 parents a71817b + 22d5631 commit 4ced8a2
Show file tree
Hide file tree
Showing 11 changed files with 145 additions and 356 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ dist/*

# do not remove the dist folder
!dist/.keep

.idea/
build/*
2 changes: 1 addition & 1 deletion hashtheplanet/config/extensions_list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""
This file contains a Regular Expression of files to exclude, allowing HashThePlanet to take only interesting files
"""
EXCLUDED_FILE_PATTERN = '(test.|.*\\.(php|jsp|asp|aspx|db|yml|sh|sql|pl)$)'
EXCLUDED_FILE_PATTERN = '(test.|.*\\.(php|jsp|asp|aspx|db|yml|sh|sql|pl|mysql|inc|conf|class|module)$)'
5 changes: 2 additions & 3 deletions hashtheplanet/core/hashtheplanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,7 @@ def analyze_file(self, file_path: str) -> Tuple[str, dict]:
"""
Analyze a file and returns its technology and its versions
"""
file_hash = Hash.hash_file(file_path)

file_hash = Hash.calculate_git_hash(file_path)
if file_hash is None:
return (None, None)
return self.analyze_hash(file_hash)
Expand Down Expand Up @@ -207,7 +206,7 @@ def main():
hashtheplanet = HashThePlanet(args.output, args.input)

if args.file is not None:
readable_hash = Hash.hash_file(args.file)
readable_hash = Hash.calculate_git_hash(args.file)
if readable_hash is None:
return
hashtheplanet.find_hash(readable_hash)
Expand Down
127 changes: 20 additions & 107 deletions hashtheplanet/resources/git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@
import re
import subprocess
import tempfile
from stat import S_ISDIR, S_ISREG
from stat import S_ISDIR
from typing import List, Tuple

# third party imports
from git import GitCommandError, Repo
from git.diff import Diff, DiffIndex
from git.objects.commit import Commit
from git.refs.tag import Tag
from loguru import logger

# project imports
from hashtheplanet.sql.db_connector import Hash, Version as VersionTable
from hashtheplanet.sql.db_connector import Hash
from hashtheplanet.resources.resource import Resource
from hashtheplanet.config.extensions_list import EXCLUDED_FILE_PATTERN

Expand Down Expand Up @@ -91,71 +90,29 @@ def _hash_files(
os.chdir(current_dir)
return files_info

@staticmethod
def _get_changes_between_two_tags(tag_a: Tag, tag_b: Tag) -> List[GitFileMetadata]:
"""
This method fetches all changes (modification & creation) between two tags,
and returns a list of changes containing:
- the file path
- the associated tag name
- the associated blob hash
"""
files: List[GitFileMetadata] = []

commit_a = tag_a.commit
commit_b = tag_b.commit
commit_diff: DiffIndex = commit_a.diff(commit_b)

for diff in commit_diff:
diff: Diff = diff
if diff.a_blob and S_ISREG(diff.a_blob.mode):
match_ext = re.search(EXCLUDED_FILE_PATTERN, diff.a_blob.path)
if not match_ext:
files.append((diff.a_blob.path, tag_a.name, diff.a_blob.hexsha))
elif diff.b_blob and S_ISREG(diff.b_blob.mode):
match_ext = re.search(EXCLUDED_FILE_PATTERN, diff.b_blob.path)
if not match_ext:
files.append((diff.b_blob.path, tag_b.name, diff.b_blob.hexsha))
return files

def _get_diff_files(self, tags: List[Tag]) -> List[GitFileMetadata]:
"""
This method retrieves all changes between a list of tags and returns them.
"""
logger.info("Retrieving diff of all tags ...")
def _get_blob_hashes(self, tags : List[Tag]) -> List[FileMetadata]:

files: List[GitFileMetadata] = []

# This line makes couples with the n + 1 element. Example: (A,B), (B,C), ...
for (tag_a, tag_b) in zip(tags[:-1], tags[1:]):
files += self._get_changes_between_two_tags(tag_a, tag_b)
return files
for tag in tags:
tag_name = tag.name
commit = tag.commit

def _get_tag_files(self, tag: Tag) -> List[GitFileMetadata]:
"""
This method retrieves all files with their tag name and their blob hash in a tag.
"""
files: List[GitFileMetadata] = []
for item in commit.tree.traverse():
if item.type == 'blob':
file_path = item.path
file_hash = item.hexsha
match_ext = re.search(EXCLUDED_FILE_PATTERN, file_path)
if not match_ext:
files.append((file_path, tag_name, file_hash))

for (file_path, blob_hash) in self.get_all_files_from_commit(tag.commit):
match_ext = re.search(EXCLUDED_FILE_PATTERN, file_path)
if not match_ext:
files.append((file_path, tag.name, blob_hash))
return files

@staticmethod
def _get_diff_versions(first_version: str, last_version: str, tags: List[Tag]) -> List[str]:
"""
This method retrieves all tags between two tags.
"""
tag_names = list(map(lambda tag: tag.name, tags))
return tag_names[tag_names.index(first_version):tag_names.index(last_version)]

def _save_hashes(
self,
session_scope,
files_info: List[FileMetadata],
tags: List[Tag],
technology: str
):
"""
Expand All @@ -164,50 +121,17 @@ def _save_hashes(
with session_scope() as session:
file_record = {}

self._database.insert_versions(session, technology, tags)
for (file_path, tag_name, file_hash) in files_info:
(last_version, last_hash) = file_record.get(file_path) or (None, None)

self._database.insert_file(session, technology, file_path)

if last_version is not None:
# We retrieve all the versions between the last version of the file and this one
# and then we add them to the last hash
versions = self._get_diff_versions(last_version, tag_name, tags)
self._database.insert_or_update_hash(session, last_hash, technology, versions)

self._database.insert_or_update_hash(session, file_hash, technology, [tag_name])
self._database.insert_or_update_hash(session, file_path, file_hash, technology, [tag_name])
file_record[file_path] = (tag_name, file_hash)

@staticmethod
def _filter_stored_tags(stored_versions: List[VersionTable], found_tags: List[Tag]) -> List[Tag]:
"""
This function will compare the stored tags (the tags in the htp database)
and the tags found in the git repository, then after it keeps only the non stored tags.
"""
result = []

if len(stored_versions) == len(found_tags):
return []
for found_tag_idx, found_tag in enumerate(found_tags):
last_found_tag_idx = found_tag_idx - 1

if found_tag_idx >= len(stored_versions) or found_tag.name != stored_versions[found_tag_idx]:

# this verification permits to know if it's the first to be added,
# and if it's the case, then we add the one before to permits to make a diff
if last_found_tag_idx >= 0 and not result:
result.append(found_tags[last_found_tag_idx])
result.append(found_tag)
return result

def compute_hashes(self, session_scope, target: str):
"""
This method clones the repository from url, retrieves tags, compares each tags to retrieve only modified files,
computes their hashes and then stores the tags & files information in the database.
This method clones the repository from url, retrieves tags, retrieves the hashes, and then stores the tags
& files information in the database.
"""
technology = target.split('.git')[0].split('/')[-1]
tags: List[Tag] = []
files: List[GitFileMetadata] = []

with tempfile.TemporaryDirectory() as tmp_dir_name:
Expand All @@ -220,21 +144,10 @@ def compute_hashes(self, session_scope, target: str):
logger.info("Retrieving tags ...")
tags = repo.tags.copy()

with session_scope() as session:
stored_tags = self._database.get_versions(session, technology)

if not stored_tags:
logger.info("Retrieving files from the first tag ...")
files += self._get_tag_files(tags[0])

logger.info("Filtering the tags ...")
tags = self._filter_stored_tags(stored_tags, tags)

logger.info("Retrieving only modified files between the tags ...")
files += self._get_diff_files(tags)
logger.info("Retrieving the hashes from the Git repository...")
files += self._get_blob_hashes(tags)

logger.info("Generating hashes ...")
files_info = self._hash_files(files, tmp_dir_name)
logger.info("== DONE ! ==")

logger.info("Saving hashes ...")
self._save_hashes(session_scope, files_info, tags, technology)
self._save_hashes(session_scope, files, technology)
2 changes: 1 addition & 1 deletion hashtheplanet/resources/npm_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _save_hashes(
match_ext = re.search(EXCLUDED_FILE_PATTERN, file_path)
if not match_ext:
self._database.insert_file(session, npm_module_name, file_path)
self._database.insert_or_update_hash(session, file_hash, npm_module_name, [version])
self._database.insert_or_update_hash(session, file_path, file_hash, npm_module_name, [version])

def compute_hashes(self, session_scope, target: str):
"""
Expand Down
47 changes: 44 additions & 3 deletions hashtheplanet/sql/db_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
"""
# standard imports
import hashlib
import os
from json import JSONEncoder, loads
from typing import List
from git import Repo

# third party imports
from loguru import logger
from sqlalchemy import JSON, Column, Text, select, update
from sqlalchemy import JSON, Column, Text, select, update, and_
from sqlalchemy.ext.declarative import declarative_base, declared_attr
from sqlalchemy.sql.sqltypes import Integer

Expand Down Expand Up @@ -52,6 +54,7 @@ def __tablename__(cls): # pylint: disable=no-self-argument
return cls.__name__.lower()

hash = Column(Text, nullable=False, primary_key=True)
file = Column(Text, nullable=False)
technology = Column(Text, nullable=False)
versions = Column(JSON, nullable=False)

Expand All @@ -69,6 +72,21 @@ def hash_file(file_path: str) -> str:
logger.error(f"Error with file {file_path} : {error}")
return None

@staticmethod
def calculate_git_hash(file_path: str) -> str:
"""
This method computes the Git SHA1 hash of the provided file and returns it.
"""
repo_path = os.path.dirname(os.path.abspath(file_path))
repo = Repo.init(repo_path) # Initialize a Git repository object
try:
with open(file_path, "rb"):
blob_hash = repo.git.hash_object(file_path) # Calculate the hash
return blob_hash
except OSError as error:
logger.error(f"Error with file {file_path} : {error}")
return None

@staticmethod
def hash_bytes(data: bytes) -> str:
"""
Expand Down Expand Up @@ -131,7 +149,7 @@ def insert_file(session, technology, path):
logger.debug(f"Entry {entry} already exists in files database")

@staticmethod
def insert_or_update_hash(session, hash_value: str, technology: str, versions: List[str]):
def insert_or_update_hash(session,file_name: str, hash_value: str, technology: str, versions: List[str]):
"""
Insert a new hash related to technology and version in hash table if it does not exist yet.
If it already exists, update related versions.
Expand All @@ -140,13 +158,14 @@ def insert_or_update_hash(session, hash_value: str, technology: str, versions: L
entry = session.execute(stmt).scalar_one_or_none()

if not entry:
new_hash = Hash(hash=hash_value, technology=technology, versions=JSONEncoder() \
new_hash = Hash(file = file_name, hash=hash_value, technology=technology, versions=JSONEncoder() \
.encode({"versions": versions}))
session.add(new_hash)
logger.debug(f"Entry {new_hash} added to hash database")
else:
existing_versions: List[str] = loads(entry.versions)["versions"]


for version in versions:
if version not in existing_versions:
existing_versions.append(version)
Expand All @@ -156,6 +175,28 @@ def insert_or_update_hash(session, hash_value: str, technology: str, versions: L
session.execute(stmt)
logger.debug(f"Entry {entry} updated with new versions {versions}")

@staticmethod
def insert_version_existing_files(session, file_name: str, old_version: str, new_version: str):
"""
Insert a new hash related to technology and version in hash table if it does not exist yet.
If it already exists, update related versions.
"""

stmt = select(Hash).filter(Hash.file ==file_name, Hash.versions.contains(str(old_version)))

entries = session.execute(stmt).scalars().all()
if entries:
for entry in entries:
existing_versions: List[str] = loads(entry.versions)["versions"]

if old_version in existing_versions:
existing_versions.append(new_version)
stmt = update(Hash).where(and_(Hash.file == file_name, Hash.versions.contains(old_version))) \
.values(versions=JSONEncoder().encode({"versions": existing_versions})) \
.execution_options(synchronize_session="fetch")
session.execute(stmt)
logger.debug(f"Entry {entry} updated with new versions {existing_versions}")

@staticmethod
def get_all_hashs(session):
"""
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ idna>=2.5,<4
urllib3>=1.21.1,<1.27
certifi>=2017.4.17
beautifulsoup4>=4.10.0
soupsieve>1.2
soupsieve>1.2
typing-extensions>=4.6.3
Loading

0 comments on commit 4ced8a2

Please sign in to comment.