# LOG6302A - Lab4

CESAR Pierre 2310298

MERSALI Bilal 2309771


### Imports


In [4]:
# Change execution directory for Google Colab use
import os
from code_analysis import CFG, CFGReader


# Partie 1: Extraction du DataFlow

In [5]:
class CFGVisitor:
    """Definition/reference dataflow analyses over a control-flow graph.

    A *definition* is a ``Variable`` node that is the left operand of a
    ``BinOP`` node whose image is ``=``.  Every other ``Variable`` node is a
    *reference*.  Both sets are computed once in the constructor, so the
    wrapped CFG is expected not to change afterwards.
    """

    def __init__(self, cfg: "CFG"):
        self.cfg = cfg
        # Order matters: all_refs() reads self.defs.
        self.defs = self.all_defs()
        self.refs = self.all_refs()

    def all_defs(self) -> set:
        """Return the ids of the ``Variable`` nodes assigned by an ``=`` BinOP."""
        cfg = self.cfg
        defs = set()
        for node in cfg.get_node_ids():
            if cfg.get_type(node) != "BinOP" or cfg.get_image(node) != "=":
                continue
            op_hands = cfg.get_op_hands(node)
            if not op_hands:
                # An assignment without operands has nothing to define.
                continue
            left_op_node = op_hands[0]
            if cfg.get_type(left_op_node) == "Variable":
                defs.add(left_op_node)
        return defs

    def all_refs(self) -> set:
        """Return the ids of the ``Variable`` nodes that are not definitions."""
        cfg = self.cfg
        return {
            node
            for node in cfg.get_node_ids()
            if cfg.get_type(node) == "Variable" and node not in self.defs
        }

    def def_in_node(self, node: int) -> int:
        """Return *node* if it is a definition, ``None`` otherwise."""
        if node in self.defs:
            return node
        return None

    def reaching_defs(self):
        """Compute which definitions may reach each node.

        Definitions are propagated along parent -> child edges (plus the
        ``get_call_begin`` link of a node when it has one) until nothing
        changes.  A definition kills every definition of the same variable
        name.

        Returns:
            ``(IN, OUT)``: two dicts mapping each node id to the set of
            definition node ids reaching the node's entry / exit.
        """
        cfg = self.cfg
        nodes = list(cfg.get_node_ids())

        # Group the definitions by variable name once, instead of rescanning
        # every node for every definition.
        defs_by_image = {}
        for node in nodes:
            if node in self.defs:
                defs_by_image.setdefault(cfg.get_image(node), set()).add(node)

        IN = {}
        OUT = {}
        GEN = {}
        KILL = {}
        for node in nodes:
            IN[node] = set()
            OUT[node] = set()
            if node in self.defs:
                GEN[node] = {node}
                # Shared between same-name definitions; never mutated below.
                KILL[node] = defs_by_image[cfg.get_image(node)]
            else:
                GEN[node] = set()
                KILL[node] = set()

        changes = True
        while changes:
            changes = False
            for node in nodes:
                for parent in cfg.get_parents(node):
                    IN[node] |= OUT[parent]
                call_begin = cfg.get_call_begin(node)
                if call_begin is not None:
                    IN[node] |= OUT[call_begin]
                new_out = GEN[node] | (IN[node] - KILL[node])
                if new_out != OUT[node]:
                    OUT[node] = new_out
                    changes = True

        return IN, OUT

    def ref_in_node(self, node: int) -> int:
        """Return *node* if it is a reference, ``None`` otherwise."""
        if node in self.refs:
            return node
        return None

    def reachable_refs(self):
        """Compute which references may be reached from each node.

        References are propagated backwards along child -> parent edges (plus
        the ``get_call_end`` link of a node when it has one) until nothing
        changes.  A definition kills every reference to the same variable
        name.

        Returns:
            ``(IN, OUT)``: two dicts mapping each node id to the set of
            reference node ids reachable from the node's entry / exit.
        """
        cfg = self.cfg
        nodes = list(cfg.get_node_ids())

        # Group the references by variable name once.
        refs_by_image = {}
        for node in nodes:
            if node in self.refs:
                refs_by_image.setdefault(cfg.get_image(node), set()).add(node)

        IN = {}
        OUT = {}
        GEN = {}
        KILL = {}
        for node in nodes:
            IN[node] = set()
            OUT[node] = set()
            GEN[node] = {node} if node in self.refs else set()
            if node in self.defs:
                # Shared between same-name definitions; never mutated below.
                KILL[node] = refs_by_image.get(cfg.get_image(node), set())
            else:
                KILL[node] = set()

        changes = True
        while changes:
            changes = False
            for node in nodes:
                for child in cfg.get_children(node):
                    OUT[node] |= IN[child]
                call_end = cfg.get_call_end(node)
                if call_end is not None:
                    OUT[node] |= IN[call_end]
                new_in = GEN[node] | (OUT[node] - KILL[node])
                if new_in != IN[node]:
                    IN[node] = new_in
                    changes = True

        return IN, OUT

    def get_var(self) -> set:
        """Return the set of images (names) of all ``Variable`` nodes."""
        cfg = self.cfg
        return {
            cfg.get_image(node)
            for node in cfg.get_node_ids()
            if cfg.get_type(node) == "Variable"
        }

    def get_nodes_number(self) -> int:
        """Return the number of nodes in the CFG."""
        return len(self.cfg.get_node_ids())

    def get_edges_number(self) -> int:
        """Return the number of parent -> child edges in the CFG."""
        cfg = self.cfg
        return sum(len(cfg.get_children(node)) for node in cfg.get_node_ids())

    def find_pair_ref_def(self) -> set:
        """Return the ``(reference, definition)`` pairs of a same variable
        where the definition is in the reference's reaching-definition OUT set.
        """
        get_image = self.cfg.get_image
        _, OUT_def = self.reaching_defs()
        pairs_ref_def = set()
        for ref in self.refs:
            var = get_image(ref)
            for definition in OUT_def[ref]:
                if get_image(definition) == var:
                    pairs_ref_def.add((ref, definition))
        return pairs_ref_def

    def find_pair_def_ref(self) -> set:
        """Return the ``(definition, reference)`` pairs of a same variable
        where the reference is in the definition's reachable-reference OUT set.
        """
        get_image = self.cfg.get_image
        _, OUT_ref = self.reachable_refs()
        pairs_def_ref = set()
        for definition in self.defs:
            var = get_image(definition)
            for ref in OUT_ref[definition]:
                if get_image(ref) == var:
                    pairs_def_ref.add((definition, ref))
        return pairs_def_ref

    def find_ref_not_defined(self) -> set:
        """Return the references that no definition of the same variable reaches."""
        get_image = self.cfg.get_image
        IN_def, _ = self.reaching_defs()
        ref_not_defined = set()
        for ref in self.refs:
            var = get_image(ref)
            if not any(get_image(definition) == var for definition in IN_def[ref]):
                ref_not_defined.add(ref)
        return ref_not_defined

    def find_def_not_referenced(self) -> set:
        """Return the definitions from which no reference to the same variable
        is reachable.
        """
        get_image = self.cfg.get_image
        _, OUT_ref = self.reachable_refs()
        def_not_referenced = set()
        for definition in self.defs:
            var = get_image(definition)
            if not any(get_image(ref) == var for ref in OUT_ref[definition]):
                def_not_referenced.add(definition)
        return def_not_referenced

    def find_prepare_query_nodes(self):
        """Return the list of ``CallBegin`` nodes whose image is ``prepare_query``."""
        cfg = self.cfg
        return [
            node
            for node in cfg.get_node_ids()
            if cfg.get_type(node) == "CallBegin"
            and cfg.get_image(node) == "prepare_query"
        ]

    def assert_def_comes_from_filter_var(self, node):
        """Tell whether the definition *node* is assigned a ``filter_var`` result.

        The first child of *node* must be a ``BinOP``.  Starting from that
        BinOP's second operand, ``RetValue`` nodes are followed through their
        first parent until a node whose image is ``filter_var`` is found.

        Returns:
            ``True`` if such a node is found, ``False`` in every other case,
            including a node without children, a BinOP with fewer than two
            operands, a ``RetValue`` without parents, or a cycle.
        """
        cfg = self.cfg
        if cfg.get_type(node) != "Variable":
            return False

        children = cfg.get_children(node)
        if not children:
            return False
        bin_op = children[0]
        if cfg.get_type(bin_op) != "BinOP":
            return False

        op_hands = cfg.get_op_hands(bin_op)
        if not op_hands or len(op_hands) < 2:
            return False

        node_to_look = op_hands[1]
        visited = set()  # guards against looping forever on a RetValue cycle
        while True:
            if cfg.get_image(node_to_look) == "filter_var":
                return True
            if cfg.get_type(node_to_look) != "RetValue" or node_to_look in visited:
                return False
            visited.add(node_to_look)
            parents = cfg.get_parents(node_to_look)
            if not parents:
                return False
            node_to_look = parents[0]

    def check_prepare_query_params_filtered(self):
        """Print a warning for each ``prepare_query`` argument that is not
        always filtered by ``filter_var``.

        Only ``Variable`` arguments are checked.  An argument is reported when
        at least one definition of the same variable reaching it fails
        ``assert_def_comes_from_filter_var``.

        NOTE(review): an argument with no reaching definition of the same
        variable is not reported; confirm this is the intended behaviour.
        """
        cfg = self.cfg
        prepare_query_nodes = self.find_prepare_query_nodes()
        if not prepare_query_nodes:
            return

        # The fixed point does not depend on the parameter: compute it once.
        defs_nodes_in, _ = self.reaching_defs()

        for node in prepare_query_nodes:
            for param in cfg.get_call_args(node):
                if cfg.get_type(param) != "Variable":
                    continue
                var = cfg.get_image(param)
                defs_nodes = [n for n in defs_nodes_in[param] if cfg.get_image(n) == var]
                if not all(self.assert_def_comes_from_filter_var(d) for d in defs_nodes):
                    print(f"Parameter {cfg.get_image(param)} of prepare_query at line {cfg.get_position(node)[0]} is not filtered or not filtered every time by filter_var")


# Partie 2 : Utilisation du Dataflow

## 2.1 Variable vive/morte


In [3]:
def find_ref_not_defined(cfg_visitor) -> set:
    """Collect the references that no same-named definition reaches.

    For each node in ``cfg_visitor.refs``, the IN set returned by
    ``cfg_visitor.reaching_defs()`` is searched for a definition whose image
    equals the reference's image; references without one are returned.
    """
    reaching_in, _ = cfg_visitor.reaching_defs()
    image_of = cfg_visitor.cfg.get_image
    undefined = set()
    for reference in cfg_visitor.refs:
        name = image_of(reference)
        # all() stops at the first matching definition, like an early break.
        if all(image_of(candidate) != name for candidate in reaching_in[reference]):
            undefined.add(reference)
    return undefined

def find_def_not_referenced(cfg_visitor) -> set:
    """Collect the definitions that no same-named reference follows.

    For each node in ``cfg_visitor.defs``, the OUT set returned by
    ``cfg_visitor.reachable_refs()`` is searched for a reference whose image
    equals the definition's image; definitions without one are returned.
    """
    _, reachable_out = cfg_visitor.reachable_refs()
    image_of = cfg_visitor.cfg.get_image
    unused = set()
    for definition in cfg_visitor.defs:
        name = image_of(definition)
        # all() stops at the first matching reference, like an early break.
        if all(image_of(candidate) != name for candidate in reachable_out[definition]):
            unused.add(definition)
    return unused

In [4]:
def visit_one_file(filename: str):
    """Read the CFG stored in *filename* and return a CFGVisitor wrapping it."""
    return CFGVisitor(CFGReader().read_cfg(filename))

def part2(directory: str) -> None:
    """Print undefined references and unused definitions for each CFG file.

    Every regular file of *directory* whose name ends with ``.php.cfg.json``
    is analysed; sub-directories are not explored.  Files are processed in
    name order so that the report does not depend on the order in which the
    filesystem lists them.

    Args:
        directory: path of the directory holding the ``.php.cfg.json`` files.
    """
    # The context manager releases the directory handle even if an analysis
    # below raises.
    with os.scandir(directory) as entries:
        cfg_files = sorted(
            (
                entry
                for entry in entries
                if entry.is_file() and entry.name.endswith(".php.cfg.json")
            ),
            key=lambda entry: entry.name,
        )

    for entry in cfg_files:
        visitor = visit_one_file(entry.path)
        print("For file:", entry.path)
        ref_not_defined = visitor.find_ref_not_defined()
        def_not_referenced = visitor.find_def_not_referenced()
        print("\tRéférences non définies:")
        for ref in ref_not_defined:
            print(f"\t\tVariable {visitor.cfg.get_image(ref)}, ligne {visitor.cfg.get_position(ref)[0]}")
        print("\tDéfinitions non référencées/utilisées:")
        for definition in def_not_referenced:
            print(f"\t\tVariable {visitor.cfg.get_image(definition)}, ligne {visitor.cfg.get_position(definition)[0]}")
        print("\n")

# Run part2 on the ../tp4/part_2/ directory (path relative to the working directory).
part2("../tp4/part_2/")

For file: ../tp4/part_2/file3.php.cfg.json
	Références non définies:
		Variable this, ligne 1917
		Variable this, ligne 1917
		Variable this, ligne 2620
		Variable value, ligne 3293
		Variable field, ligne 2620
		Variable value, ligne 3293
		Variable this, ligne 2621
		Variable field, ligne 2621
		Variable string, ligne 1239
		Variable data, ligne 2624
		Variable field, ligne 2624
		Variable string, ligne 1253
		Variable data, ligne 2627
		Variable value, ligne 3295
		Variable data, ligne 2640
		Variable table, ligne 2640
		Variable string, ligne 1254
		Variable data, ligne 2641
		Variable field, ligne 2641
		Variable this, ligne 1930
		Variable value, ligne 2641
		Variable value, ligne 3296
		Variable this, ligne 1258
		Variable value, ligne 2642
		Variable show_errors, ligne 65
		Variable this, ligne 1259
		Variable this, ligne 1933
		Variable suppress_errors, ligne 74
		Variable value, ligne 3296
		Variable value, ligne 2642
		Variable last_error, ligne 83
		Variable value, ligne 26

Nos résultats comportent un certain nombre d'anomalies :

*   Paramètres de fonction: En considérant uniquement les définitions de la forme (Variable = Expression ∧ Literal), les paramètres d'une fonction sont considérés comme des références non définies (on le voit facilement dans l'analyse du fichier `part_2/file1.php` avec la variable `filename`). Cela entraîne des faux positifs dans les références non définies.
*   Attribut de classe: Avec notre implémentation, nous ne considérons pas les attributs de classe (exemple: `$this->ready`) comme une seule variable (car il y a en réalité 3 noeuds). Cela entraîne des bugs : dans l'exemple précédent, `this` et `ready` sont considérées comme 2 variables indépendantes et ne sont donc pas correctement comptabilisées parmi les références et les définitions.
*   Élément d'un tableau: Pour les mêmes raisons que les attributs de classe, les éléments de tableau (exemple: `$value['charset']`) ne sont pas correctement comptabilisés. Mais ce cas est encore plus complexe, car il est aussi possible de faire référence à l'élément d'un tableau sans l'appeler par son index ou sa clé (exemple: `foreach ( $data as $col => $value )`).
*   Variable à l'intérieur d'une chaîne de caractères: Certaines variables sont utilisées dans des chaînes de caractères (exemple ligne 2523 du fichier `part_2/file3.php`: ```"DELETE FROM `$table` WHERE $conditions"```) et ces variables ne sont pas associées à un noeud `Variable`, donc ces références ne sont pas détectées. Cela entraîne des faux positifs dans les définitions non référencées.


Mais l'algorithme détecte tout de même des définitions non utilisées parmi les fichiers du répertoire `part_2/` (liste non exhaustive):
*   `fp` dans `part_2/file1.php`
*   `querry` dans `part_2/file3.php` (ligne 3464)

## 2.2 Filtration des données utilisateur

In [9]:
def analyze_part3(directory):
    """Report unfiltered ``prepare_query`` arguments for each CFG file.

    Every regular file of *directory* whose name ends with ``.php.cfg.json``
    is loaded and checked with
    ``CFGVisitor.check_prepare_query_params_filtered``, which prints its
    findings.  Files are processed in name order so that the report does not
    depend on the order in which the filesystem lists them.

    Args:
        directory: path of the directory holding the ``.php.cfg.json`` files.
    """
    # The context manager releases the directory handle even if an analysis
    # below raises.
    with os.scandir(directory) as entries:
        cfg_files = sorted(
            (
                entry
                for entry in entries
                if entry.is_file() and entry.name.endswith(".php.cfg.json")
            ),
            key=lambda entry: entry.name,
        )

    for entry in cfg_files:
        reader = CFGReader()
        cfg = reader.read_cfg(entry.path)
        visitor = CFGVisitor(cfg)
        visitor.check_prepare_query_params_filtered()

# Run analyze_part3 on the ../tp4/part_3/ directory (path relative to the working directory).
analyze_part3("../tp4/part_3/")