# Importing Libraries & Templates

In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys

import numpy
from sklearn import naive_bayes
from sklearn import metrics

sys.path.append('/gspan_mining')
from gspan_mining import gSpan
from gspan_mining import GraphDatabase

# Classes

In [6]:
class PatternGraphs:
    """
    Base template describing a mining task for the gSpan implementation.

    Concrete tasks are expected to subclass this template and override
    `store` and `prune`; the versions here only print a reminder.
    """

    def __init__(self, database):
        """
        :param database: a graph database instance holding the data for the problem.
        """
        self.database = database
        # Groups of graph identifiers (e.g. classes, training/test sets).
        # The gid_subsets argument passed to `prune` and `store` holds, for each of
        # these groups, the occurrences in which the examined pattern is present.
        self.gid_subsets = []

    def store(self, dfs_code, gid_subsets):
        """
        Hook executed to record a pattern, if desired.

        Only called for patterns that have not been pruned. In correlated pattern
        mining one may prune on confidence and still check further conditions here
        before actually keeping the pattern.
        :param dfs_code: the dfs code of the pattern (as a string).
        :param gid_subsets: the cover (graph ids containing the pattern) for each subset in self.gid_subsets.
        """
        reminder = "Please implement the store function in a subclass for a specific mining task!"
        print(reminder)

    def prune(self, gid_subsets):
        """
        Hook used by the gSpan algorithm to decide whether a pattern (and its
        children in the search tree) should be pruned.
        :param gid_subsets: a list with the cover of the pattern for each subset.
        :return: subclasses return true to prune and false otherwise; this template
                 only prints a reminder and returns None.
        """
        reminder = "Please implement the prune function in a subclass for a specific mining task!"
        print(reminder)
        
class FrequentPositiveGraphs(PatternGraphs):
    """
    Mining task that keeps every pattern whose support reaches minsup.

    Support is measured as the combined size of the first two covers handed to
    `prune` (the positive and negative subsets when the task is built with
    subsets = [pos_ids, neg_ids]). The class can also turn the stored patterns
    into one feature matrix per subset.
    """

    def __init__(self, minsup, database, subsets):
        """
        Set up the task.
        :param minsup: the minimum support, compared against the summed size of the first two covers
        :param database: the graph database
        :param subsets: the subsets (train and/or test sets for positive and negative class) of graph ids.
        """
        super().__init__(database)
        self.gid_subsets = subsets
        self.minsup = minsup
        # Patterns kept so far: (dfs code, list with the pattern's cover for each subset).
        self.patterns = []

    def store(self, dfs_code, gid_subsets):
        """Record a pattern that was not pruned, together with its per-subset covers."""
        self.patterns.append((dfs_code, gid_subsets))

    def prune(self, gid_subsets):
        """
        Tell gSpan whether the pattern is too infrequent to keep exploring.
        :param gid_subsets: the cover of the pattern for each subset; only the first two are counted.
        :return: True when the combined size of the first two covers is below minsup.
        """
        support = len(gid_subsets[0]) + len(gid_subsets[1])
        return support < self.minsup

    def create_fm_col(self, all_gids, subset_gids):
        """
        Build one feature-matrix column.
        :param all_gids: the graph ids of a subset, in row order.
        :param subset_gids: the graph ids in which the pattern occurs.
        :return: a list with 1 where the id in all_gids is covered and 0 elsewhere.
        """
        covered = set(subset_gids)  # set membership keeps the scan linear
        return [1 if gid in covered else 0 for gid in all_gids]

    def get_feature_matrices(self):
        """
        Build a feature matrix for each subset of examples.
        :return: one numpy array per subset; columns correspond to stored patterns
                 and rows to the examples of that subset.
        """
        columns_per_subset = [[] for _ in self.gid_subsets]
        for _dfs_code, covers in self.patterns:
            for position, cover in enumerate(covers):
                column = self.create_fm_col(self.gid_subsets[position], cover)
                columns_per_subset[position].append(column)
        # Columns were collected pattern by pattern, so transpose to get examples as rows.
        return [numpy.array(columns).T for columns in columns_per_subset]


In [11]:
def _select_top_k(scored, k):
    """
    Select the patterns that belong to the k best distinct (confidence, support) ranks.
    :param scored: list of (pattern, confidence, support) tuples, in discovery order.
    :param k: number of distinct (confidence, support) ranks to keep; values <= 0 keep nothing.
    :return: the selected tuples, ordered by confidence and then support, both descending.
    """
    # Rank on the full (confidence, support) pair. Ranking on confidence alone leaves pairs
    # that share a confidence in set-iteration order, which makes the top-k cut arbitrary.
    ranks = sorted({(confidence, support) for _, confidence, support in scored}, reverse=True)
    kept = set(ranks[:max(k, 0)])
    selected = [entry for entry in scored if (entry[1], entry[2]) in kept]
    # list.sort is stable, so patterns with an identical pair keep their discovery order.
    selected.sort(key=lambda entry: (entry[1], entry[2]), reverse=True)
    return selected


def task1(database_file_name_pos, database_file_name_neg, k, minsup):
    """
    Mine the frequent patterns with gSpan and print those in the top-k (confidence, support) ranks.

    Confidence is pos_support / (pos_support + neg_support) and support is
    pos_support + neg_support. Every pattern whose (confidence, support) pair is one
    of the k best distinct pairs is printed as "<pattern> <confidence> <support>",
    best first.
    :param database_file_name_pos: path of the file with the positive graphs.
    :param database_file_name_neg: path of the file with the negative graphs.
    :param k: number of distinct (confidence, support) ranks to report.
    :param minsup: minimum support handed to the mining task.
    :raises SystemExit: when one of the two files does not exist.
    """
    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    pos_ids = graph_database.read_graphs(database_file_name_pos)  # Reading positive graphs, adding them to database and getting ids
    neg_ids = graph_database.read_graphs(database_file_name_neg)  # Reading negative graphs, adding them to database and getting ids

    subsets = [pos_ids, neg_ids]  # The ids for the positive and negative labelled graphs in the database
    task = FrequentPositiveGraphs(minsup, graph_database, subsets)  # Creating task

    gSpan(task).run()  # Running gSpan; the kept patterns are read back from task.patterns

    # Score every kept pattern with its confidence and total support.
    scored = []
    for pattern, gid_subsets in task.patterns:
        pos_support = len(gid_subsets[0])
        neg_support = len(gid_subsets[1])
        support = pos_support + neg_support
        scored.append((pattern, pos_support / support, support))

    for pattern, confidence, support in _select_top_k(scored, k):
        print('{} {} {}'.format(pattern, confidence, support))

       

In [19]:
# Input files for the positive and the negative graphs.
# These are absolute paths on the author's machine; adjust them before running elsewhere.
database_file_name_pos = "C:/Users/b_tib/coding/Msc/oLING2364/Assignements/data-mining-patterns/graph-mining-pattern/data/moleculesspos.txt"
database_file_name_neg = "C:/Users/b_tib/coding/Msc/oLING2364/Assignements/data-mining-patterns/graph-mining-pattern/data/moleculessneg.txt"

k = 5  # number of distinct (confidence, support) ranks that task1 reports
minsup = 5  # minimum support (positive + negative occurrences) for a pattern to be kept
nfolds = 4  # only referenced by the commented-out task2 call below


# Only task1 is run in this cell; the other calls are left disabled.
# NOTE(review): the disabled task3/task4 calls pass `folds`, but this cell defines `nfolds`.
task1(database_file_name_pos, database_file_name_neg, k, minsup)
#task2(database_file_name_pos, database_file_name_neg, k, minsup, nfolds)
# task3(database_file_name_pos, database_file_name_neg, k, minsup, folds)
# task4(database_file_name_pos, database_file_name_neg, k, minsup, folds)

[(frm=0, to=1, vevlb=('0', '100', '0')),(frm=1, to=2, vevlb=(-1, '100', '0')),(frm=2, to=3, vevlb=(-1, '100', '0')),(frm=3, to=4, vevlb=(-1, '100', '0')),(frm=4, to=5, vevlb=(-1, '100', '0')),(frm=5, to=0, vevlb=(-1, '100', -1)),(frm=5, to=6, vevlb=(-1, '100', '0')),(frm=6, to=7, vevlb=(-1, '100', '0')),(frm=7, to=8, vevlb=(-1, '100', '0')),(frm=8, to=9, vevlb=(-1, '100', '0'))] 1.0 5
[(frm=0, to=1, vevlb=('0', '100', '0')),(frm=1, to=2, vevlb=(-1, '100', '0')),(frm=2, to=3, vevlb=(-1, '100', '0')),(frm=3, to=4, vevlb=(-1, '100', '0')),(frm=4, to=5, vevlb=(-1, '100', '0')),(frm=5, to=6, vevlb=(-1, '100', '0')),(frm=6, to=7, vevlb=(-1, '100', '0')),(frm=7, to=8, vevlb=(-1, '100', '0')),(frm=8, to=9, vevlb=(-1, '100', '0'))] 1.0 5
[(frm=0, to=1, vevlb=('0', '100', '0')),(frm=1, to=2, vevlb=(-1, '100', '0')),(frm=2, to=3, vevlb=(-1, '100', '0')),(frm=3, to=4, vevlb=(-1, '100', '0')),(frm=4, to=5, vevlb=(-1, '100', '0')),(frm=5, to=6, vevlb=(-1, '100', '0')),(frm=6, to=7, vevlb=(-1, '100',

In [30]:
# Sample inputs for the list-difference experiments in the following cells:
# list1[i] holds the values to drop from list2[i].
# list2 mixes floats and ints; 25 == 25.0 in Python, so the int entries of list1
# still match the float entries of list2.
list1=[[], [], [32, 33, 34, 35, 38, 25, 26, 28], [20, 21, 23]]
list2=[[5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0], [0, 1, 2, 3, 4], [25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0], [20, 21, 22, 23, 24]]

def remove(list1, list2):
    """
    Drop, in place, the items of each list1[i] from the matching list2[i].

    One occurrence is removed per listed item (list.remove semantics), so a
    ValueError is raised when an item is not present in its target list.
    :param list1: list of lists with the items to drop.
    :param list2: list of lists to drop from; its inner lists are modified.
    :return: list2 itself, after the removals.
    """
    for position, unwanted in enumerate(list1):
        target = list2[position]
        for value in unwanted:
            target.remove(value)
    return list2

# remove() edits the inner lists of list2 in place and returns list2 itself,
# so after this cell `new` and `list2` refer to the same, already reduced, object.
new = remove(list1, list2)
new




[[5.0,
  6.0,
  7.0,
  8.0,
  9.0,
  10.0,
  11.0,
  12.0,
  13.0,
  14.0,
  15.0,
  16.0,
  17.0,
  18.0,
  19.0],
 [0, 1, 2, 3, 4],
 [27.0, 29.0, 30.0, 31.0, 36.0, 37.0, 39.0],
 [22, 24]]

In [92]:

def rmv_list1_from_list2(list1, list2):
    """
    Return, for each pair of inner lists, the items of list2[i] that are not in list1[i].

    New lists are built, so neither argument is modified. Every occurrence of a
    listed item is dropped and items missing from list2[i] are simply ignored.
    Pairing stops at the shorter of the two outer lists.
    :param list1: list of lists with the items to drop.
    :param list2: list of lists to filter.
    :return: a new list of lists, one filtered list per (list1[i], list2[i]) pair.
    """
    # Plain list membership is kept on purpose: it also works for unhashable items.
    return [
        [item for item in pool if item not in unwanted]
        for unwanted, pool in zip(list1, list2)
    ]


# Run the comprehension-based variant on the same inputs and display the result.
new = rmv_list1_from_list2(list1, list2)
new

[[5.0,
  6.0,
  7.0,
  8.0,
  9.0,
  10.0,
  11.0,
  12.0,
  13.0,
  14.0,
  15.0,
  16.0,
  17.0,
  18.0,
  19.0],
 [0, 1, 2, 3, 4],
 [27.0, 29.0, 30.0, 31.0, 36.0, 37.0, 39.0],
 [22, 24]]

In [93]:
# The same per-pair difference as a standalone expression: for each (a, b) pair taken
# from list1 and list2, keep the elements of b that do not occur in a.
# New lists are built; list1 and list2 are not modified by this cell.
temp = [[x for x in b if x not in a] for a, b in zip(list1, list2)]
temp

[[5.0,
  6.0,
  7.0,
  8.0,
  9.0,
  10.0,
  11.0,
  12.0,
  13.0,
  14.0,
  15.0,
  16.0,
  17.0,
  18.0,
  19.0],
 [0, 1, 2, 3, 4],
 [27.0, 29.0, 30.0, 31.0, 36.0, 37.0, 39.0],
 [22, 24]]