**The Relational Model in Python**

Copyright Jens Dittrich & Marcel Maltry, [Big Data Analytics Group](https://bigdata.uni-saarland.de/), [CC-BY-SA](https://creativecommons.org/licenses/by-sa/4.0/legalcode)

In [1]:
from ra.csv_utils import load_csv
from ra.relation import Relation

In [2]:
foo = Relation('foo', [('id', int), ('name', str)])
foo.add_tuple( (2,'Hello') )
foo.add_tuple( (7,'World') )
foo.add_tuple( (1,'!') )

foo.print_table()

---
[1mfoo[0m 
--------------
[1mid     name   [0m 
--------------
2      Hello  
7      World  
1      !      



**Again:** Neither the order of rows nor the order of columns carry any meaning in a relation!

### IMDB

In [3]:
from os import listdir

# Data source: https://relational.fit.cvut.cz/dataset/IMDb
# Information courtesy of IMDb (http://www.imdb.com). Used with permission.
#
# Notice: The data can only be used for personal and non-commercial use and must not
# be altered/republished/resold/repurposed to create any kind of online/offline
# database of movie information (except for individual personal use).

path = 'data/IMDb_sample'  
# create a list of all files in that directory that end with "*.csv":
files = [file for file in listdir(path) if file.endswith('.csv')]
files

['movies_directors.csv',
 'actors.csv',
 'directors.csv',
 'movies_genres.csv',
 'directors_genres.csv',
 'movies.csv',
 'roles.csv']

In [4]:
# load all relations:
relations = list()
for file in files:
    print("Reading {} ...".format(file))  # print currently parsed file
    filepath = path + '/' + file  # preappend file name by path
    name = file[:-4]  # removes .csv file ending and takes filename as relation name
    relation = load_csv(filepath, name, delimiter='\t')
    relations.append(relation)

Reading movies_directors.csv ...
Reading actors.csv ...
Reading directors.csv ...
Reading movies_genres.csv ...
Reading directors_genres.csv ...
Reading movies.csv ...
Reading roles.csv ...


In [5]:
# display all relations:
maxRowsLimit = 10
for rel in relations:
    rel.print_set(maxRowsLimit)

[movies_directors] : {[ director_id:int, movie_id:int ]}
{
	(11652, 322652),
	(43095, 30431),
	(78273, 276217),
	(11652, 256530),
	(43095, 92616),
	(43095, 1711),
	(43095, 176891),
	(43095, 110246),
	(43095, 177019),
	(78273, 223710)
}
[actors] : {[ id:int, first_name:str, last_name:str, gender:str ]}
{
	(342443, Sean, Nepita, M),
	(809699, Louise J., Taylor, F),
	(65536, Steve, Buscemi, M),
	(590629, Jamie Lee, Curtis, F),
	(199295, Harry, Harvey Jr., M),
	(110901, Jonathan, Daw, M),
	(484826, Vern, Urich, M),
	(635230, Valeria, Golino, F),
	(840317, Yoshiko, Yamaguchi, F),
	(362418, Erik, Parillo, M)
}
[directors] : {[ id:int, first_name:str, last_name:str ]}
{
	(11652, James (I), Cameron),
	(78273, Quentin, Tarantino),
	(43095, Stanley, Kubrick)
}
[movies_genres] : {[ movie_id:int, genre:str ]}
{
	(176711, Thriller),
	(106666, Drama),
	(5306, Thriller),
	(10920, Thriller),
	(310455, War),
	(159665, War),
	(164572, Thriller),
	(176891, Film-Noir),
	(328277, Sci-Fi),
	(176712, Drama)


In [6]:
# define a readable identifier:
directors = relations[0]

**Two different displays of the same relation (model <--> view):**

In [7]:
# 1. as a set:
maxRowsLimit = 10
directors.print_set(maxRowsLimit)

[movies_directors] : {[ director_id:int, movie_id:int ]}
{
	(11652, 322652),
	(43095, 30431),
	(78273, 276217),
	(11652, 256530),
	(43095, 92616),
	(43095, 1711),
	(43095, 176891),
	(43095, 110246),
	(43095, 177019),
	(78273, 223710)
}


In [8]:
# 2. tabular:
directors.print_table(10)

----------------
[1mmovies_directors[0m 
--------------------------
[1mdirector_id  movie_id     [0m 
--------------------------
11652        322652       
43095        30431        
78273        276217       
11652        256530       
43095        92616        
43095        1711         
43095        176891       
43095        110246       
43095        177019       
78273        223710       



# Exercise

Extend class Relation to support keys and check for duplicates of keys when adding tuples:

In [9]:
# upload the contents of this cell to our CMS as a text file

# a relation subclass respecting key constraints:
class KeyRelation(Relation):
    # keys: names of the key attributes as a list
    def __init__(self, name, schema, keys):
        super().__init__(name, schema)
        
        # assert that the list of keys is subset-equal self-attributes:
        assert set(keys) <= set(self.attributes)
        # make sure that at least one key attribute is defined:
        assert len(keys) >= 1
        
        # add your code here!
        # ...
        # initialize data structures that are required
        # to check the key constraint for new tuples
        pass
        
    def add_tuple(self, tup):
        # add your code here!
        # ...
        # check if there is a tuple with the same key in the relation
        # only insert it using super().add_tuple(tup) if there is not.
        # raise a ValueError if the key is already present.
        # Make sure to perform your check in O(1) time!
        pass
        
    def print_schema(self):
        super().print_schema()
        # add your code here!
        # ...
        # should also print the key attributes
        pass

### Unit Test for Relation

Note that test cases are by no means exhaustive!

In [10]:
import unittest

class RelationTest(unittest.TestCase):

    def setUp(self):
        self.foo = Relation('foo', [('id', int), ('name', str)])
        self.foo.add_tuple( (2,'Hello') )
        self.foo.add_tuple( (7,'World') )
        self.foo.add_tuple( (1,'!') )

        self.bar = Relation('bar', [('a', int), ('b', int), ('c', int), ('d', int)])
        self.bar.add_tuple( (1, 2, 3, 4) )
        self.bar.add_tuple( (2, 2, 3, 4) )
        self.bar.add_tuple( (3, 2, 3, 4) )
        self.bar.add_tuple( (4, 2, 3, 4) )
        self.bar.add_tuple( (5, 2, 3, 4) )
        
    def test_size(self):
        # foo should contain 3 tuples
        self.assertEqual(len(self.foo), 3)
        # check valid insert
        self.assertTrue(self.foo.add_tuple( (3, '?') ))
        self.assertEqual(len(self.foo), 4)
        # check duplicate insert
        self.assertFalse(self.foo.add_tuple( (1,'!') ))
        self.assertEqual(len(self.foo), 4)
        
        # bar should contain 5 tuples
        self.assertEqual(len(self.bar), 5)
        # check valid insert
        self.assertTrue(self.bar.add_tuple( (6, 2, 3, 4) ))
        self.assertEqual(len(self.bar), 6)
        # check duplicate insert
        self.assertFalse(self.bar.add_tuple( (5, 2, 3, 4) ))
        self.assertEqual(len(self.bar), 6)
    
    def test_schema(self):
        # incorrectly typed tuple
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( ('wrong order', 42) )
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( (0.1, 'wrong type') )
        # inccorectly sized tuples
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( (6, 'wrong size', 12) )
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( (42,) )
        
        # incorrectly typed tuple
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( (0.1, 0.2, 0.3, 0.4) )
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( ('1', '3', '2', '4') )
        # incorrectly sized
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( (1, 2, 4, 5, 6) )
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( (1, 2, 4) )

### Unit Test for KeyRelation

Note that test cases are by no means exhaustive!

In [11]:
class KeyRelationTest(unittest.TestCase):
    
    def setUp(self):
        keys = ['id']
        self.foo = KeyRelation('foo', [('id', int), ('name', str)], keys)
        self.foo.add_tuple( (1, 'first') )
        self.foo.add_tuple( (2, 'second') )
        self.foo.add_tuple( (3, 'thrid') )
        
        keys = ['a', 'c']
        self.bar = KeyRelation('bar', [('a', int), ('b', int), ('c', int), ('d', int)], keys)
        self.bar.add_tuple( (1, 2, 1, 3) )
        self.bar.add_tuple( (1, 3, 2, 1) )
        self.bar.add_tuple( (2, 3, 2, 1) )
        self.bar.add_tuple( (2, 3, 1, 2) )
        
    def test_size(self):
        # foo should contain 3 tuples
        self.assertEqual(len(self.foo), 3)
        # check valid insert
        self.foo.add_tuple( (4, 'fourth') )
        self.assertEqual(len(self.foo), 4)
        # check duplicate key insert
        with self.assertRaises(ValueError):
            self.foo.add_tuple( (1, 'one') ) # should raise ValueError  
        self.assertEqual(len(self.foo), 4)  # should not add tuple
        # check duplicate tuple insert
        with self.assertRaises(ValueError):
            self.foo.add_tuple( (1,'first') )  #should raise ValueError
        self.assertEqual(len(self.foo), 4)  # should not add tuple
        
        # bar should contain 4 tuples
        self.assertEqual(len(self.bar), 4)
        # check valid insert
        self.bar.add_tuple( (3, 1, 2, 3) )
        self.assertEqual(len(self.bar), 5)
        # check duplicate key insert
        with self.assertRaises(ValueError):
            self.bar.add_tuple( (1, 3, 1, 2) )  # should raise ValueError
        self.assertEqual(len(self.bar), 5)  # should not add tuple
        # check duplicate tuple insert
        with self.assertRaises(ValueError):
            self.bar.add_tuple( (2, 3, 1, 2) )  # should raise ValueError
        self.assertEqual(len(self.bar), 5)  # should not add tuple
    
    def test_schema(self):
        # incorrectly typed tuple
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( ('seventh', 7) )
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( (0.1, 'zero point first') )
        # inccorectly sized tuples
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( (42, 'oops', 12) )
        with self.assertRaises(AssertionError):
            self.foo.add_tuple( (43,) )
        
        # incorrectly typed tuple
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( (0.1, 0.2, 0.3, 0.4) )
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( ('1', '3', '2', '4') )
        # incorrectly sized
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( (1, 2, 4, 5, 6) )
        with self.assertRaises(AssertionError):
            self.bar.add_tuple( (1, 2, 4) )

In [12]:
# Run the unit test without shutting down the jupyter kernel
unittest.main(argv=['ignored', '-v'], verbosity=2, exit=False)

test_schema (__main__.KeyRelationTest) ... FAIL
test_size (__main__.KeyRelationTest) ... FAIL
test_schema (__main__.RelationTest) ... ok
test_size (__main__.RelationTest) ... ok

FAIL: test_schema (__main__.KeyRelationTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-11-fc8a6f390d61>", line 49, in test_schema
    self.foo.add_tuple( ('seventh', 7) )
AssertionError: AssertionError not raised

FAIL: test_size (__main__.KeyRelationTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-11-fc8a6f390d61>", line 19, in test_size
    self.assertEqual(len(self.foo), 3)
AssertionError: 0 != 3

----------------------------------------------------------------------
Ran 4 tests in 0.005s

FAILED (failures=2)


<unittest.main.TestProgram at 0x10aaabc88>