In [21]:
import pandas as pd
import numpy as np

# Task 1 - 2
To solve tasks 1 and 2 we define a function for lazy FCA classification. It relies on two helpers: one computes the intersection between a row of the dataset and the row we want to classify, and the other computes the extent of the intersection produced by the first helper. The function that builds the intervals is passed as an argument with a default value, so that it can be replaced in tasks 4 and 5. Setting verbose to 1 logs every step of the classifier. We also define a function for computing accuracy.

In [22]:
def compute_itersect(data_row, classification_row, interval_constructor=None):
    """
    Compute the intersection (pattern) of a data row and a row to classify.

    Only the first ``len(classification_row)`` positions of ``data_row`` are
    used, so ``data_row`` may carry one extra trailing value (e.g. the target).
    For every position:
      * if the value in ``classification_row`` is a string, the pattern keeps
        that value when both rows agree and '*' otherwise;
      * otherwise the pattern holds an interval: ``(min, max)`` of the two
        values by default, or whatever ``interval_constructor`` returns.

    Arguments:
        data_row (list) -- data row
        classification_row (list) -- row that we try to classify
        interval_constructor (method) -- callable taking (data value,
            classification value) and returning the interval for that
            position (default None, meaning the (min, max) interval)

    Return:
        intersection (np.array) -- 1-D object array with one pattern element
            per position of ``classification_row``; it can be used as a
            pattern for filtering data

    Raises:
        ValueError -- if ``data_row`` is neither the same length as
            ``classification_row`` nor exactly one element longer
    """
    n_features = len(classification_row)
    if len(data_row) - n_features not in (0, 1):
        raise ValueError(
            "data_row must have the same length as classification_row or be "
            "exactly one element longer (got %d and %d)"
            % (len(data_row), n_features)
        )

    # Fill a pre-allocated object array element by element. Building it with
    # np.array(list_of_tuples, dtype=object) would yield a 2-D (n, 2) array
    # whenever every element is an interval tuple, so the shape of the
    # pattern would depend on the kinds of features in the row.
    intersection = np.empty(n_features, dtype=object)

    # zip stops at the shorter sequence, i.e. at the end of classification_row.
    for i, (data_value, class_value) in enumerate(zip(data_row, classification_row)):
        if isinstance(class_value, str):
            intersection[i] = class_value if data_value == class_value else '*'
        elif interval_constructor is None:
            intersection[i] = (min(data_value, class_value), max(data_value, class_value))
        else:
            intersection[i] = interval_constructor(data_value, class_value)

    return intersection

In [23]:
def _fits_pattern(row, pattern):
    """Return True if every pattern position accepts the matching row value."""
    for i in range(len(pattern)):
        value = row[i]
        if isinstance(value, str):
            # Categorical position: '*' accepts anything, otherwise exact match.
            if pattern[i] != '*' and value != pattern[i]:
                return False
        elif value < pattern[i][0] or value > pattern[i][1]:
            # Numeric position: the value must lie inside the closed interval.
            return False
    return True


def compute_extent(data, pattern):
    """
    Compute the extent of a pattern: the rows of ``data`` that match it.

    A row matches when, for every position ``i`` of ``pattern``, its i-th
    value is accepted: string values must equal ``pattern[i]`` unless the
    pattern holds '*', and other values must lie inside the closed interval
    ``[pattern[i][0], pattern[i][1]]``.

    Arguments:
        data (pandas.DataFrame) -- data frame with train data
        pattern (np.array) -- pattern for filtering rows

    Return:
        extent (list) -- list with tuples, where the first value is the
            0-based position of the row in ``data`` and the second value is
            the value in the last column of that row (used as target value)

    Raises:
        ValueError -- if the number of columns of ``data`` is neither equal
            to the pattern length nor exactly one more
    """
    extent = []

    if len(data) == 0:
        return extent

    if data.shape[1] - pattern.shape[0] not in (0, 1):
        raise ValueError(
            "data must have as many columns as the pattern has elements, or "
            "exactly one more (got %d columns and %d pattern elements)"
            % (data.shape[1], pattern.shape[0])
        )

    # Plain tuples give true positional access. Indexing a label-indexed
    # Series with integers (row[i], row[-1]) relies on a deprecated pandas
    # fallback, and building one Series per row is needlessly slow.
    for row_index, row in enumerate(data.itertuples(index=False, name=None)):
        if _fits_pattern(row, pattern):
            extent.append((row_index, row[-1]))

    return extent

In [24]:
def classificator(train, test, verbose=0, interval_constructor=None):
    """
    Lazy FCA classifier.

    Each test row is intersected with the train rows one by one, in order.
    The first intersection whose extent in ``train`` is non-empty and holds a
    single target value decides the label; if no train row yields such an
    extent, the test row is labelled 'U' (Undefined).

    Arguments:
        train (pandas.DataFrame) -- data frame with train data
        test (pandas.DataFrame) -- data frame with data for classification
        verbose (int) -- controls logging: 0 prints nothing, any other value
            prints every computed pattern and its extent
        interval_constructor (method) -- method for computing interval (default None)
    Return:
        answers (list) -- classification results, one per test row, with 'U'
            for rows that could not be classified
    """
    answers = []
    for test_pos in range(test.shape[0]):
        # The values of the test row do not change while scanning train rows.
        test_values = test.iloc[test_pos].to_list()
        for train_pos in range(train.shape[0]):
            train_values = train.iloc[train_pos].to_list()
            pattern = compute_itersect(train_values, test_values, interval_constructor=interval_constructor)
            extent = compute_extent(train, pattern)
            if verbose:
                print(pattern)
                print(extent)
            if len(extent) == 0:
                continue
            first_label = extent[0][1]
            if all(first_label == item[1] for item in extent):
                answers.append(first_label)
                break
        else:
            # No train row produced a non-empty, single-class extent.
            answers.append('U')
    return answers

In [25]:
def count_accuracy(predictions, answers):
    """Count accuracy for predictions and correct answers.

    Both arguments may be any iterable of labels (list, tuple, numpy array,
    pandas Series); they are compared element by element in iteration order,
    so the index labels of a Series are ignored.

    Arguments:
        predictions (list) -- labels that were predicted
        answers (list) -- correct labels
    Return:
        accuracy (float) -- share of positions where prediction and answer are equal
    Raises:
        ValueError -- if the two inputs have different lengths
        ZeroDivisionError -- if both inputs are empty
    """
    # Materialise both inputs so that plain lists and pandas Series are
    # handled the same way, without depending on .iloc being available.
    predicted = list(predictions)
    expected = list(answers)

    if len(expected) != len(predicted):
        raise ValueError(
            "predictions and answers must have the same length (got %d and %d)"
            % (len(predicted), len(expected))
        )

    count = sum(1 for answer, guess in zip(expected, predicted) if answer == guess)

    return count / len(predicted)


# Task 1

In [26]:
# Load the semicolon-separated dataset and display its shape together with its contents.
data = pd.read_csv("data/Wheel_Chains_Dataset.csv", sep=';')
data.shape, data

((17, 8),
    System   mount  price  CON  SnOW  ice dur  dur Accegrade
 0      SK       F    206  1.9   1.4      1.8  2.7         F
 1     SRK  F or R    520  2.1   0.8      3.8  2.3         F
 2      SK       F    160  1.7   1.9      1.6  3.7         F
 3      SK       F    213  1.7   2.0      2.4  3.4         F
 4     SMS  F or R    598  1.6   2.4      2.7  2.8         F
 5      SK       F    109  2.0   1.9      2.4  3.7         F
 6     SRK  F or R    325  2.0   2.1      3.2  2.8         T
 7     SMS  F or R    498  1.5   3.3      3.5  2.0         T
 8     SRK  F or R    396  2.8   2.1      3.1  2.5         T
 9     SRK  F or R    325  2.2   2.2      4.6  3.2         T
 10    SRK  F or R    389  2.0   2.2      3.3  4.3         T
 11    SRK       F    298  2.5   2.3      3.3  2.8         T
 12     SK       F    149  1.9   2.5      4.0  3.8         T
 13    SMS  F or R    684  1.7   3.3      4.4  2.2         T
 14     SK       F     99  2.8   2.2      2.5  4.0         T
 15     SK    

In [27]:
# Hold out the last three rows as the test set and train on all the other rows.
train = data.iloc[:-3]
# The classifier receives the test rows without the target column;
# the true labels are kept separately for scoring.
test = data.iloc[-3:].drop(columns=["Accegrade"])
test_y = data.iloc[-3:]["Accegrade"]
# verbose=1 prints every pattern the classifier tries together with its extent.
prediction = classificator(train, test, verbose=1)
print(prediction)
count_accuracy(prediction, test_y)

['SK' 'F' (99, 206) (1.9, 2.8) (1.4, 2.2) (1.8, 2.5) (2.7, 4.0)]
[(0, 'F'), (5, 'F')]
['SK' 'F' (140, 206) (1.9, 2.6) (1.4, 2.3) (1.8, 3.3) (2.7, 3.4)]
[(0, 'F')]
['SK' 'F' (206, 215) (1.9, 2.3) (1.4, 3.8) (1.8, 4.8) (2.3, 2.7)]
[(0, 'F')]
['F', 'F', 'F']


0.0

### Comments for results
Here we have very poor accuracy. In my opinion, this is because we have only one object in most of our extents, so we have to change our algorithm a bit, such as taking the average prediction instead of just the first one, or changing the function for calculating the intervals.

# Task 2

In [28]:
# Load the second dataset file (per its name, presumably the same data in a different row order)
# and display its shape together with its contents. This rebinds `data` for all following cells.
data = pd.read_csv("data/Wheel_Chains_Dataset_difforder.csv", sep=';')
data.shape, data

((17, 8),
    System   mount  price  CON  SnOW  ice dur  dur Accegrade
 0      SK       F    149  1.9   2.5      4.0  3.8         T
 1     SRK  F or R    520  2.1   0.8      3.8  2.3         F
 2     SRK  F or R    389  2.0   2.2      3.3  4.3         T
 3      SK       F    213  1.7   2.0      2.4  3.4         F
 4     SMS  F or R    598  1.6   2.4      2.7  2.8         F
 5      SK       F    109  2.0   1.9      2.4  3.7         F
 6     SRK  F or R    325  2.0   2.1      3.2  2.8         T
 7     SMS  F or R    498  1.5   3.3      3.5  2.0         T
 8     SRK  F or R    396  2.8   2.1      3.1  2.5         T
 9      SK       F    160  1.7   1.9      1.6  3.7         F
 10    SRK  F or R    389  2.0   2.2      3.3  4.3         T
 11    SRK       F    298  2.5   2.3      3.3  2.8         T
 12     SK       F    206  1.9   1.4      1.8  2.7         F
 13    SMS  F or R    684  1.7   3.3      4.4  2.2         T
 14     SK       F     99  2.8   2.2      2.5  4.0         T
 15     SK    

In [29]:
# Same hold-out split as in task 1: the last three rows are the test set, the rest is train.
train = data.iloc[:-3]
# The classifier receives the test rows without the target column;
# the true labels are kept separately for scoring.
test = data.iloc[-3:].drop(columns=["Accegrade"])
test_y = data.iloc[-3:]["Accegrade"]
# verbose=1 prints every pattern the classifier tries together with its extent.
prediction = classificator(train, test, verbose=1)
print(prediction)
count_accuracy(prediction, test_y)

['SK' 'F' (99, 149) (1.9, 2.8) (2.2, 2.5) (2.5, 4.0) (3.8, 4.0)]
[(0, 'T')]
['SK' 'F' (140, 149) (1.9, 2.6) (2.3, 2.5) (3.3, 4.0) (3.4, 3.8)]
[(0, 'T')]
['SK' 'F' (149, 215) (1.9, 2.3) (2.5, 3.8) (4.0, 4.8) (2.3, 3.8)]
[(0, 'T')]
['T', 'T', 'T']


1.0

### Comments for results
Here the accuracy is too good to be true. We were just lucky, because changing the order of the dataset happened to give us the right results. To get a reliable accuracy estimate we should use cross validation.

# Task 3
Here we use cross validation with 6 folds for computing appropriate accuracy.

In [30]:
def cross_validation(data, folds=5, verbose=0, interval_constructor=None, target_column="Accegrade"):
    """
    Cross validation for computing the average accuracy of the Lazy FCA classifier.

    The rows of ``data`` are split, in their existing order, into consecutive
    chunks of ``ceil(n_rows / folds)`` rows; each chunk is used once as the
    test set while all the other rows form the train set. Because the chunk
    size is rounded up, the last chunk may be smaller than the others and the
    number of chunks may be lower than ``folds`` (e.g. 10 rows with folds=6
    give 5 chunks of 2 rows).

    Arguments:
        data (pandas.DataFrame) -- data frame with data for computing average accuracy
        folds (int) -- requested number of folds for cross validation
        verbose (int) -- controls logging: 0 prints nothing, any other value
            prints the computation steps
        interval_constructor (method) -- method for computing interval (default None)
        target_column (str) -- name of the column holding the class label
            (default "Accegrade")
    Return:
        average accuracy (float) -- unweighted average of the accuracies of all folds
    Raises:
        ValueError -- if ``folds`` is less than 1 or ``data`` has no rows
        KeyError -- if ``target_column`` is not a column of ``data``
    """
    if folds < 1:
        raise ValueError("folds must be at least 1 (got %r)" % (folds,))
    if data.shape[0] == 0:
        raise ValueError("data must contain at least one row")

    # The classifier reads the label from the LAST column of the train frame
    # and pairs features by position, so put the target column last. This is
    # a no-op reordering when it already is the last column.
    feature_columns = [column for column in data.columns if column != target_column]
    ordered = data[feature_columns + [target_column]]

    accuracies = []
    fold_size = int(np.ceil(ordered.shape[0] / folds))
    for i in range(0, ordered.shape[0], fold_size):
        fold = ordered.iloc[i : i + fold_size]
        train = pd.concat([ordered.iloc[: i], ordered.iloc[i + fold_size:]])
        test = fold.drop(columns=[target_column])
        test_y = fold[target_column]
        if verbose:
            print(f"Fold {i // fold_size + 1}")
        prediction = classificator(train, test, verbose=verbose, interval_constructor=interval_constructor)
        accuracy = count_accuracy(prediction, test_y)
        accuracies.append(accuracy)
        if verbose:
            print("Accuracy for fold %d is %.2f \n"%(i // fold_size + 1, accuracy))

    return np.average(accuracies)

In [31]:
# Run cross validation with folds=6 and step-by-step logging, then print the average accuracy as a percentage.
print("Cross validation accuracy %.2f"%(cross_validation(data, folds=6, verbose=1) * 100) + '%')

Fold 1
['SK' 'F' (149, 213) (1.7, 1.9) (2.0, 2.5) (2.4, 4.0) (3.4, 3.8)]
[(0, 'F')]
['*' '*' (213, 520) (1.7, 2.1) (0.8, 2.0) (2.4, 3.8) (2.3, 3.4)]
[(0, 'F')]
['*' '*' (213, 389) (1.7, 2.0) (2.0, 2.2) (2.4, 3.3) (3.4, 4.3)]
[(0, 'F'), (7, 'T')]
['*' 'F or R' (389, 598) (1.6, 2.0) (2.2, 2.4) (2.7, 3.3) (2.8, 4.3)]
[(1, 'F'), (7, 'T')]
['*' '*' (109, 389) (2.0, 2.0) (1.9, 2.2) (2.4, 3.3) (3.7, 4.3)]
[(2, 'F'), (7, 'T')]
['SRK' 'F or R' (325, 389) (2.0, 2.0) (2.1, 2.2) (3.2, 3.3) (2.8, 4.3)]
[(3, 'T'), (7, 'T')]
Accuracy for fold 1 is 0.67 

Fold 2
['SK' 'F' (149, 213) (1.7, 1.9) (2.0, 2.5) (2.4, 4.0) (3.4, 3.8)]
[(0, 'T')]
['*' '*' (149, 598) (1.6, 1.9) (2.4, 2.5) (2.7, 4.0) (2.8, 3.8)]
[(0, 'T')]
['SK' 'F' (109, 149) (1.9, 2.0) (1.9, 2.5) (2.4, 4.0) (3.7, 3.8)]
[(0, 'T')]
Accuracy for fold 2 is 0.00 

Fold 3
['*' '*' (149, 325) (1.9, 2.0) (2.1, 2.5) (3.2, 4.0) (2.8, 3.8)]
[(0, 'T')]
['*' '*' (149, 498) (1.5, 1.9) (2.5, 3.3) (3.5, 4.0) (2.0, 3.8)]
[(0, 'T')]
['*' '*' (149, 396) (1.9, 2.

# Task 4
Here we define a new function for computing intervals and try it on object number 15. It looks like a better method, but let's run cross validation to check this properly.

In [32]:
def min_inf_interval(a_1, a_2):
    """
    Build an interval that starts at the smaller of the two values and has no upper bound.

    Arguments:
        a_1 -- first value
        a_2 -- second value

    Return:
        interval (tuple) -- (min(a_1, a_2), inf)
    """
    lower_bound = min(a_1, a_2)
    return lower_bound, float('inf')

In [33]:
# Leave out the row at position 14 (the 15th object) as the only test object
# and train on all the remaining rows.
train = pd.concat([data.iloc[:14], data.iloc[15:]])
test = data.iloc[14:15].drop(columns=["Accegrade"])
# True label of the held-out object; it is not used further in this cell.
test_y = data.iloc[14:15]["Accegrade"]
# Classify with the min-inf interval function; verbose=1 prints every pattern and its extent.
prediction = classificator(train, test, verbose=1, interval_constructor=min_inf_interval)
print(prediction)

['SK' 'F' (99, inf) (1.9, inf) (2.2, inf) (2.5, inf) (3.8, inf)]
[(0, 'T')]
['T']


In [34]:
# Cross validation with folds=6 using the min-inf interval function, without logging; prints the average accuracy as a percentage.
print("Cross validation accuracy %.2f"%(cross_validation(data, folds=6, verbose=0, interval_constructor=min_inf_interval) * 100) + '%')

Cross validation accuracy 66.67%


# Task 5
Here is another interval function, again tried on object 15. It looks like a worse method, but let's run cross validation to check this properly.

In [35]:
def max_inf_interval(a_1, a_2):
    """
    Build an interval that starts at the larger of the two values and has no upper bound.

    Arguments:
        a_1 -- first value
        a_2 -- second value

    Return:
        interval (tuple) -- (max(a_1, a_2), inf)
    """
    lower_bound = max(a_1, a_2)
    return lower_bound, float('inf')

In [36]:
# Leave out the row at position 14 (the 15th object) as the only test object
# and train on all the remaining rows.
train = pd.concat([data.iloc[:14], data.iloc[15:]])
test = data.iloc[14:15].drop(columns=["Accegrade"])
# True label of the held-out object; it is not used further in this cell.
test_y = data.iloc[14:15]["Accegrade"]
# Classify with the max-inf interval function; verbose=1 prints every pattern and its extent.
prediction = classificator(train, test, verbose=1, interval_constructor=max_inf_interval)
print(prediction)

['SK' 'F' (149, inf) (2.8, inf) (2.5, inf) (4.0, inf) (4.0, inf)]
[]
['*' '*' (520, inf) (2.8, inf) (2.2, inf) (3.8, inf) (4.0, inf)]
[]
['*' '*' (389, inf) (2.8, inf) (2.2, inf) (3.3, inf) (4.3, inf)]
[]
['SK' 'F' (213, inf) (2.8, inf) (2.2, inf) (2.5, inf) (4.0, inf)]
[]
['*' '*' (598, inf) (2.8, inf) (2.4, inf) (2.7, inf) (4.0, inf)]
[]
['SK' 'F' (109, inf) (2.8, inf) (2.2, inf) (2.5, inf) (4.0, inf)]
[]
['*' '*' (325, inf) (2.8, inf) (2.2, inf) (3.2, inf) (4.0, inf)]
[]
['*' '*' (498, inf) (2.8, inf) (3.3, inf) (3.5, inf) (4.0, inf)]
[]
['*' '*' (396, inf) (2.8, inf) (2.2, inf) (3.1, inf) (4.0, inf)]
[]
['SK' 'F' (160, inf) (2.8, inf) (2.2, inf) (2.5, inf) (4.0, inf)]
[]
['*' '*' (389, inf) (2.8, inf) (2.2, inf) (3.3, inf) (4.3, inf)]
[]
['*' 'F' (298, inf) (2.8, inf) (2.3, inf) (3.3, inf) (4.0, inf)]
[]
['SK' 'F' (206, inf) (2.8, inf) (2.2, inf) (2.5, inf) (4.0, inf)]
[]
['*' '*' (684, inf) (2.8, inf) (3.3, inf) (4.4, inf) (4.0, inf)]
[]
['SK' 'F' (140, inf) (2.8, inf) (2.3, inf) 

In [37]:
# Cross validation with folds=6 using the max-inf interval function, without logging; prints the average accuracy as a percentage.
print("Cross validation accuracy %.2f"%(cross_validation(data, folds=6, verbose=0, interval_constructor=max_inf_interval) * 100) + '%')

Cross validation accuracy 22.22%


### Comments for results

We have tried two different methods for computing intervals. I think the first method is more appropriate for our dataset, because the second one is too strict and objects usually get the undefined tag. Cross validation shows the same result: the max-inf interval gets lower accuracy than the min-inf one. However, both methods give low accuracy, so we should try other approaches for tuning our model.