# Spans
> Span class and how to interface it with pandas

In [None]:
#| default_exp span

In [None]:
#| hide
from nbdev.showdoc import show_doc

%load_ext autoreload
%autoreload 2

In [None]:
#| export
from abc import ABC, abstractmethod
from pathlib import Path
from typing import no_type_check, Set, Sequence, Any, Optional, List, Callable, Dict, Tuple, Union

import pandas as pd
from pydantic import BaseModel
import pytest


I followed [this guide](https://itnext.io/guide-to-pandas-extension-types-and-how-to-create-your-own-3b213d689c86) on how to create extension types for pandas.

In [None]:
#| export
import hashlib

def small_hash(txt,length=6):
    """Return a short, deterministic hexadecimal fingerprint of a string.

    The text is encoded with ``str.encode()`` (UTF-8 by default), hashed with
    SHA-1, and the hexadecimal digest is truncated to its first ``length``
    characters.

    Args:
        txt (str): text to hash.
        length (int, optional): number of leading hex characters to keep.
            Defaults to 6.

    Returns:
        str: the first ``length`` characters of the SHA-1 hex digest.
    """
    encoded = txt.encode()
    full_digest = hashlib.sha1(encoded).hexdigest()
    return full_digest[:length]

In [None]:
#| export
# Template used by `Span.__repr__`; filled with the keyword fields
# `doc`, `start`, `end` and `text` via `str.format`.
SPAN_REPR_FORMAT = '''[@{doc},{start},{end}) "{text}"'''
# Maximum number of characters of the span text shown before it is cut with '...'.
SPAN_TEXT_HEAD_NUM = 10

def set_span_repr_format(format: Optional[str] = None, head: Optional[int] = None) -> None:
    """
    Sets the representation format for spans and the number of characters to display in the span text.

    Each setting is only changed when its argument is not None, so either one
    can be updated independently of the other.

    Parameters:
        format (str, optional): The representation format for spans, a `str.format`
            template that may use the fields `doc`, `start`, `end` and `text`.
            Defaults to None (leave unchanged).
        head (int, optional): The number of characters to display in the span text.
            Defaults to None (leave unchanged).
    """
    global SPAN_REPR_FORMAT, SPAN_TEXT_HEAD_NUM
    if format is not None:
        SPAN_REPR_FORMAT = format
    if head is not None:
        SPAN_TEXT_HEAD_NUM = head

def get_span_repr_format() -> Tuple[str, int]:
    """
    Returns the current span representation settings.

    Returns:
        (the span representation format, the number of characters to display in the span text)
    """
    return SPAN_REPR_FORMAT, SPAN_TEXT_HEAD_NUM


In [None]:
#| export
from enum import Enum
from typing import Any
from pydantic import ConfigDict

# We will have an `ie` function that casts a span to its string, for viewing while developing - TODO

# Whether we get a document as a string or as a file, we assume that it remains immutable throughout the process - TODO explain
# A user can access the original document through the span interface (currently we don't do disk caching etc., so it will just be a string and not a document class) - TODO explain

class Span():
    """A half-open interval ``[start, end)`` over a document string.

    Attributes:
        doc: the full underlying document (not just the covered text).
        start: offset in `doc` of the first covered character.
        end: offset in `doc` one past the last covered character.
        name: short label for the document, used only by `__repr__`.
    """

    def __init__(self, doc, start: Optional[int] = None, end: Optional[int] = None, name=None):
        """Create a span over a document, or a sub-span of an existing span.

        Args:
            doc: either the document string, or a parent `Span`. When a `Span`
                is given, `start`/`end` are offsets relative to that parent.
            start: start offset. Defaults to the beginning of `doc`.
            end: end offset (exclusive). Defaults to the end of `doc`.
            name: label for the document. Defaults to `small_hash(doc)`.
                Ignored when `doc` is a `Span`: the parent's name is inherited.

        Raises:
            ValueError: if `doc` is a `Span` and the offsets are rejected by `slice`.
        """
        if isinstance(doc, Span):
            # Missing bounds mean "the whole parent span"; they must be filled in
            # before calling slice(), which compares its arguments as integers.
            if start is None:
                start = 0
            if end is None:
                end = len(doc)
            sub_span = doc.slice(start, end)
            self.doc = sub_span.doc
            self.start = sub_span.start
            self.end = sub_span.end
            self.name = sub_span.name

        else:
            self.doc = doc
            if start is None:
                start = 0
            if end is None:
                end = len(doc)
            self.start = start
            self.end = end

            if name is None:
                name = small_hash(doc)
            self.name = name

    def slice(self, start, end):
        """Return the sub-span ``[start, end)``, with offsets relative to this span.

        Raises:
            ValueError: if an index is negative, if `start > end`, or if `end`
                is greater than the length of this span.
        """
        if start < 0 or end < 0:
            raise ValueError(f'Negative indices not supported, got start: {start}, end: {end}')
        if start > end:
            raise ValueError(f'Start index greater than end index, got start: {start}, end: {end}')
        if end > len(self):
            raise ValueError(f'End index greater than length of span, got end: {end}, length: {len(self)}')
        # Translate the relative offsets into offsets in the underlying document.
        return Span(self.doc, self.start + start, self.start + end, name=self.name)

    def __lt__(self, other) -> bool:
        """Strict ordering by ``(doc, start, end)``."""
        if not isinstance(other, Span):
            return NotImplemented
        # Must be a strict comparison so that `a < a` is False.
        return (self.doc, self.start, self.end) < (other.doc, other.start, other.end)

    def __repr__(self):
        """Render the span with the configured format, truncating long text with '...'."""
        f_string, head_num = get_span_repr_format()
        text = self.doc[self.start:self.end]
        if len(text) > head_num:
            text = text[:head_num] + '...'
        return f_string.format(doc=self.name, start=self.start, end=self.end, text=text)

    def __len__(self):
        """Number of characters covered by the span."""
        return self.end - self.start

    def __str__(self):
        """Same as `repr`; use `as_str` to get the covered text."""
        return repr(self)
        # return self.doc[self.start:self.end]

    def as_str(self):
        """Return the text covered by the span as a plain string."""
        return self.doc[self.start:self.end]

    def __eq__(self, value: object) -> bool:
        """Compare with another span (same doc and bounds) or with a string (same text).

        Any other type compares unequal.
        """
        if isinstance(value, Span):
            return self.start == value.start and self.end == value.end and self.doc == value.doc
        elif isinstance(value, str):
            return self.as_str() == value
        else:
            return False

    @classmethod
    def from_val(cls, val):
        """Coerce `val` to a `Span`.

        Only existing `Span` instances can be converted (they are returned as is).

        Raises:
            ValueError: for any other value, including ``(start, end)`` pairs,
                which cannot form a span because they carry no document.
        """
        if isinstance(val, Span):
            return val
        if isinstance(val, (list, tuple)) and len(val) == 2:
            # A bare (start, end) pair has no document to refer to.
            raise ValueError('Cannot create Span from (start, end) pair without a document: {}'.format(val))
        raise ValueError('Invalid value to create Span from: {}'.format(val))

    # # used for sorting `Span`s in dataframes
    # NOTE: a Span can compare equal to a str (see __eq__) while hashing differently from it.
    def __hash__(self) -> int:
        """Hash on ``(doc, start, end)``, matching span-to-span equality."""
        return hash((self.doc, self.start, self.end))

def ie(s: Span) -> Tuple[int, int]:
    """Return the ``(start, end)`` offsets of the span `s`."""
    return s.start, s.end

In [None]:
# Sample documents for the demo cells: the first is longer than the default
# repr preview of 10 characters, the second is shorter.
string = "hello stranger"
short_string = "hi"

In [None]:
# Span over the whole string; with no explicit name it is labelled with a short hash of the text.
s = Span(string)
display(s)

[@5ca31c,0,14) "hello stra..."

In [None]:
# A Span can be stored as an ordinary value in a DataFrame column.
df = pd.DataFrame({'span':[s]})
df

Unnamed: 0,span
0,"[@5ca31c,0,14) ""hello stra..."""


In [None]:
# Text that fits within the preview length is shown in full, without a trailing '...'.
s2 = Span(short_string)
display(s2)

[@c22b5f,0,2) "hi"

In [None]:
# A Span compares equal to a plain string holding the same text.
assert s == 'hello stranger'
assert s.slice(0,5) == 'hello'
# Spans over the same document with different bounds are not equal.
assert not s == s.slice(0,5)
# as_str() returns the covered text as a plain str.
assert f"{s.slice(0,5).as_str()} darkness" == 'hello darkness'
# slice() offsets are relative to the span being sliced, so slices nest.
assert s.slice(0,5).slice(1,4) == 'ell'

In [None]:
#|hide
# Run nbdev's export step from inside the notebook.
import nbdev; nbdev.nbdev_export()
     

In [None]:
# #| export
# import numpy as np
# from pandas.core.dtypes.dtypes import PandasExtensionDtype
# from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin, register_extension_dtype

# @register_extension_dtype
# class SpanDtype(PandasExtensionDtype):
#     """
#     Class to describe the custom Vector data type
#     """
#     type = Span       # Scalar type for data
#     name = 'span'     # String identifying the data type name 

#     @classmethod
#     def construct_array_type(cls):
#         """
#         Return array type associated with this dtype
#         """
#         return SpanArray

#     def __str__(self):
#         return self.name
    
#     def __hash__(self):
#         return hash(self.name)

In [None]:
# #| export
# from parse import parse,compile
# SpanParser = compile('[{start:d},{end:d})')


In [None]:
# SpanParser.parse('[3,4)')

In [None]:
# #| export
# class SpanArray(ExtensionScalarOpsMixin, ExtensionArray):
#     """
#     Custom Extension Array type for an array of Vectors
#     Needs to define:
#     - Associated Dtype it is used with
#     - How to construct array from sequence of scalars
#     - How data is stored and accessed
#     - Any custom array methods
#     """

#     def __init__(self, x_values, y_values, copy=False):
#         """
#         Initialise array of vectors from component X and Y values 
#         (Allows efficient initialisation from existing lists/arrays)
#         :param x_values: Sequence/array of vector x-component values
#         :param y_values: Sequence/array of vector y-component values
#         """
#         self.x_values = np.array(x_values, dtype=np.int64, copy=copy)
#         self.y_values = np.array(y_values, dtype=np.int64, copy=copy)

#     # TODO this doesnt work for spans with documents
#     # @classmethod
#     # def _from_sequence_of_strings(
#     #     cls, strings, *, dtype=SpanDtype, copy: bool = False
#     # ):
#     #     vals=[]
#     #     for string in strings:
#     #         parsed_span = SpanParser.parse(string)
#     #         if parsed_span is None:
#     #             raise ValueError(f'could not parse string "{string}" as a span')
#     #         vals.append(Span(parsed_span['start'],parsed_span['end']))
            
#     #     return cls._from_sequence(vals)

#     @classmethod
#     def _from_sequence(cls, scalars, *, dtype=None, copy=False):
#         """
#         Construct a new ExtensionArray from a sequence of scalars. 
#         Each element will be an instance of the scalar type for this array,
#         or be converted into this type in this method.
#         """
#         # Construct new array from sequence of values (Unzip vectors into x and y components)
#         x_values, y_values = zip(*[Span.from_val(val).as_tuple() for val in scalars])
#         return SpanArray(x_values, y_values, copy=copy)

#     @classmethod
#     def from_vectors(cls, vectors):
#         """
#         Construct array from sequence of values (vectors)
#         Can be provided as Vector instances or list/tuple like (x, y) pairs
#         """
#         return cls._from_sequence(vectors)

#     @classmethod
#     def _concat_same_type(cls, to_concat):
#         """
#         Concatenate multiple arrays of this dtype
#         """
#         return SpanArray(
#             np.concatenate([arr.x_values for arr in to_concat]),
#             np.concatenate([arr.y_values for arr in to_concat]),
#         )

#     @property
#     def dtype(self):
#         """
#         Return Dtype instance (not class) associated with this Array
#         """
#         return SpanDtype()

#     @property
#     def nbytes(self):
#         """
#         The number of bytes needed to store this object in memory.
#         """
#         return self.x_values.nbytes + self.y_values.nbytes

#     def __getitem__(self, item):
#         """
#         Retrieve single item or slice
#         """
#         if isinstance(item, int):
#             # Get single vector
#             return Span(self.x_values[item], self.y_values[item])

#         else:
#             # Get subset from slice  or boolean array
#             return SpanArray(self.x_values[item], self.y_values[item])

#     def __eq__(self, other):
#         """
#         Perform element-wise equality with a given vector value
#         """
#         if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
#             return NotImplemented

#         return (self.x_values == other[0]) & (self.y_values == other[1])

#     def __len__(self):
#         return self.x_values.size

#     def isna(self):
#         """
#         Returns a 1-D array indicating if each value is missing
#         """
#         return np.isnan(self.x_values)

#     def take(self, indices, *, allow_fill=False, fill_value=None):
#         """
#         Take element from array using positional indexing
#         """
#         from pandas.core.algorithms import take
#         if allow_fill and fill_value is None:
#             fill_value = self.dtype.na_value

#         x_result = take(self.x_values, indices, fill_value=fill_value, allow_fill=allow_fill)
#         y_result = take(self.y_values, indices, fill_value=fill_value, allow_fill=allow_fill)
#         return SpanArray(x_result, y_result)

#     def copy(self):
#         """
#         Return copy of array
#         """
#         return SpanArray(np.copy(self.x_values), np.copy(self.y_values))

# # Register operator overloads using logic defined in Vector class
# SpanArray._add_comparison_ops()

In [None]:
# s = pd.Series([[1,2],[3,4],[5,6]],dtype='span')
# df = s.to_frame()

# df.info()


In [None]:
# df = pd.DataFrame([
#     [Span(1,3),Span(2,4)],
#     [Span(1,3),Span(2,4)]
#     ], columns=['x','y'])
# df

In [None]:
# df.infer_objects()

In [None]:
# df.to_csv('file.txt')

In [None]:
# assert SpanArray._from_sequence_of_strings(['[0,1)','[0,2)'])[0] == Span(0,1)

In [None]:
# df2=pd.read_csv('file.txt',index_col=0,
#     dtype={'x':'span'}
#     )
# df2.info()