# Spans
> Span class and how to interface it with pandas

In [None]:
#| default_exp span

In [None]:
#| hide
from nbdev.showdoc import show_doc

%load_ext autoreload
%autoreload 2

In [None]:
#| export
from abc import ABC, abstractmethod
import pytest
from pathlib import Path
import pandas as pd
from pathlib import Path
from typing import no_type_check, Set, Sequence, Any,Optional,List,Callable,Dict,Union
from pydantic import BaseModel

# Opt in to pandas' `future.infer_string` option; this is a module-level side effect of importing this module.
# NOTE(review): presumably this makes pandas infer its dedicated string dtype for str data - confirm against the pandas options docs.
pd.options.future.infer_string = True


I followed [this guide](https://itnext.io/guide-to-pandas-extension-types-and-how-to-create-your-own-3b213d689c86) on how to make extension types for pandas

In [None]:
#| export
import hashlib

def small_hash(txt,length=6):
    """Return a short hexadecimal fingerprint of a string.

    The text is encoded with ``str.encode()``, hashed with SHA-1, and the
    first ``length`` characters of the hex digest are returned.

    Args:
        txt (str): The text to hash.
        length (int, optional): Number of leading hex characters to keep.
            Defaults to 6.

    Returns:
        str: The first ``length`` characters of the SHA-1 hex digest of ``txt``.
    """
    encoded = txt.encode()
    hex_digest = hashlib.sha1(encoded).hexdigest()
    return hex_digest[:length]

In [None]:
#| export
# Default template used by `Span.__repr__`; it is filled through `str.format`
# with the keyword fields `doc`, `start`, `end` and `text`.
SPAN_REPR_FORMAT = '''[@{doc},{start},{end}) "{text}"'''
# Number of characters of the span text shown in the repr before it is cut off with '...'.
SPAN_TEXT_HEAD_NUM = 10

def set_span_repr_format(format=None,head:int=None):
    """
    Override the module-wide settings that control how spans are displayed.

    Each argument is optional; an argument left as ``None`` keeps the
    corresponding setting untouched.

    Parameters:
        format (str, optional): New value for ``SPAN_REPR_FORMAT``, the template used for the span repr. Defaults to None.
        head (int, optional): New value for ``SPAN_TEXT_HEAD_NUM``, the number of characters of span text to display. Defaults to None.
    """
    requested = {
        'SPAN_REPR_FORMAT': format,
        'SPAN_TEXT_HEAD_NUM': head,
    }
    # Only write the module globals that were explicitly supplied.
    globals().update(
        {setting: value for setting, value in requested.items() if value is not None}
    )

def get_span_repr_format() -> tuple[str, int]:
    """
    Returns the current span display settings.

    Returns:
        A 2-tuple ``(format, head)``: the span representation format
        (``SPAN_REPR_FORMAT``) and the number of characters of span text to
        display (``SPAN_TEXT_HEAD_NUM``).
    """
    return SPAN_REPR_FORMAT, SPAN_TEXT_HEAD_NUM


In [None]:
#| export
from enum import Enum
from typing import Any
from pydantic import ConfigDict
from collections import UserString
# TODO from here turn all code into having spanner act as a string with slicing, change the display option to use repr
# and add a note on it in the tutorial and the tests here

# we will have an ie function that casts a span to its string for viewing while developing - TODO

# Whether we get a document as a string or as a file, we assume that it remains immutable throughout the process - TODO explain
# A user can access the original document through the span interface (currently we don't do disk caching etc., so it will just be a string and not a document class) - TODO explain

class Span(UserString):
    """A ``[start, end)`` slice of a document that can be used like a string.

    A span keeps the full document text in ``doc``, the half-open character
    range ``start``/``end`` it covers, and a document ``name`` used for
    display. The string value exposed through ``UserString`` is
    ``doc[start:end]``.

    Args:
        doc: The document. A ``str`` is used as is; a ``pathlib.Path`` is read
            with ``read_text()`` and the span is named after the file; an
            existing ``Span`` is sub-sliced, with ``start``/``end`` taken
            relative to that span.
        start: Start offset. Defaults to 0.
        end: End offset (exclusive). Defaults to the end of the document/span.
        name: Document name. Only used when ``doc`` is a ``str``; defaults to
            ``small_hash(doc)``.
    """

    def __init__(self,doc,start=None,end=None,name=None):
        if isinstance(doc,Span):
            # Sub-span of an existing span: offsets are relative to `doc`,
            # while the document text and name are inherited from it.
            sub_span = doc.slice(start,end)
            self.doc = sub_span.doc
            self.start = sub_span.start
            self.end = sub_span.end
            self.name = sub_span.name
        else:
            if isinstance(doc,Path):
                self.doc = doc.read_text()
                self.name = doc.name
            else:
                if name is None:
                    name = small_hash(doc)
                self.name = name
                self.doc = doc

            self.start = 0 if start is None else start
            self.end = len(self.doc) if end is None else end

        # UserString keeps the covered text in `self.data`.
        super().__init__(self.as_str())

    def __getitem__(self, key):
        """Return a sub-span for a slice, or a single character for an integer.

        Integer indices are relative to the span and follow ``str`` semantics
        (negative values count from the end of the span).

        Raises:
            ValueError: For a slice with a step other than 1, or for slice
                bounds rejected by `slice`.
            IndexError: For an integer index outside the span. This is also
                what stops iteration at the end of the span instead of at the
                end of the underlying document.
        """
        if isinstance(key, slice):
            if key.step not in (None, 1):
                # A span is a contiguous range, so a stride cannot be represented.
                raise ValueError(f'Slice steps are not supported, got step: {key.step}')
            return self.slice(key.start,key.stop)
        length = len(self)
        index = key + length if key < 0 else key
        if not 0 <= index < length:
            raise IndexError(f'Span index out of range, got index: {key}, length: {length}')
        return self.doc[self.start+index]

    def slice(self, start=None,end=None):
        """Return the sub-span ``[start, end)``, with offsets relative to this span.

        Args:
            start: Start offset inside this span. Defaults to 0.
            end: End offset (exclusive) inside this span. Defaults to ``len(self)``.

        Raises:
            ValueError: If an offset is negative, ``start > end``, or ``end``
                exceeds the length of this span.
        """
        if start is None:
            start = 0
        if end is None:
            end = len(self)
        if start < 0 or end < 0:
            raise ValueError(f'Negative indices not supported, got start: {start}, end: {end}')
        if start > end:
            raise ValueError(f'Start index greater than end index, got start: {start}, end: {end}')
        if end > len(self):
            raise ValueError(f'End index greater than length of span, got end: {end}, length: {len(self)}')
        return Span(self.doc,self.start+start,self.start+end,name=self.name)

    # @property
    # def data(self):
    #     return str(self)

    def __repr__(self):
        """Format the span with the template and text length from `get_span_repr_format`."""
        f_string,head_num = get_span_repr_format()
        text = self.doc[self.start:self.end]
        if len(text) > head_num:
            # Only show the head of long spans.
            text = text[:head_num] + '...'
        return f_string.format(doc=self.name,start=self.start,end=self.end,text=text)

    def __len__(self):
        """Number of characters covered by the span."""
        return self.end-self.start

    def __str__(self):
        return self.as_str()

    def as_str(self):
        """Return the covered text, ``doc[start:end]``, as a plain ``str``."""
        return self.doc[self.start:self.end]

    # # used for sorting `Span`s in dataframes
    # NOTE(review): a Span compares equal to a plain str with the same text
    # (see __eq__) but is hashed by (doc, start, end), so a Span and its text
    # are distinct members of a set / distinct dict keys.
    def __hash__(self) -> int:
        return hash((self.doc,self.start, self.end))

    def __lt__(self, other) -> bool:
        """Order spans by ``(doc, start, end)``; order against a ``str`` by text."""
        if isinstance(other, Span):
            return (self.doc, self.start, self.end) < (other.doc, other.start, other.end)
        if isinstance(other, str):
            # Mirrors __eq__, which also compares against plain strings by text.
            return self.as_str() < other
        return NotImplemented

    def __eq__(self, value: object) -> bool:
        """Spans are equal when doc, start and end match; a ``str`` is compared by text."""
        if isinstance(value, Span):
            return self.start == value.start and self.end == value.end and self.doc == value.doc
        elif isinstance(value, str):
            return self.as_str() == value
        else:
            return False

    def as_tuple(self):
        """Return the span as a ``(doc, start, end)`` tuple."""
        return (self.doc,self.start, self.end)

    @classmethod
    def from_val(cls,val):
        """Coerce a value into a span.

        Args:
            val: Either an existing ``Span`` (returned unchanged) or a
                ``(doc, start, end)`` list/tuple.

        Raises:
            ValueError: If ``val`` is neither of the accepted shapes.
        """
        if isinstance(val,Span):
            return val
        if isinstance(val, (list, tuple)) and len(val) == 3:
            return cls(doc=val[0],start=val[1], end=val[2])
        raise ValueError('Invalid value to create Span from: {}'.format(val))
    


def ie(s:"Span")->tuple[int,int]:
    """Return the ``(start, end)`` offsets of the span ``s``."""
    return s.start,s.end

In [None]:
# TODO from here, change df assert equals to make span and strings equal even in sets of tuples
from spannerlib.utils import serialize_df_values
# Set comparison goes through hashing: `Span.__hash__` hashes (doc, start, end) rather than the text,
# so the tuple holding a Span is not matched with the tuple holding the equal string (recorded result: False).
{(Span("aa",0,2),)} == {("aa",)}

False

In [None]:
# Direct comparison uses `Span.__eq__`, which compares the span text against a plain str.
Span("aa",0,2) == "aa"

True

In [None]:
# Build a frame whose 'span' column holds Span objects; the last two rows carry equal spans over `doc`.
doc = 'world'
df = pd.DataFrame([
    [Span('hello',0,5),1],
    [Span(doc,0,5),2],
    [Span(doc,0,5),3],
], columns=['span','num'])
# Display the frame (the recorded output shows each Span rendered as a sequence of characters).
df

Unnamed: 0,span,num
0,"(h, e, l, l, o)",1
1,"(w, o, r, l, d)",2
2,"(w, o, r, l, d)",3


In [None]:
# Same frame built from plain strings instead of Spans, for comparison with the Span-based frame.
doc = 'world'
df = pd.DataFrame([
    ['hello',1],
    ['world',2],
    ['world',3],
], columns=['span','num'])
df

Unnamed: 0,span,num
0,hello,1
1,world,2
2,world,3


In [None]:
#TODO from here, ok so We need union types and to make the span class print prettily
# Group the current `df` by its 'span' column and sum the remaining column per group.
df.groupby('span').sum()

Unnamed: 0_level_0,num
span,Unnamed: 1_level_1
hello,1
world,5


In [None]:
# Sample texts used by the Span examples in the following cells.
string = "hello stranger"
short_string = "hi"

In [None]:
# Span covering the whole of `string`, with an explicit document name that appears in its repr.
s = Span(string,0,len(string),name ='doc')
display(s)

[@doc,0,14) "hello stra..."

In [None]:
# A Span stored directly in a DataFrame cell (the recorded output shows it as a sequence of characters).
pd.DataFrame({'span':[s]})

Unnamed: 0,span
0,"(h, e, l, l, o, , s, t, r, a, n, g, e, r)"


In [None]:
# Map every cell through repr() so the frame holds the span repr strings instead of the Span objects.
df = pd.DataFrame({'span':[s]}).map(repr)
df

Unnamed: 0,span
0,"[@doc,0,14) ""hello stra..."""


In [None]:
# No name, start or end given: the span covers the whole string and is named by `small_hash` of the text.
s2 = Span(short_string)
display(s2)

[@c22b5f,0,2) "hi"

In [None]:
# A span equals a plain string with the same text.
assert s == 'hello stranger'
# Slicing a span yields a sub-span that also compares by text.
assert s[0:5] == 'hello'
# Two spans with different offsets are not equal.
assert not s == s[0:5]
# `as_str()` gives the plain text for use in string formatting.
assert f"{s[0:5].as_str()} darkness" == 'hello darkness'
# Slices of a sub-span are relative to that sub-span.
assert s[0:5][1:4] == 'ell'

## Pandas ExtensionDtype

In [None]:
# #| export
# import numpy as np
# import pandas as pd
# from pandas.core.dtypes.dtypes import PandasExtensionDtype
# from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin, register_extension_dtype

# @register_extension_dtype
# class SpanDtype(PandasExtensionDtype):
#     """
#     Class to describe the custom Vector data type
#     """
#     type = Span       # Scalar type for data
#     name = 'span'     # String identifying the data type name 

#     @classmethod
#     def construct_array_type(cls):
#         """
#         Return array type associated with this dtype
#         """
#         return SpanArray

#     def __str__(self):
#         return self.name

In [None]:
# Reference call on a plain list: the recorded output shows the (codes, uniques) pair that pd.factorize returns.
pd.factorize([1,2,3,3])

  pd.factorize([1,2,3,3])


(array([0, 1, 2, 2]), array([1, 2, 3]))

In [None]:
# #| export
# class SpanArray(ExtensionScalarOpsMixin, ExtensionArray):
#     """
#     Custom Extension Array type for an array of Vectors
#     Needs to define:
#     - Associated Dtype it is used with
#     - How to construct array from sequence of scalars
#     - How data is stored and accessed
#     - Any custom array methods
#     """

#     def __init__(self,doc_values, start_values, end_values, copy=False):
#         """
#         Initialise array of vectors from component X and Y values 
#         (Allows efficient initialisation from existing lists/arrays)
#         :param start_values: Sequence/array of vector x-component values
#         :param end_values: Sequence/array of vector y-component values
#         """
#         self.doc_values = np.array(doc_values, dtype=object, copy=copy)
#         self.start_values = np.array(start_values, dtype=np.int64, copy=copy)
#         self.end_values = np.array(end_values, dtype=np.int64, copy=copy)


#     @classmethod
#     def _from_sequence(cls, scalars, *, dtype=None, copy=False):
#         """
#         Construct a new ExtensionArray from a sequence of scalars. 
#         Each element will be an instance of the scalar type for this array,
#         or be converted into this type in this method.
#         """
#         # Construct new array from sequence of values (Unzip vectors into x and y components)
#         doc_values, start_values, end_values = zip(*[Span.from_val(val).as_tuple() for val in scalars])
#         return SpanArray(doc_values,start_values, end_values, copy=copy)

#     # @classmethod
#     # def _from_factorized(cls, values, original):
#     #     """
#     #     Reconstruct an ExtensionArray from a factorized array.
#     #     """
#     #     raise NotImplementedError

#     # def _values_for_factorize(self):
#     #     """
#     #     Return values (array of scalars) for factorizing
#     #     """
#     #     return np.array(
#     #         [hash(self[i]) for i in range(len(self))], dtype=object
#     #     ) ,np.nan


#     @classmethod
#     def from_vectors(cls, vectors):
#         """
#         Construct array from sequence of values (vectors)
#         Can be provided as Vector instances or list/tuple like (x, y) pairs
#         """
#         return cls._from_sequence(vectors)

#     @classmethod
#     def _concat_same_type(cls, to_concat):
#         """
#         Concatenate multiple arrays of this dtype
#         """
#         return SpanArray(
#             np.concatenate([arr.doc_values for arr in to_concat]),
#             np.concatenate([arr.start_values for arr in to_concat]),
#             np.concatenate([arr.end_values for arr in to_concat]),
#         )

#     @property
#     def dtype(self):
#         """
#         Return Dtype instance (not class) associated with this Array
#         """
#         return SpanDtype()

#     @property
#     def nbytes(self):
#         """
#         The number of bytes needed to store this object in memory.
#         """
#         return self.doc_values.nbytes + self.start_values.nbytes + self.end_values.nbytes

#     def __getitem__(self, item):
#         """
#         Retrieve single item or slice
#         """
#         if isinstance(item, int):
#             # Get single vector
#             return Span(self.doc_values[item],self.start_values[item], self.end_values[item])

#         else:
#             # Get subset from slice  or boolean array
#             return SpanArray(self.doc_values[item],self.start_values[item], self.end_values[item])

#     def __eq__(self, other):
#         """
#         Perform element-wise equality with a given vector value
#         """
#         if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
#             return NotImplemented

#         return (self.doc_values == other[0]) & (self.start_values == other[1]) & (self.end_values == other[2])

#     def __len__(self):
#         return self.start_values.size

#     def isna(self):
#         """
#         Returns a 1-D array indicating if each value is missing
#         """
#         return np.isnan(self.start_values)

#     def take(self, indices, *, allow_fill=False, fill_value=None):
#         """
#         Take element from array using positional indexing
#         """
#         from pandas.core.algorithms import take
#         if allow_fill and fill_value is None:
#             fill_value = self.dtype.na_value

#         doc_result = take(self.doc_values, indices, fill_value=fill_value, allow_fill=allow_fill)
#         start_result = take(self.start_values, indices, fill_value=fill_value, allow_fill=allow_fill)
#         end_result = take(self.end_values, indices, fill_value=fill_value, allow_fill=allow_fill)
#         return SpanArray(doc_result, start_result, end_result)

#     def copy(self):
#         """
#         Return copy of array
#         """
#         return SpanArray(np.copy(self.doc_values), np.copy(self.start_values), np.copy(self.end_values))

# # Register operator overloads using logic defined in Vector class
# SpanArray._add_comparison_ops()

In [None]:
# doc = 'world'
# s = SpanArray.from_vectors([Span('hello',0,5),Span(doc,0,5),Span(doc,0,5)])
# s

In [None]:
# Sort the current `df` by all of its columns, in column order.
df.sort_values(by=list(df.columns))

Unnamed: 0,span
0,"[@doc,0,14) ""hello stra..."""


In [None]:
# s._values_for_factorize()

In [None]:
# Rebuild the Span-based frame; the last two rows carry equal spans over `doc`.
doc = 'world'
df = pd.DataFrame([
    [Span('hello',0,5),1],
    [Span(doc,0,5),2],
    [Span(doc,0,5),3],
], columns=['span','num'])


# Inspect the column dtypes (the recorded output lists the 'span' column as `object`).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   span    3 non-null      object
 1   num     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


In [None]:
# Group by the Span column and sum 'num'; in the recorded output the two equal 'world' spans form one group (5).
df.groupby('span').agg({'num':'sum'})

Unnamed: 0_level_0,num
span,Unnamed: 1_level_1
"(h, e, l, l, o)",1
"(w, o, r, l, d)",5


In [None]:
# Spans with the same doc, start and end produce the same hash.
hash(Span(doc,0,5)) == hash(Span(doc,0,5))

True

In [None]:
#|hide
# Run nbdev's export step. NOTE(review): presumably this writes the `#| export` cells to the library module - confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()
     