In [1]:
import polars as pl
from rich import print as rprint
from acavec.core.df import prepare_df

In [2]:
# Relative location of the raw CSV; resolved against the kernel's working directory.
path = "./data/raw.csv"
# Load the file through the project's `prepare_df` helper; every later cell reads from `works`.
works = prepare_df(path)

In [3]:
# List the columns available on `works` after loading.
works.columns

['id',
 'title',
 'publication_year',
 'language',
 'type',
 'countries_distinct_count',
 'institutions_distinct_count',
 'corresponding_author_ids',
 'cited_by_count',
 'locations_count',
 'referenced_works_count',
 'referenced_works',
 'related_works',
 'cited_by_api_url',
 'cited_by_percentile_year.min',
 'cited_by_percentile_year.max',
 'primary_topic.display_name',
 'abstract',
 'authorships.author_position',
 'authorships.institutions',
 'authorships.countries',
 'authorships.author.display_name',
 'topics.display_name',
 'keywords.display_name',
 'concepts.display_name',
 'mesh.descriptor_name',
 'mesh.qualifier_name',
 'grants.funder',
 'grants.funder_display_name',
 'grants.award_id',
 'counts_by_year.year',
 'counts_by_year.cited_by_count',
 'institutions']

In [4]:
def _as_list(value):
    """Return `value` unchanged, or an empty list when the cell is null (None)."""
    return [] if value is None else value


def _split_pipe(value):
    """Split a pipe-delimited string into its parts; None or '' yields an empty list."""
    return value.split("|") if value else []


def format_row(row):
    """Reshape one flat `works` row into a nested, display-friendly record.

    Args:
        row: Mapping of column name to value for a single work. It must contain
            every column read below; a value may be None.

    Returns:
        A dict that
        * copies the scalar and list columns through unchanged,
        * keeps only the first entry of ``referenced_works`` / ``related_works``
          (a short preview instead of the full list),
        * zips the parallel author name / country / position / institution
          columns into ``authorships``, one dict per author, truncated to the
          shortest of the four,
        * zips the pipe-delimited ``counts_by_year.*`` strings into
          ``citations_ts`` in reversed order. Values are kept as the strings
          found in the source columns.

    Null cells no longer raise: a None list column is treated as empty, and a
    None or empty ``counts_by_year.*`` string produces an empty
    ``citations_ts``.

    Raises:
        KeyError: if an expected column is missing from ``row``.
    """
    # Parallel per-author columns; None is treated as "no authors recorded".
    author_columns = (
        _as_list(row['authorships.author.display_name']),
        _as_list(row['authorships.countries']),
        _as_list(row['authorships.author_position']),
        _as_list(row['institutions']),
    )
    authorships = [
        {
            'name': name,
            'country': country,
            'position': position,
            'institution': institution,
        }
        for name, country, position, institution in zip(*author_columns)
    ]

    years = _split_pipe(row['counts_by_year.year'])
    counts = _split_pipe(row['counts_by_year.cited_by_count'])
    citations_ts = [
        {'year': year, 'cited_by_count': citation_count}
        for year, citation_count in zip(years, counts)
    ]
    # Flip the order in which the source columns list the years.
    citations_ts.reverse()

    return {
        'id': row['id'],
        'title': row['title'],
        'abstract': row['abstract'],
        'publication_year': row['publication_year'],
        'language': row['language'],
        'type': row['type'],
        'cited_by_count': row['cited_by_count'],
        'referenced_works': _as_list(row['referenced_works'])[:1],
        'related_works': _as_list(row['related_works'])[:1],
        'primary_topic.display_name': row['primary_topic.display_name'],
        'authorships.author.display_name': row['authorships.author.display_name'],
        'authorships': authorships,
        'topics.display_name': row['topics.display_name'],
        'keywords.display_name': row['keywords.display_name'],
        'concepts.display_name': row['concepts.display_name'],
        'mesh.descriptor_name': row['mesh.descriptor_name'],
        'mesh.qualifier_name': row['mesh.qualifier_name'],
        'grants.funder': row['grants.funder'],
        'grants.funder_display_name': row['grants.funder_display_name'],
        'grants.award_id': row['grants.award_id'],
        'citations_ts': citations_ts,
    }


# Smoke test: format the first row of `works` and pretty-print the result.
# `row(0, named=True)` fetches that one row as a dict directly, rather than
# converting ten rows to dicts and discarding nine of them.
row = works.row(0, named=True)
rprint(format_row(row))

In [5]:
# Same column listing as the earlier `works.columns` cell, repeated here ahead of the column selection.
works.columns

['id',
 'title',
 'publication_year',
 'language',
 'type',
 'countries_distinct_count',
 'institutions_distinct_count',
 'corresponding_author_ids',
 'cited_by_count',
 'locations_count',
 'referenced_works_count',
 'referenced_works',
 'related_works',
 'cited_by_api_url',
 'cited_by_percentile_year.min',
 'cited_by_percentile_year.max',
 'primary_topic.display_name',
 'abstract',
 'authorships.author_position',
 'authorships.institutions',
 'authorships.countries',
 'authorships.author.display_name',
 'topics.display_name',
 'keywords.display_name',
 'concepts.display_name',
 'mesh.descriptor_name',
 'mesh.qualifier_name',
 'grants.funder',
 'grants.funder_display_name',
 'grants.award_id',
 'counts_by_year.year',
 'counts_by_year.cited_by_count',
 'institutions']

In [6]:
# Narrow `works` down to the columns considered for indexing, then show the
# resulting schema so the dtypes can be checked.
works_to_index = works.select(
    # identity and text
    'id', 'title', 'abstract',
    # scalar metadata
    'publication_year', 'language', 'type',
    'countries_distinct_count', 'institutions_distinct_count',
    'cited_by_count', 'locations_count', 'referenced_works_count',
    # list-valued descriptors
    'authorships.author.display_name', 'authorships.countries',
    'topics.display_name', 'keywords.display_name', 'concepts.display_name',
)
works_to_index.schema

Schema([('id', String),
        ('title', String),
        ('abstract', String),
        ('publication_year', Int64),
        ('language', String),
        ('type', String),
        ('countries_distinct_count', Int64),
        ('institutions_distinct_count', Int64),
        ('cited_by_count', Int64),
        ('locations_count', Int64),
        ('referenced_works_count', Int64),
        ('authorships.author.display_name', List(String)),
        ('authorships.countries', List(String)),
        ('topics.display_name', List(String)),
        ('keywords.display_name', List(String)),
        ('concepts.display_name', List(String))])

In [8]:
import lancedb
import pandas as pd
import pyarrow as pa
from lancedb.embeddings import get_registry
from typing import Optional
from lancedb.pydantic import Vector, LanceModel
from typing import List

# Resolve the sentence-transformers embedding function from LanceDB's registry
# and instantiate it on CPU with the chosen model.
registry = get_registry()
func = registry.get("sentence-transformers").create(
    name="all-MiniLM-L6-v2",
    device="cpu",
)
# Model notes:
#   all-MiniLM-L6-v2            - fast (the one used here)
#   all-mpnet-base-v2           - noted as best
#   multi-qa-mpnet-base-cos-v1  - noted for QA


class Documents(LanceModel):
    """
    LanceDB table schema for the indexed works.

    The `title` field is the embedding source (`func.SourceField()`), and
    `vector` holds the embedding produced from it. The remaining fields mirror
    scalar columns of `works_to_index`; only `id` and `title` are
    non-optional.
    """
    id: str
    # Source text for the embedding: `func` embeds this column.
    title: str = func.SourceField()
    abstract: Optional[str]
    # The vector column is generated automatically by the embedding function;
    # its dimensionality is taken from `func.ndims()`.
    vector: Vector(func.ndims()) = func.VectorField()
    publication_year: Optional[int]
    language: Optional[str]
    type: Optional[str]
    countries_distinct_count: Optional[int]
    institutions_distinct_count: Optional[int]
    cited_by_count: Optional[int]
    locations_count: Optional[int]
    referenced_works_count: Optional[int]
    # List-valued columns are currently left out of the table schema.
    # authorships_author_display_name: Optional[List[str]]
    # authorships_countries: Optional[List[str]]
    # topics_display_name: Optional[List[str]]
    # keywords_display_name: Optional[List[str]]
    # concepts_display_name: Optional[List[str]]


# `title` is the non-optional embedding source in `Documents`, so rows without
# a title are dropped before indexing (polars filter).
works_to_index = works_to_index.filter(pl.col("title").is_not_null())

uri = "./data/acavec-lancedb"
db = lancedb.connect(uri)
# mode="overwrite" replaces any existing "works" table, so the cell can be re-run.
table = db.create_table("works", schema=Documents, mode="overwrite")

# `works_to_index` still carries list columns that `Documents` does not
# declare. Send only the columns the table schema knows about, so the insert
# does not depend on how LanceDB treats undeclared columns. `vector` is not in
# the frame and is therefore skipped here; it is produced by the embedding
# function on insert.
schema_columns = [
    name for name in table.schema.names if name in works_to_index.columns
]
table.add(works_to_index.select(schema_columns))