In [5]:
import csv

import dagster as dg
import pandas as pd
from dagster import MetadataValue, TableRecord
from dagster_pandas.data_frame import create_table_schema_metadata_from_dataframe

In [6]:
# Tiny two-row frame (one text column, one integer column) used to try out
# the schema helpers below.
test = pd.DataFrame(
    data=[
        {"namen": "boiuke", "eieiren": 26},
        {"namen": "jaap", "eieiren": 4},
    ]
)
test.describe()

Unnamed: 0,eieiren
count,2.0
mean,15.0
std,15.556349
min,4.0
25%,9.5
50%,15.0
75%,20.5
max,26.0


In [16]:
# Let dagster_pandas derive the table schema from the toy frame and show the
# resulting list of column definitions (name, dtype string, constraints).
create_table_schema_metadata_from_dataframe(test).schema.columns

[TableColumn(name='namen', type='object', description=None, constraints=TableColumnConstraints(nullable=True, unique=False, other=[]), tags={}),
 TableColumn(name='eieiren', type='int64', description=None, constraints=TableColumnConstraints(nullable=True, unique=False, other=[]), tags={})]

In [17]:
from src.imdb_dagster.defs.assets import constants

In [18]:
# Nullable extension dtypes for the two value columns, so rows whose value is
# the "\N" missing marker can be represented as <NA>.
dtypes = {"averageRating": pd.Float32Dtype(), "numVotes": pd.Int32Dtype()}

# Load the tab-separated ratings file, indexed by the title id (tconst).
# Quoting is switched off explicitly with csv.QUOTE_NONE so that any quote
# character in the file is treated as ordinary data. This is the documented
# way to disable quote handling; setting quotechar to the separator itself
# relies on parser internals.
title_ratings_df = pd.read_csv(
    constants.TITLE_RATINGS_FILE_PATH,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    low_memory=False,
    dtype_backend="pyarrow",
    index_col="tconst",
    dtype=dtypes,
    na_values="\\N",
)

In [36]:
# Display the loaded ratings frame (pandas truncates the rendering to the
# first and last rows).
title_ratings_df

Unnamed: 0_level_0,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,5.7,2187
tt0000002,5.5,308
tt0000003,6.4,2284
tt0000004,5.1,196
tt0000005,6.2,3018
...,...,...
tt9916846,5.3,7
tt9916848,5.2,7
tt9916850,6.0,7
tt9916852,5.7,7


In [None]:
# Dagster column definitions for the ratings table, keyed by column name.
# Every ``type`` is the pandas dtype string read from the loaded frame, so the
# schema stays in sync with how the file was actually parsed.
title_ratings = {
    "tconst": dg.TableColumn(
        name="tconst",
        # tconst is the frame's index (index_col="tconst"), so take the dtype
        # of the index; str(type(index)) would record the Python class repr of
        # the Index object instead of a dtype.
        type=str(title_ratings_df.index.dtype),
        description="alphanumeric unique identifier of the title",
    ),
    "averageRating": dg.TableColumn(
        name="averageRating",
        type=str(title_ratings_df["averageRating"].dtype),
        description="weighted average of all the individual user ratings",
    ),
    "numVotes": dg.TableColumn(
        name="numVotes",
        type=str(title_ratings_df["numVotes"].dtype),
        description="number of votes the title has received",
    ),
}

# Dagster column definitions for the title-basics table, keyed by column name.
# The "tconst" entry reuses the TableColumn object already built in
# title_ratings.
# NOTE(review): every dtype below is looked up on title_ratings_df, which is
# read from constants.TITLE_RATINGS_FILE_PATH and is displayed with only the
# averageRating and numVotes columns. These lookups therefore look like they
# would raise KeyError -- presumably a separate title-basics dataframe was
# intended here. TODO confirm and load that frame before running this cell.
title_basics = {
    "tconst": title_ratings["tconst"],
    "titleType": dg.TableColumn(
        name="titleType",
        type=str(title_ratings_df["titleType"].dtype),
        description="The type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)"
    ),
    "primaryTitle": dg.TableColumn(
        name="primaryTitle",
        type=str(title_ratings_df["primaryTitle"].dtype),
        description="The more popular title / the title used by filmmakers on promotional materials at release"
    ),
    "originalTitle": dg.TableColumn(
        name="originalTitle",
        type=str(title_ratings_df["originalTitle"].dtype),
        description="Original title, in the original language"
    ),
    "isAdult": dg.TableColumn(
        name="isAdult",
        type=str(title_ratings_df["isAdult"].dtype),
        description="Boolean flag: 0 = non-adult title, 1 = adult title"
    ),
    "startYear": dg.TableColumn(
        name="startYear",
        type=str(title_ratings_df["startYear"].dtype),
        description="Release year of the title; for TV series, the series start year"
    ),
    "endYear": dg.TableColumn(
        name="endYear",
        type=str(title_ratings_df["endYear"].dtype),
        description="TV series end year; '\\N' for all other title types"
    ),
    "runtimeMinutes": dg.TableColumn(
        name="runtimeMinutes",
        type=str(title_ratings_df["runtimeMinutes"].dtype),
        description="Primary runtime of the title, in minutes"
    ),
    "genres": dg.TableColumn(
        name="genres",
        type=str(title_ratings_df["genres"].dtype),
        description="Up to three genres associated with the title (string array)"
    ),
}



TableColumn(name='averageRating', type='Float32', description='If the movie has been watched.', constraints=TableColumnConstraints(nullable=True, unique=False, other=[]), tags={})

In [1]:
import pandas as pd

In [7]:
# Two-row stand-in for the ratings table: tconst is the index and
# averageRating is the only regular column.
df = pd.DataFrame(
    {"averageRating": [1.1, 2.2]},
    index=pd.Index(["tt1", "tt2"], name="tconst"),
)
df.keys()

Index(['averageRating'], dtype='object')

In [20]:


# Human-readable description for each known column, keyed by column name.
# Used as the ``description`` of the generated table-schema columns.
ALL_VALUES2 = dict(
    # title id and ratings columns
    tconst="alphanumeric unique identifier of the title",
    averageRating="weighted average of all the individual user ratings",
    numVotes="number of votes the title has received",
    # title basics columns
    titleType="type/format of the title (movie, short, tvseries, etc)",
    primaryTitle="popular title used on promotional materials",
    originalTitle="original title in the original language",
    isAdult="0 = non‑adult title, 1 = adult title",
    startYear="release year; for TV series, the start year",
    endYear="TV series end year; '\\N' for other title types",
    runtimeMinutes="primary runtime of the title in minutes",
    genres="up to three genres associated with the title",
    # viewing status, availability and personal scores
    watched="whether the movie has been watched",
    priority="whether the movie has priority",
    netflix="whether the movie is on Netflix (handmade value, no value means unknown)",
    prime="whether the movie is on Amazon Prime (handmade value, no value means unknown)",
    date="date the movie was watched",
    enjoyment_score="enjoyment score given after watching. 0=no enjoyment; 1=mweh; 2=fun; 3=good/cool; 4=great",
    quality_score="quality score given after watching. 0=bad, don't watch; 1=bad but interesting; 2=good engough; 3=good;4=great",
)

def _to_record_value(value):
    """Coerce one cell value into a type a ``dg.TableRecord`` can hold."""
    if isinstance(value, (str, int, float, bool, type(None))):
        return value
    # pd.NA / NaT are scalars but not plain Python types; report them as an
    # empty cell rather than as the strings "<NA>" / "NaT".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return str(value)


def get_table_schema2(df: pd.DataFrame, n_rows: int = 5):
    """Build Dagster table metadata (schema plus preview rows) for ``df``.

    The index is moved back into regular columns first, so an index such as
    ``tconst`` shows up in both the schema and the preview.

    Args:
        df: Frame to describe. It is not modified; ``reset_index`` returns a
            new frame.
        n_rows: Number of leading rows to include as preview records.
            Defaults to 5, the size ``DataFrame.head()`` uses by default.

    Returns:
        The value produced by ``MetadataValue.table``: one ``dg.TableRecord``
        per preview row and a ``dg.TableSchema`` with one column per frame
        column. Each column carries the pandas dtype string and the
        description from ``ALL_VALUES2``, or ``None`` when the column has no
        entry there.
    """
    df = df.reset_index()

    # The dtype is always known from the frame itself; only the description
    # can be missing, so a missing description must not hide the dtype.
    columns = [
        dg.TableColumn(
            name=key,
            type=str(dtype),
            description=ALL_VALUES2.get(key),
        )
        for key, dtype in df.dtypes.items()
    ]

    records = [
        dg.TableRecord(
            {key: _to_record_value(value) for key, value in row.items()}
        )
        for row in df.head(n_rows).to_dict(orient="records")
    ]

    return MetadataValue.table(
        records=records,
        schema=dg.TableSchema(columns=columns),
    )

In [21]:
# Try the helper on the two-row frame; the index (tconst) should appear as a
# regular column in both the records and the schema.
get_table_schema2(df)

TableMetadataValue(records=[TableRecord(data={'tconst': 'tt1', 'averageRating': 1.1}), TableRecord(data={'tconst': 'tt2', 'averageRating': 2.2})], schema=TableSchema(columns=[TableColumn(name='tconst', type='object', description='alphanumeric unique identifier of the title', constraints=TableColumnConstraints(nullable=True, unique=False, other=[]), tags={}), TableColumn(name='averageRating', type='float64', description='weighted average of all the individual user ratings', constraints=TableColumnConstraints(nullable=True, unique=False, other=[]), tags={})], constraints=TableConstraints(other=[])))

In [11]:
# Scratch check: a plain Python integer is an instance of ``int``.
x = 1
isinstance(x, int)

True