In [0]:
!pip install soda-core

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, DateType, LongType

from soda.scan import Scan
from soda.sampler.sampler import Sampler
from soda.sampler.sample_context import SampleContext

In [0]:
class MyCustomSampler(Sampler):
    """Soda sampler that persists check samples as CSV files through Spark.

    Each call to :meth:`store_sample` converts the sample rows into a Spark
    DataFrame and writes it to ``table_path`` in ``overwrite`` mode, so the
    location only ever holds the most recently stored sample.

    NOTE(review): relies on a global ``spark`` session being in scope, as is
    usual in a notebook environment -- confirm for other runtimes.
    """

    # Column type name reported in the sample schema -> Spark type used when
    # building the DataFrame schema. Built once instead of on every lookup.
    _SPARK_TYPES = {
        'LongType': LongType(),
        'DateType': DateType(),
        'StringType': StringType(),
        'TimestampType': TimestampType(),
        'DoubleType': DoubleType(),
        'IntegerType': IntegerType(),
    }

    def __init__(self, table_path: str):
        """Create a sampler that writes its samples to ``table_path``."""
        super().__init__()
        self.table_path = table_path

    def store_sample(self, sample_context: SampleContext):
        """Write the rows of the given sample to ``self.table_path`` as CSV.

        Args:
            sample_context: Context handed over by the scan; the rows and
                schema are read from its ``sample`` attribute.
        """
        # soda-core exposes get_rows()/get_schema() on `sample_context.sample`,
        # not on the context itself. Fall back to the context object so that
        # any context type offering these accessors directly still works.
        sample = getattr(sample_context, 'sample', sample_context)

        # Retrieve the rows and the column schema of the sample
        rows = sample.get_rows()
        schema = sample.get_schema()

        # Define the schema for the DataFrame; every column is nullable
        struct_fields = [
            StructField(col.name, self.map_type(col.type), True)
            for col in schema.columns
        ]
        df_schema = StructType(struct_fields)

        # Write the rows as CSV files, replacing whatever is at the path
        df = spark.createDataFrame(rows, schema=df_schema)
        df.write.format('csv').mode('overwrite').save(self.table_path)

        # len(rows) avoids launching another Spark job just to report a count
        print(f"Wrote {len(rows)} rows to csv path")

    def map_type(self, type: str):
        """Return the Spark data type for a schema column type name.

        Args:
            type: Column type name, e.g. ``'LongType'``. (The parameter name
                shadows the builtin but is kept for interface compatibility.)

        Returns:
            The matching Spark type instance, or ``StringType()`` when the
            name is not one of the known types.
        """
        return self._SPARK_TYPES.get(type, StringType())
