In [21]:
from kafka import KafkaProducer
import pandas as pd
import json
from time import time
from kafka import KafkaConsumer
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer
from pyflink.common.serialization import SimpleStringSchema
from pyflink.table import EnvironmentSettings, DataTypes, TableEnvironment, StreamTableEnvironment
from pyflink.common.watermark_strategy import WatermarkStrategy
from pyflink.common.time import Duration
from pyflink.common import Types

In [2]:
# Load the October 2019 green-taxi trips, keeping only the columns listed in
# `usecols`; every other column in the CSV is skipped at read time.
df = pd.read_csv(
    "green_tripdata_2019-10.csv",
    usecols=[
        'lpep_pickup_datetime', 'lpep_dropoff_datetime',
        'PULocationID', 'DOLocationID',
        'passenger_count', 'trip_distance', 'tip_amount',
    ],
)
df

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount
0,2019-10-01 00:26:02,2019-10-01 00:39:58,112,196,1.0,5.88,0.00
1,2019-10-01 00:18:11,2019-10-01 00:22:38,43,263,1.0,0.80,0.00
2,2019-10-01 00:09:31,2019-10-01 00:24:47,255,228,2.0,7.50,0.00
3,2019-10-01 00:37:40,2019-10-01 00:41:49,181,181,1.0,0.90,0.00
4,2019-10-01 00:08:13,2019-10-01 00:17:56,97,188,1.0,2.52,2.26
...,...,...,...,...,...,...,...
476381,2019-10-31 23:30:00,2019-11-01 00:00:00,65,102,,7.04,0.00
476382,2019-10-31 23:03:00,2019-10-31 23:24:00,129,136,,0.00,0.00
476383,2019-10-31 23:02:00,2019-10-31 23:23:00,61,222,,3.90,0.00
476384,2019-10-31 23:42:00,2019-10-31 23:56:00,76,39,,3.08,0.00


In [None]:
# Row-wise view of the trips: a list with one dict per DataFrame row.
message = df.to_dict(orient='records')
# Preview only the first few records; echoing the full list would write every
# row of the frame into the cell output and bloat the notebook.
message[:5]

In [3]:
# Serialize the frame as JSON Lines: one JSON object per row, one row per line.
json_data = df.to_json(lines=True, orient="records")

In [4]:
# Kafka topic that receives the green-taxi trip records.
topic_name = "green-trips"

In [5]:
# Producer for the local broker; values are passed in as str and encoded to
# UTF-8 bytes on send.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda text: text.encode("utf-8"),
)

In [6]:
# Publish each JSON line as its own Kafka message on `topic_name`.
for row in json_data.splitlines():
    producer.send(topic_name, row)

# send() only queues the record; block until everything queued is delivered.
producer.flush()

In [7]:
# Obtain the DataStream execution environment; later cells configure it
# (checkpointing, parallelism) and build the table environment on top of it.
env = StreamExecutionEnvironment.get_execution_environment()

In [11]:
# Checkpoint interval of 10_000 (the same value as 10 * 1000); PyFlink takes
# this interval in milliseconds.
env.enable_checkpointing(10_000)
# Run operators with a parallelism of 3. Kept as the last expression so the
# cell still displays the returned environment.
env.set_parallelism(3)

<pyflink.datastream.stream_execution_environment.StreamExecutionEnvironment at 0x23f430f9570>

In [13]:
# Build streaming-mode settings, then create the Table API environment on top
# of the existing DataStream environment `env`.
settings = (
    EnvironmentSettings.new_instance()
    .in_streaming_mode()
    .build()
)
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=settings,
)

In [None]:
# Plain kafka-python consumer on `topic_name`; each message value is decoded
# from UTF-8 and parsed as JSON.
# Disabled alternative kept for reference (PyFlink connector):
#   FlinkKafkaConsumer(topics=topic_name,
#                      deserialization_schema=SimpleStringSchema(),
#                      properties={'bootstrap.servers': 'localhost:9092'})
kafka_consumer = KafkaConsumer(
    topic_name,
    value_deserializer=lambda raw: json.loads(raw.decode("utf-8")),
    bootstrap_servers="localhost:9092",
)

In [8]:
# Two hand-written trips with the same seven fields as the real dataset, used
# to exercise the pipeline on a small test topic.
data = {
    'lpep_pickup_datetime': ['2025-03-15 12:30:00', '2025-03-15 13:00:00'],
    'lpep_dropoff_datetime': ['2025-03-15 12:50:00', '2025-03-15 13:20:00'],
    'PULocationID': [1, 2],
    'DOLocationID': [10, 20],
    'passenger_count': [1.0, 2.0],
    'trip_distance': [3.5, 5.2],
    'tip_amount': [2.5, 3.0],
}

# Column-oriented dict -> DataFrame -> JSON Lines (one object per row).
df_prueba = pd.DataFrame.from_dict(data)
json_data_prueba = df_prueba.to_json(lines=True, orient="records")

In [9]:
# Publish the sample trips, one JSON line per message, to the 'prueba' topic.
for row in json_data_prueba.splitlines():
    producer.send("prueba", row)

# Block until the queued sample messages have actually been delivered.
producer.flush()

In [10]:
# Consumer for the 'prueba' test topic on the local broker.
# ('redpanda-1:29092' was the alternative bootstrap address the author tried.)
consumer = KafkaConsumer(
    "prueba",
    bootstrap_servers="localhost:9092",
    # Message values are UTF-8 JSON; hand back parsed Python objects.
    value_deserializer=lambda raw: json.loads(raw.decode("utf-8")),
    # Consumer group this client joins.
    group_id="test-group",
    # Begin at the earliest available messages.
    auto_offset_reset="earliest",
    # Commit offsets automatically so messages are not consumed again.
    enable_auto_commit=True,
)

In [14]:
table_name = "prueba"

# Drop any previous registration first so this cell can be re-run in the same
# session; a bare CREATE TABLE fails once the table already exists.
t_env.execute_sql(f"DROP TABLE IF EXISTS {table_name}")

# Kafka-backed source table whose columns mirror the seven JSON fields that the
# notebook publishes to the 'prueba' topic.
# The broker address is 'localhost:9092', the same one the KafkaProducer and
# KafkaConsumer cells in this notebook use. The previous 'redpanda-1:29092' is
# the alternative address left commented out in the consumer cell and did not
# match the clients running in this same kernel.
source_ddl = f"""
    CREATE TABLE {table_name} (
        lpep_pickup_datetime TIMESTAMP(3),
        lpep_dropoff_datetime TIMESTAMP(3),
        PULocationID INT,
        DOLocationID INT,
        passenger_count DOUBLE,
        trip_distance DOUBLE,
        tip_amount DOUBLE
    ) WITH (
        'connector' = 'kafka',
        'properties.bootstrap.servers' = 'localhost:9092',
        'topic' = 'prueba',
        'scan.startup.mode' = 'earliest-offset',
        'properties.auto.offset.reset' = 'earliest',
        'format' = 'json'
    );
    """
t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x23f42f81e40>

In [None]:
# NOTE(review): the query below is a constant placeholder and does not read
# from the Kafka table, even though the job is named "Read from Kafka Table".
# The projection the author left disabled was:
#     SELECT lpep_pickup_datetime, lpep_dropoff_datetime, PULocationID,
#            DOLocationID, passenger_count, trip_distance, tip_amount
#     FROM {table_name}
query = "\n    SELECT 1000 \n"

# Run the SQL, convert the resulting table into an append-only DataStream of
# single-INT rows, attach a print sink, and submit the job.
result_table = t_env.sql_query(query)
result_stream = t_env.to_append_stream(
    result_table,
    Types.ROW([Types.INT()]),
)
result_stream.print()
env.execute("Read from Kafka Table")


In [None]:
# Consumer for the 'prueba' topic used by the cells below.
consumer = KafkaConsumer(
    'prueba',
    bootstrap_servers='localhost:9092',  # Adjust for your setup
    # Decode each value from UTF-8 JSON, as the earlier `consumer` definition
    # in this notebook does. Without this, `message.value` is raw bytes and the
    # DataFrame built from the collected values does not get trip columns.
    value_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
    auto_offset_reset='earliest',       # Start from the beginning
    enable_auto_commit=True,
    group_id='test-group',              # Set consumer group
    # End iteration after 10 s without a message instead of blocking forever.
    # Once this group has committed offsets, a re-run would otherwise hang
    # waiting for new data.
    consumer_timeout_ms=10000,
)

print("Waiting for messages...")
for message in consumer:
    print(f"Received: {message.value}")
    break  # Exit after receiving the first message
else:
    # Reached only when the iterator timed out before yielding any message.
    print("No messages received")

In [None]:
# Poll once, waiting up to 5000 ms, and print every record value returned.
# The result is iterated as a mapping of partition -> list of records.
message3 = consumer.poll(timeout_ms=5000)
if message3:
    for partition, records in message3.items():
        for record in records:
            print("Received message:", record.value)
else:
    print("No messages received")

In [None]:
# Inspect the raw result of the last poll (the mapping iterated via .items()).
message3

In [None]:
# Collect message values from `consumer` until `max_records` have arrived.
max_records = 2  # Adjust the limit for more/less messages
records = []
for mensaje in consumer:
    records.append(mensaje.value)
    if len(records) >= max_records:
        break

In [None]:
# Turn the collected message values into a DataFrame and display it.
df_result = pd.DataFrame(data=records)
df_result