In [1]:
import pandas as pd
import sqlalchemy
import ast
import json 
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Text, inspect

In [2]:
# Target database: the SQLite file in which the 'thg' table is created and
# into which the migrated rows are written further down.
engine = sqlalchemy.create_engine('sqlite:///verivox_thg.db')

In [3]:
# Schema container that the 'thg' table definition is registered with.
metadata = MetaData()

In [4]:
# Layout of the target 'thg' table: an auto-incrementing integer key followed
# by the data columns, in the order they are declared here.
thg_columns = [
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('Datum', Text),
    Column('Uhrzeit', Text),
    Column('Provider_id', Integer),
    Column('Provider', Text),
    Column('Preis', Integer),
    Column('Bezahlmodel', Text),
    Column('Eigenschaft', Text),
    Column('raw_data', Text),
]
thg = Table('thg', metadata, *thg_columns)

In [5]:
# Create the 'thg' table in the target database only if it is not there yet.
inspector = inspect(engine)
table_missing = not inspector.has_table(thg.name)
if table_missing:
    metadata.create_all(engine, tables=[thg])


In [6]:
# Source database: the legacy SQLite file whose 'thg' table is read below.
engine_old = sqlalchemy.create_engine('sqlite:///old_verivox_thg.db')

In [7]:
# Load the complete legacy 'thg' table into a DataFrame.
with engine_old.connect() as connection:
    df_raw = pd.read_sql_table(table_name='thg', con=connection)

In [8]:
# Add an 'Eigenschaft' column filled with None; it is populated from
# 'raw_data' in a later step.
df_raw = df_raw.assign(Eigenschaft=None)

In [9]:
# Arrange the columns in the same order as the target table definition.
column_order = [
    "id",
    "Datum",
    "Uhrzeit",
    "Provider_id",
    "Provider",
    "Preis",
    "Bezahlmodel",
    "Eigenschaft",
    "raw_data",
]
df_raw = df_raw[column_order]

In [10]:
# Remove the legacy 'id' values; the target table declares its own
# autoincrement 'id' column.
df_raw = df_raw.drop(columns=['id'])

In [11]:
# Some dates were recorded several times at different times of day, so rows
# are treated as duplicates when everything except 'Uhrzeit' matches.
dedup_columns = ['Datum', 'Provider_id', 'Provider', 'Preis', 'Bezahlmodel', 'Eigenschaft', 'raw_data']
df_raw = df_raw.drop_duplicates(subset=dedup_columns)

In [12]:
def _literal_to_json(literal_text):
    """Parse a Python-literal string and return the value serialised as JSON."""
    return json.dumps(ast.literal_eval(literal_text))


# Re-encode every 'raw_data' entry from Python-literal text to JSON text.
df_raw['raw_data'] = df_raw['raw_data'].apply(_literal_to_json)

In [13]:
# Parse 'Datum' into pandas datetimes (the column is sorted on and formatted
# via `.dt` in the following cells).
# NOTE(review): no explicit `format`/`dayfirst` is passed — confirm the stored
# date strings cannot be misread by pandas' format inference.
df_raw['Datum'] = pd.to_datetime(df_raw['Datum'])


In [14]:
# Order the rows chronologically and renumber the index from 0.
# A stable sort is requested explicitly: the default algorithm ('quicksort')
# does not guarantee that rows sharing the same 'Datum' keep their original
# relative order, which would make the insertion order (and therefore the new
# autoincrement ids) non-reproducible within a day.
df = df_raw.sort_values(by=['Datum'], kind='stable').reset_index(drop=True)

In [15]:
# Turn the datetime values back into 'YYYY-MM-DD' text, matching the Text
# type of the target 'Datum' column.
iso_dates = df['Datum'].dt.strftime("%Y-%m-%d")
df['Datum'] = iso_dates

In [16]:
def _payout_properties(raw_json):
    """Extract the payout ("Auszahlung ...") key metrics of one offer.

    Parameters
    ----------
    raw_json : str
        JSON text of a single offer, as stored in the 'raw_data' column.

    Returns
    -------
    str
        The 'value' strings of all entries under 'keyMetrics' that contain
        "Auszahlung ", joined with ', ' (empty string when none match).

    Raises
    ------
    KeyError
        If 'keyMetrics' is missing or one of its entries has no 'value'.
    """
    metrics = json.loads(raw_json)['keyMetrics']
    return ', '.join(m['value'] for m in metrics if "Auszahlung " in m['value'])


# Only 'raw_data' is needed, so the helper is applied to that column instead
# of row-wise over the whole frame. This avoids a lambda parameter shadowing
# the global `df`, skips building a Series per row, and also works on an
# empty frame (row-wise DataFrame.apply returns a DataFrame in that case,
# which cannot be assigned to a single column).
df['Eigenschaft'] = df['raw_data'].apply(_payout_properties)

In [17]:
# Append the migrated rows to the target 'thg' table.
# engine.begin() wraps the insert in a transaction that is committed when the
# block exits normally and rolled back on error, so persisting the rows does
# not rely on driver/pandas autocommit behaviour (a plain engine.connect()
# does not commit on its own with SQLAlchemy 2.0-style connections).
# NOTE: if_exists='append' means re-running this cell inserts the rows again.
with engine.begin() as connection:
    df.to_sql('thg', connection, if_exists='append', index=False)

In [18]:
# Validation step: open the target database file again and read the whole
# 'thg' table back into a DataFrame.
engine = sqlalchemy.create_engine('sqlite:///verivox_thg.db')

with engine.connect() as connection:
    df_val = pd.read_sql_table(table_name='thg', con=connection)

In [19]:
# Show the table read back from the new database for a visual check.
df_val

Unnamed: 0,id,Datum,Uhrzeit,Provider_id,Provider,Preis,Bezahlmodel,Eigenschaft,raw_data
0,1,2022-07-20,20:44,67490.0,emobility.energy,325,fixflex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""67491"", ""type"": ""offerBox"", ""product"":..."
1,2,2022-07-20,20:44,67557.0,Quotando,0,flex,Auszahlung in 10 bis 12 Wochen,"{""id"": ""67558"", ""type"": ""offerBox"", ""product"":..."
2,3,2022-07-20,20:44,67560.0,Quotlix,350,fix,Auszahlung in 8 bis 12 Wochen,"{""id"": ""67559"", ""type"": ""offerBox"", ""product"":..."
3,4,2022-07-20,20:44,67562.0,smartificate,300,fix,Auszahlung in 8 bis 12 Wochen,"{""id"": ""67561"", ""type"": ""offerBox"", ""product"":..."
4,5,2022-07-20,20:44,1533.0,Stadtwerke Merzig,300,fix,Auszahlung in 12 Wochen,"{""id"": ""67563"", ""type"": ""offerBox"", ""product"":..."
...,...,...,...,...,...,...,...,...,...
28600,28601,2024-06-29,04:57,67516.0,Elektrovorteil,100,fix,Auszahlung in 8 bis 12 Wochen,"{""id"": ""65b775583f604"", ""type"": ""offerBox"", ""p..."
28601,28602,2024-06-29,04:57,67516.0,Elektrovorteil,360,flex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""65b777a5228f5"", ""type"": ""offerBox"", ""p..."
28602,28603,2024-06-29,04:57,67516.0,Elektrovorteil,80,direct,Auszahlung in 5 Tagen,"{""id"": ""65c08f69677cd"", ""type"": ""offerBox"", ""p..."
28603,28604,2024-06-29,04:57,67579.0,wirkaufendeinethg.de,100,fixflex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""63c6a8605c4c6"", ""type"": ""offerBox"", ""p..."
