# Data cleaning and analysis of the German GHG quota

This last notebook deals with cleaning and analysing the scraped data.

The analysis is divided into providers that did not change their prices and those that did.

If you have a suggestion for improvement, please let me know.

In [47]:
import pandas as pd
import numpy as np
import plotly.express as px
import sqlalchemy
import ast
import json
import warnings
warnings.filterwarnings('ignore')
import datetime
import os

In [48]:
# Full month name of today's date (strftime-style %B); used in the report file names.
actual_month = f"{datetime.datetime.today():%B}"

In [49]:
# Make sure the output folder for the monthly reports exists.
# exist_ok=True avoids the check-then-create race of os.path.exists + os.mkdir
# and raises a clear error if the name is taken by a regular file.
os.makedirs("Monatsberichte", exist_ok=True)

### Read Database to create DataFrame

In [50]:
# Open the local SQLite database file that holds the scraped offers.
engine = sqlalchemy.create_engine('sqlite:///verivox_thg.db')

# Load every non-'fix' offer dated 2024-01-01 or later.
# Datum appears as ISO 'YYYY-MM-DD' text in the preview below, so it is compared
# against a full ISO date string; the former bare integer 2024 only worked
# through SQLite's implicit type-affinity coercion.
with engine.connect() as connection:
    result = connection.execute(sqlalchemy.text("""
                                                SELECT * 
                                                FROM thg
                                                WHERE Datum >= '2024-01-01' AND Bezahlmodel != 'fix';
                                                """))
    # Take the column names from the result itself so that an empty result
    # still yields a frame with the expected columns.
    df_raw = pd.DataFrame(result.fetchall(), columns=list(result.keys()))

In [51]:
# Show the queried rows (a bare last expression is rendered as the cell output).
df_raw

Unnamed: 0,id,Datum,Uhrzeit,Provider_id,Provider,Preis,Bezahlmodel,Eigenschaft,raw_data
0,23568,2024-01-01,00:00,67516,Elektrovorteil,102,fixflex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""62baffaaaa755"", ""type"": ""offerBox"", ""p..."
1,23569,2024-01-01,00:00,67530,Geld für E-Auto,0,flex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""62bbd6e752f30"", ""type"": ""offerBox"", ""p..."
2,23574,2024-01-01,00:00,67490,emobility.energy,0,flex,Auszahlung in 8 bis 16 Wochen,"{""id"": ""63e4d7df63fac"", ""type"": ""offerBox"", ""p..."
3,23578,2024-01-01,00:00,68334,THG-Experten,0,flex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""63ef60c17c5e5"", ""type"": ""offerBox"", ""p..."
4,23582,2024-01-01,00:00,69114,thinkmobility.green,0,flex,Auszahlung in 8 bis 16 Wochen,"{""id"": ""655f0cbd33ca2"", ""type"": ""offerBox"", ""p..."
...,...,...,...,...,...,...,...,...,...
1835,28628,2024-06-30,06:11,67530,Geld für E-Auto,350,flex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""62bbd6e752f30"", ""type"": ""offerBox"", ""p..."
1836,28630,2024-06-30,06:11,67516,Elektrovorteil,100,fixflex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""62baffaaaa755"", ""type"": ""offerBox"", ""p..."
1837,28632,2024-06-30,06:11,67516,Elektrovorteil,80,direct,Auszahlung in 5 Tagen,"{""id"": ""65c08f69677cd"", ""type"": ""offerBox"", ""p..."
1838,28633,2024-06-30,06:11,67579,wirkaufendeinethg.de,100,fixflex,Auszahlung in 8 bis 12 Wochen,"{""id"": ""63c6a8605c4c6"", ""type"": ""offerBox"", ""p..."


### Format date column, clean DataFrame and delete unneeded columns and duplicate rows

In [52]:
# Convert the Datum column to pandas datetimes so date accessors can be used later.
df_raw = df_raw.assign(Datum=pd.to_datetime(df_raw['Datum']))

In [53]:
# Keep only the columns the analysis uses.
# .copy() makes the result an independent frame, so the in-place operations and
# column assignments that follow do not act on a slice of the original
# (the corresponding SettingWithCopyWarning is silenced by the global
# warnings filter set in the import cell).
df_raw = df_raw[['Datum', 'Provider', 'Preis', 'Bezahlmodel', 'Eigenschaft']].copy()

In [54]:
# Remove rows that are identical in every remaining column and renumber the index from 0.
df_raw = df_raw.drop_duplicates(ignore_index=True)

In [55]:
# Replace the index with a fresh 0..n-1 range, discarding the old index values.
df_raw = df_raw.reset_index(drop=True)

### Reduce data from daily to weekly granularity

In [56]:
# ISO calendar week of every observation, computed with the vectorised .dt
# accessor instead of a per-row Python lambda. The cast to int64 keeps the
# plain-integer dtype that the row-wise apply produced.
df_raw['KW'] = df_raw['Datum'].dt.isocalendar().week.astype('int64')

In [57]:
# Discard every row whose calendar week is 52.
# NOTE(review): presumably this removes year-boundary rows; it would also drop
# a genuine week 52 once data from the end of the year is present - confirm.
df_raw = df_raw.drop(index=df_raw.index[df_raw['KW'].eq(52).to_numpy()])

In [58]:
# Keep one row per (KW, Eigenschaft, Provider): the last one in the current row order.
# NOTE(review): Bezahlmodel is not part of the key - verify that Eigenschaft
# always distinguishes a provider's different payment models.
df_raw = df_raw.drop_duplicates(
    subset=['KW', 'Eigenschaft', 'Provider'],
    keep='last',
)

# Unchanged providers

In [59]:
# For each (Provider, Bezahlmodel, Eigenschaft) offer, flag whether exactly one
# distinct Preis occurs in df_raw. The result holds the three key columns plus
# a boolean column named 0.
unchanged_provider = (
    df_raw
    .groupby(["Provider", "Bezahlmodel", "Preis", "Eigenschaft"])
    .size()                       # one entry per distinct price of an offer
    .reset_index()
    .groupby(['Provider', 'Bezahlmodel', 'Eigenschaft'])
    .size()                       # number of distinct prices per offer
    .eq(1)
    .to_frame()
    .reset_index()
)

In [60]:
# Keep only the offers flagged True and discard the helper flag column 0.
unchanged_provider = unchanged_provider.loc[unchanged_provider[0] == True].drop(columns=0)

In [61]:
# Distinct (Provider, Bezahlmodel, Preis, Eigenschaft) combinations found in
# df_raw; the group-size column 0 is only a by-product and is removed again.
df_unchanged_prices = (
    df_raw
    .groupby(["Provider", "Bezahlmodel", "Preis", 'Eigenschaft'])
    .size()
    .to_frame()
    .reset_index()
    .drop(columns=0)
)

In [62]:
# Attach the price to every unchanged offer (left join on the offer key).
df_unchanged = unchanged_provider.merge(
    df_unchanged_prices,
    how='left',
    on=['Provider', 'Bezahlmodel', 'Eigenschaft'],
)

In [63]:
# Order the unchanged offers from highest to lowest price.
df_unchanged = df_unchanged.sort_values(by=['Preis'], ascending=False)

In [64]:
# Pull the matching df_raw rows back in for each unchanged offer (this adds the
# KW column) and discard the date column.
df_unchanged = (
    df_unchanged
    .merge(df_raw, how='inner', on=['Provider', 'Preis', 'Bezahlmodel', 'Eigenschaft'])
    .drop(columns='Datum')
)

In [65]:
# Per offer keep only the row that comes last after sorting by Provider,
# Eigenschaft and KW, then list the offers from highest to lowest price.
df_unchanged = (
    df_unchanged
    .sort_values(by=['Provider', 'Eigenschaft', 'KW'])
    .drop_duplicates(subset=['Provider', 'Bezahlmodel', 'Eigenschaft', 'Preis'], keep='last')
    .sort_values(by=['Preis'], ascending=False)
)

In [66]:
# Write the unchanged offers to an Excel report named after the current month.
# index=False states the intent explicitly: `index` is a boolean flag, and the
# former None only suppressed the index because it is falsy.
df_unchanged.to_excel(
    os.path.join("Monatsberichte", f"Unveränderte Angebote_Stand {actual_month}.xlsx"),
    index=False,
)

# Changed providers

In [67]:
# Anti-join: keep the df_raw rows that have no counterpart in df_unchanged.
# The merge indicator marks such rows as "left_only"; afterwards the helper
# columns are dropped and the left-hand KW column gets its plain name back.
df_changed = (
    df_raw
    .merge(
        df_unchanged,
        how='outer',
        indicator=True,
        on=['Provider', 'Bezahlmodel', 'Eigenschaft', 'Preis'],
    )
    .query('_merge=="left_only"')
    .drop(columns=['_merge', 'KW_y'])
    .rename(columns={"KW_x": "KW"})
)

In [68]:
# Sort the changed offers by date and renumber the index from 0.
df_changed = df_changed.sort_values(by=['Datum'], ignore_index=True)

In [69]:
def define_fig_name(row):
    """Build the label "<Provider>_<Bezahlmodel>_<Eigenschaft>" for one row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'Provider', 'Bezahlmodel' and
        'Eigenschaft' (for example a DataFrame row).

    Returns
    -------
    str
        The three values joined with underscores, in the order provider,
        payment model, property.
    """
    return "{}_{}_{}".format(row['Provider'], row['Bezahlmodel'], row['Eigenschaft'])

In [70]:
# Add one label per row (provider, payment model, property), later used as the line colour key.
df_changed = df_changed.assign(**{'Neuer Name': df_changed.apply(define_fig_name, axis=1)})

In [71]:
# Line chart of price per calendar week, one coloured line per offer label.
fig = px.line(
    data_frame=df_changed,
    x='KW',
    y='Preis',
    color='Neuer Name',
)

In [72]:
# Linear x-axis ticks with a step of 1, and the legend title text set to None.
fig.update_layout(
    xaxis_tickmode='linear',
    xaxis_dtick=1,
    legend_title_text=None,
)

In [73]:
# Save the chart as an HTML file in the monthly report folder.
fig.write_html(
    os.path.join(
        "Monatsberichte",
        f"Angebote Verivox mit Veränderungen_Stand {actual_month}.html",
    )
)