In [1]:
import numpy as np
import pandas as pd
import os
import sqlite3
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
plt.rc("axes.spines", top=False, right=False)
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['figure.dpi'] = 100

In [2]:
DATA_DIR = '../output'

# Connect SQLite Data-Warehouse

In [3]:
# SQLite Datenbank Pfad 
SQLITE_FILE_PATH = os.path.join("..", "output", "dwh.sqlite3")

In [4]:
# Connection zur Datenbank 
con = sqlite3.connect(SQLITE_FILE_PATH)

In [5]:
query_test = """
SELECT *
FROM marktstammdaten
"""
#marktstammdaten = pd.read_sql_query(query_test, con=con)
#marktstammdaten

# OLS-Regression - Betrachtung einzelner Faktoren
**Analyse möglicher Einflussfaktoren für den Solar-Ausbau**

## Bruttoinlandsprodukt BIP

In [6]:
query_bip = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, b.BIP
FROM marktstammdaten AS m
JOIN bip AS b
ON m.[AGS-5] = b.[AGS-5]
GROUP BY m.[AGS-5], b.BIP
"""
bip_df = pd.read_sql_query(query_bip, con=con)
bip_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,BIP
0,1001,1252,63.55
1,1002,1970,68.26
2,1003,2565,72.59


In [7]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ BIP", data=bip_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     10.87
Date:                Fri, 19 Apr 2024   Prob (F-statistic):            0.00107
Time:                        14:50:31   Log-Likelihood:                -4018.5
No. Observations:                 400   AIC:                             8041.
Df Residuals:                     398   BIC:                             8049.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2206.1922   1759.349      1.254      0.2

## Entwicklung der Bevölkerungszahl

In [8]:
query_bevoelkerungsentwicklung = """
SELECT m.[AGS-8], COUNT(m.[AGS-8]) AS Anzahl_PV_Anlagen, b.[Bev_Entwicklung_%]
FROM marktstammdaten AS m
JOIN bevoelkerungsentwicklung AS b
ON m.[AGS-8] = b.[AGS-8]
GROUP BY m.[AGS-8], b.[Bev_Entwicklung_%]
"""
bevoelkerungsentwicklung_df = pd.read_sql_query(query_bevoelkerungsentwicklung, con=con)
bevoelkerungsentwicklung_df.rename(columns={"Bev_Entwicklung_%": "Bev_Entwicklung"}, inplace=True)
bevoelkerungsentwicklung_df.head(3)

Unnamed: 0,AGS-8,Anzahl_PV_Anlagen,Bev_Entwicklung
0,1001000,1252,0.84
1,1002000,1970,-0.1
2,1003000,2565,-0.04


In [9]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Bev_Entwicklung", data=bevoelkerungsentwicklung_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.352
Date:                Fri, 19 Apr 2024   Prob (F-statistic):             0.0671
Time:                        14:50:35   Log-Likelihood:                -85287.
No. Observations:               11107   AIC:                         1.706e+05
Df Residuals:                   11105   BIC:                         1.706e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         285.9606      4.966     

## Bevölkerungsdichte

In [10]:
query_bevoelkerungsdichte = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, b.[Bevölkerung pro km2]
FROM marktstammdaten AS m
JOIN bevoelkerungsdichte AS b
ON m.[AGS-5] = b.[AGS-5]
GROUP BY m.[AGS-5], b.[Bevölkerung pro km2]
"""
bevoelkerungsdichte_df = pd.read_sql_query(query_bevoelkerungsdichte, con=con)
bevoelkerungsdichte_df.rename(columns={"Bevölkerung pro km2": "Bev_Dichte"}, inplace=True)
bevoelkerungsdichte_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Bev_Dichte
0,1001,1252,1606.0
1,1002,1970,2075.0
2,1003,2565,1010.0


In [11]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Bev_Dichte", data=bevoelkerungsdichte_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     30.18
Date:                Fri, 19 Apr 2024   Prob (F-statistic):           7.04e-08
Time:                        14:50:38   Log-Likelihood:                -4009.2
No. Observations:                 400   AIC:                             8022.
Df Residuals:                     398   BIC:                             8030.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   9070.9608    343.060     26.441      0.0

## Eigenheimquote

In [12]:
query_wohneigentum = """
SELECT m.[AGS-8], COUNT(m.[AGS-8]) AS Anzahl_PV_Anlagen, e.[Eigentum_%]
FROM marktstammdaten AS m
JOIN eigenheimquote AS e
ON m.[AGS-8] = e.[AGS-8]
GROUP BY m.[AGS-8], e.[Eigentum_%]
"""
wohneigentum_df = pd.read_sql_query(query_wohneigentum, con=con)
wohneigentum_df.rename(columns={"Eigentum_%": "Eigentumsquote"}, inplace=True)
wohneigentum_df.head(3)

Unnamed: 0,AGS-8,Anzahl_PV_Anlagen,Eigentumsquote
0,1001000,1252,48.2
1,1002000,1970,44.6
2,1003000,2565,49.4


In [13]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Eigentumsquote", data=wohneigentum_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.080
Method:                 Least Squares   F-statistic:                     961.3
Date:                Fri, 19 Apr 2024   Prob (F-statistic):          1.66e-202
Time:                        14:50:42   Log-Likelihood:                -84828.
No. Observations:               11107   AIC:                         1.697e+05
Df Residuals:                   11105   BIC:                         1.697e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       1054.0483     25.235     41.

## "Grüne"-Wähler bei der Bundestagswahl 2021

In [14]:
query_gruene = """
SELECT m.[AGS-8], COUNT(m.[AGS-8]) AS Anzahl_PV_Anlagen, AVG(w.[GRÜNE_Prozent]) AS Gruene_Prozent
FROM marktstammdaten AS m
JOIN bundestagswahlergenisse AS w
ON m.[AGS-8] = w.[AGS-8]
GROUP BY m.[AGS-8], w.[GRÜNE_Prozent]
"""
gruene_df = pd.read_sql_query(query_gruene, con=con)
gruene_df.head(3)

Unnamed: 0,AGS-8,Anzahl_PV_Anlagen,Gruene_Prozent
0,1001000,1252,35.575106
1,1002000,1970,28.713637
2,1003000,2565,22.054262


In [15]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Gruene_Prozent", data=gruene_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.070
Method:                 Least Squares   F-statistic:                     841.4
Date:                Fri, 19 Apr 2024   Prob (F-statistic):          2.21e-178
Time:                        14:50:46   Log-Likelihood:                -84884.
No. Observations:               11107   AIC:                         1.698e+05
Df Residuals:                   11105   BIC:                         1.698e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         57.4528      9.210      6.

## "AFD"-Wähler bei der Bundestagswahl 2021

In [16]:
query_afd = """
SELECT m.[AGS-8], COUNT(m.[AGS-8]) AS Anzahl_PV_Anlagen, AVG(w.[AFD_Prozent]) AS AFD_Prozent
FROM marktstammdaten AS m
JOIN bundestagswahlergenisse AS w
ON m.[AGS-8] = w.[AGS-8]
GROUP BY m.[AGS-8], w.[AFD_Prozent]
"""
afd_df = pd.read_sql_query(query_afd, con=con)
afd_df.head(3)

Unnamed: 0,AGS-8,Anzahl_PV_Anlagen,AFD_Prozent
0,1001000,1252,5.276728
1,1002000,1970,4.696032
2,1003000,2565,6.434268


In [17]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ AFD_Prozent", data=afd_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     416.7
Date:                Fri, 19 Apr 2024   Prob (F-statistic):           5.84e-91
Time:                        14:50:49   Log-Likelihood:                -85085.
No. Observations:               11107   AIC:                         1.702e+05
Df Residuals:                   11105   BIC:                         1.702e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     435.5569      8.813     49.424      

## "SPD"-Wähler bei der Bundestagswahl 2021

In [18]:
query_spd = """
SELECT m.[AGS-8], COUNT(m.[AGS-8]) AS Anzahl_PV_Anlagen, AVG(w.[SPD_Prozent]) AS SPD_Prozent
FROM marktstammdaten AS m
JOIN bundestagswahlergenisse AS w
ON m.[AGS-8] = w.[AGS-8]
GROUP BY m.[AGS-8], w.[SPD_Prozent]
"""
spd_df = pd.read_sql_query(query_spd, con=con)
spd_df.head(3)

Unnamed: 0,AGS-8,Anzahl_PV_Anlagen,SPD_Prozent
0,1001000,1252,20.634546
1,1002000,1970,29.632519
2,1003000,2565,34.404717


In [19]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ SPD_Prozent", data=spd_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.024
Date:                Fri, 19 Apr 2024   Prob (F-statistic):              0.312
Time:                        14:50:53   Log-Likelihood:                -85289.
No. Observations:               11107   AIC:                         1.706e+05
Df Residuals:                   11105   BIC:                         1.706e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     299.4195     14.459     20.708      

## Durchschnittliches verfügbares Einkommen (2005 bis 2020)

In [20]:
query_einkommen = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
AVG((haushaltseink_2005 + haushaltseink_2006 + haushaltseink_2007 + haushaltseink_2008 + haushaltseink_2009 + 
haushaltseink_2010 + haushaltseink_2011 + haushaltseink_2012 + haushaltseink_2013 + haushaltseink_2014 + 
haushaltseink_2015 + haushaltseink_2016 + haushaltseink_2017 + haushaltseink_2018 + haushaltseink_2019 + 
haushaltseink_2020) / 16) AS Durchschnitt_Einkommen
FROM marktstammdaten AS m
JOIN einkommensentwicklung AS e
ON m.[AGS-5] = e.[AGS-5]
GROUP BY m.[AGS-5]
"""
einkommen_df = pd.read_sql_query(query_einkommen, con=con)
einkommen_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Durchschnitt_Einkommen
0,1001,1252,17886.0
1,1002,1970,17739.0
2,1003,2565,18338.0


In [21]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Durchschnitt_Einkommen", data=einkommen_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.060
Method:                 Least Squares   F-statistic:                     26.38
Date:                Fri, 19 Apr 2024   Prob (F-statistic):           4.40e-07
Time:                        14:50:59   Log-Likelihood:                -4011.0
No. Observations:                 400   AIC:                             8026.
Df Residuals:                     398   BIC:                             8034.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept              -3887

## Arbeitslosenquote (2022)

In [22]:
query_arbeitslosenquote = """
SELECT m.[Bundesland], COUNT(m.[Bundesland]) AS Anzahl_PV_Anlagen, a.[Arbeitslosenquote_%] AS Arbeitslosenquote
FROM marktstammdaten AS m
JOIN arbeitslosenquote AS a
ON m.[Bundesland] = a.[Bundesland]
GROUP BY m.[Bundesland], a.[Arbeitslosenquote_%]
"""
arbeitslosenquote_df = pd.read_sql_query(query_arbeitslosenquote, con=con)
arbeitslosenquote_df.head(3)

Unnamed: 0,Bundesland,Anzahl_PV_Anlagen,Arbeitslosenquote
0,Baden-Württemberg,545291,3.5
1,Bayern,844437,3.1
2,Berlin,21110,8.8


In [23]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Arbeitslosenquote", data=arbeitslosenquote_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.378
Model:                            OLS   Adj. R-squared:                  0.334
Method:                 Least Squares   F-statistic:                     8.521
Date:                Fri, 19 Apr 2024   Prob (F-statistic):             0.0112
Time:                        14:51:01   Log-Likelihood:                -216.94
No. Observations:                  16   AIC:                             437.9
Df Residuals:                      14   BIC:                             439.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          6.968e+05   1.78e+0



## Ausländeranteil (2021)

In [24]:
query_auslaenderanteil = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
a.[Auslaender_%] AS Ausländeranteil
FROM marktstammdaten AS m
JOIN auslaenderanteil AS a
ON m.[AGS-5] = a.[AGS-5]
GROUP BY m.[AGS-5]
"""
auslaenderanteil_df = pd.read_sql_query(query_auslaenderanteil, con=con)
auslaenderanteil_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Ausländeranteil
0,1001,1252,16.99
1,1002,1970,12.5
2,1003,2565,11.21


In [25]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Ausländeranteil", data=auslaenderanteil_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.676
Date:                Fri, 19 Apr 2024   Prob (F-statistic):              0.196
Time:                        14:51:04   Log-Likelihood:                -4023.0
No. Observations:                 400   AIC:                             8050.
Df Residuals:                     398   BIC:                             8058.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept        8690.0123    649.898     

## Bildungsstand: ohne Hauptschulabschluss und / oder ohne Berufsausbildung (2021)

In [26]:
query_bildung1 = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
(b.bquali_oabschl) + (b.schule_oabschl) AS ohne_Abschluss
FROM marktstammdaten AS m
JOIN bildung AS b
ON m.[AGS-5] = b.[AGS-5]
GROUP BY m.[AGS-5]
"""
bildung1_df = pd.read_sql_query(query_bildung1, con=con)
bildung1_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,ohne_Abschluss
0,1001,1252,22.99
1,1002,1970,20.75
2,1003,2565,21.59


In [27]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ ohne_Abschluss", data=bildung1_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                   0.04020
Date:                Fri, 19 Apr 2024   Prob (F-statistic):              0.841
Time:                        14:51:08   Log-Likelihood:                -4023.8
No. Observations:                 400   AIC:                             8052.
Df Residuals:                     398   BIC:                             8060.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       8274.3542   1728.363      4.

## Bildungsstand: anerkannter Berufsabschluss und / oder akademischer Abschluss (2021)

In [28]:
query_bildung2 = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
(b.bquali_unifh) + (b.bquali_mabschl) AS mit_Abschluss
FROM marktstammdaten AS m
JOIN bildung AS b
ON m.[AGS-5] = b.[AGS-5]
GROUP BY m.[AGS-5]
"""
bildung2_df = pd.read_sql_query(query_bildung2, con=con)
bildung2_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,mit_Abschluss
0,1001,1252,77.99
1,1002,1970,79.85
2,1003,2565,77.66


In [29]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ mit_Abschluss", data=bildung2_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     15.85
Date:                Fri, 19 Apr 2024   Prob (F-statistic):           8.16e-05
Time:                        14:51:11   Log-Likelihood:                -4016.0
No. Observations:                 400   AIC:                             8036.
Df Residuals:                     398   BIC:                             8044.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      2.889e+04   5271.907      5.480

## E-Ladesäulen pro 100.000 Einwohner

In [30]:
query_ladesaeule = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
e.eLade AS Ladepunkt
FROM marktstammdaten AS m
JOIN [e-mobility] AS e
ON m.[AGS-5] = e.[AGS-5]
GROUP BY m.[AGS-5]
"""
ladesaeule_df = pd.read_sql_query(query_ladesaeule, con=con)
ladesaeule_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Ladepunkt
0,1001,1252,117.44
1,1002,1970,99.5
2,1003,2565,61.96


In [31]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Ladepunkt", data=ladesaeule_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.001
Date:                Fri, 19 Apr 2024   Prob (F-statistic):              0.318
Time:                        14:51:14   Log-Likelihood:                -4023.4
No. Observations:                 400   AIC:                             8051.
Df Residuals:                     398   BIC:                             8059.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   7513.9765    505.130     14.875      0.0

## Anteil E-Autos an allen PkW

In [32]:
query_eauto = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
e.eauto AS Anteil_eAuto
FROM marktstammdaten AS m
JOIN [e-mobility] AS e
ON m.[AGS-5] = e.[AGS-5]
GROUP BY m.[AGS-5]
"""
eauto_df = pd.read_sql_query(query_eauto, con=con)
eauto_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Anteil_eAuto
0,1001,1252,1.63
1,1002,1970,1.37
2,1003,2565,1.15


In [33]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Anteil_eAuto", data=eauto_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     8.167
Date:                Fri, 19 Apr 2024   Prob (F-statistic):            0.00449
Time:                        14:51:17   Log-Likelihood:                -4019.8
No. Observations:                 400   AIC:                             8044.
Df Residuals:                     398   BIC:                             8052.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     5921.3387    757.660      7.815   

## Anteil fertiggestellter Wohnungen mit primär erneuerbarer Heizenergie

In [34]:
query_heizung = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
e.[Heizung_Erneuerbare_%] AS Anteil_Erneuerbare
FROM marktstammdaten AS m
JOIN erneuerbare_energien AS e
ON m.[AGS-5] = e.[AGS-5]
GROUP BY m.[AGS-5]
"""
heizung_df = pd.read_sql_query(query_heizung, con=con)
heizung_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Anteil_Erneuerbare
0,1001,1252,0.3
1,1002,1970,7.9
2,1003,2565,24.9


In [35]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Anteil_Erneuerbare", data=heizung_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.099
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     43.75
Date:                Fri, 19 Apr 2024   Prob (F-statistic):           1.20e-10
Time:                        14:51:20   Log-Likelihood:                -4003.0
No. Observations:                 400   AIC:                             8010.
Df Residuals:                     398   BIC:                             8018.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept           3791.1398    681

## Sonnenstunden

In [36]:
query_sonne = """
SELECT m.[AGS-5], COUNT(m.[AGS-5]) AS Anzahl_PV_Anlagen, 
s.Sonnenstunden AS Sonnenstunden
FROM marktstammdaten AS m
JOIN sonnenstunden_plz AS s
ON m.[Postleitzahl] = s.[Postleitzahl]
GROUP BY m.[Postleitzahl]
"""
sonne_df = pd.read_sql_query(query_sonne, con=con)
sonne_df.head(3)

Unnamed: 0,AGS-5,Anzahl_PV_Anlagen,Sonnenstunden
0,14612,41,1718.1
1,14612,64,1718.1
2,14612,60,1718.1


In [37]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Sonnenstunden", data=sonne_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     7.952
Date:                Fri, 19 Apr 2024   Prob (F-statistic):            0.00482
Time:                        14:51:23   Log-Likelihood:                -59513.
No. Observations:                8160   AIC:                         1.190e+05
Df Residuals:                    8158   BIC:                         1.190e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       614.5242     80.128      7.669

**Ergebnisse**   
Keine der betrachteten möglichen Einflussfaktoren hat für sich genommen einen sehr hohen Impact auf die Anzahl    
der Installierten PV-Anlagen in der jeweiligen Region.    
   
**Die höchsten R-squared Werte mit über 5% erreichten:**       
Anteil fertiggestellter Wohnungen mit primär erneuerbarer Heizenergie (R-squared 0.099)   
Eigenheimquote (R-squared 0.080) 
"Grüne"-Wähler bei der Bundestagswahl 2021 (R-squared 0.070)  
Bevölkerungsdichte (R-squared 0.070)
Durchschnittliches verfügbares Einkommen (R-squared 0.062)   
    
Die Arbeitslosenquote könnte relevant sein (R-squared 0.378). Da die Betrachtung sich jedoch nur auf 16 Werte für die einzelnen Bundesländer bezieht, ist das Ergebnis nicht aussagekräftig.

# OLS-Regression - Betrachtung kombinierter Faktoren

## Eigentumsquote + Bevölkerungsdichte

In [38]:
query_div1 = """
SELECT m.[AGS-5], m.[AGS-8], COUNT(m.[AGS-8]) AS Anzahl_PV_Anlagen, 
e.[Eigentum_%] AS Eigentumsquote, 
b.[Bevölkerung pro km2] AS Bevoelkerungsdichte
FROM marktstammdaten AS m
JOIN eigenheimquote AS e
ON m.[AGS-8] = e.[AGS-8]
JOIN bevoelkerungsdichte AS b
ON m.[AGS-5] = b.[AGS-5]
GROUP BY m.[AGS-8], e.[Eigentum_%]
"""
div1_df = pd.read_sql_query(query_div1, con=con)
div1_df.head(3)

Unnamed: 0,AGS-5,AGS-8,Anzahl_PV_Anlagen,Eigentumsquote,Bevoelkerungsdichte
0,1001,1001000,1252,48.2,1606.0
1,1002,1002000,1970,44.6,2075.0
2,1003,1003000,2565,49.4,1010.0


In [39]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Eigentumsquote + Bevoelkerungsdichte", data=div1_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.391
Model:                            OLS   Adj. R-squared:                  0.391
Method:                 Least Squares   F-statistic:                     3571.
Date:                Fri, 19 Apr 2024   Prob (F-statistic):               0.00
Time:                        14:51:28   Log-Likelihood:                -82531.
No. Observations:               11107   AIC:                         1.651e+05
Df Residuals:                   11104   BIC:                         1.651e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept             418.9541    

## Strompreise + Einspeisevergütung

In [40]:
query_div2 = """
SELECT marktstammdaten.[Inbetriebnahmedatum], 
       COUNT(marktstammdaten.[Inbetriebnahmedatum]) AS Anzahl_PV_Anlagen,
       strompreis.[Strom_Verbraucherpreis], 
       [Einspeiseverguetung_kleine_PV_Anlagen]
FROM marktstammdaten
LEFT JOIN strompreis
ON strftime('%Y', marktstammdaten.[Inbetriebnahmedatum]) = strompreis.[Jahr]
LEFT JOIN einspeiseverguetung
ON marktstammdaten.Inbetriebnahmedatum = einspeiseverguetung.Datum

GROUP BY marktstammdaten.[Inbetriebnahmedatum];
"""
div2_df = pd.read_sql_query(query_div2, con=con)
#div2_df

In [41]:
# fehlenden Werte in Einspeiseverguetung_kleine_PV_Anlagen mit dem vorherigen Wert auffüllen
div2_df['Einspeiseverguetung_kleine_PV_Anlagen'].fillna(method='ffill', inplace=True)
div2_df

Unnamed: 0,Inbetriebnahmedatum,Anzahl_PV_Anlagen,Strom_Verbraucherpreis,Einspeiseverguetung_kleine_PV_Anlagen
0,1900-01-01 00:00:00,10,,
1,1900-02-24 00:00:00,1,,
2,1900-04-22 00:00:00,1,,
3,1904-04-13 00:00:00,1,,
4,1905-11-08 00:00:00,1,,
...,...,...,...,...
11340,2023-06-27 00:00:00,2654,46.27,7.65
11341,2023-06-28 00:00:00,2475,46.27,7.65
11342,2023-06-29 00:00:00,2017,46.27,7.65
11343,2023-06-30 00:00:00,1542,46.27,7.65


In [42]:
model = smf.ols(formula="Anzahl_PV_Anlagen ~ Strom_Verbraucherpreis + Einspeiseverguetung_kleine_PV_Anlagen", data=div2_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      Anzahl_PV_Anlagen   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.336
Method:                 Least Squares   F-statistic:                     2159.
Date:                Fri, 19 Apr 2024   Prob (F-statistic):               0.00
Time:                        14:51:33   Log-Likelihood:                -65759.
No. Observations:                8514   AIC:                         1.315e+05
Df Residuals:                    8511   BIC:                         1.315e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

## Strompreis + Einspeiseverguetung + Fukushima + Solardeckel 

In [43]:
# timeseries_df Tabelle öffnen
with open('timeseries_df.pkl', 'rb') as f:
    timeseries_df = pickle.load(f)
timeseries_df

Unnamed: 0_level_0,GesamtBruttoleistung,Anzahl_Volleinspeisung,Anzahl_Teileinspeisung,Einspeiseverguetung,Strompreis,PV_Kosten_Euro_pro_kWp,Fukushima,Ukrainekrieg,Solardeckel
Jahr_Monat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-01,42308.897,6944,2620,50.62,14.92,8100,0,0,0
2000-02-01,757.749,99,25,50.62,14.92,8100,0,0,0
2000-03-01,698.163,163,21,50.62,14.92,8100,0,0,0
2000-04-01,3479.965,871,153,50.62,14.92,8100,0,0,0
2000-05-01,3285.015,709,162,50.62,14.92,8100,0,0,0
...,...,...,...,...,...,...,...,...,...
2023-08-01,2304090.367,3807,98267,7.65,46.27,1250,0,0,0
2023-09-01,1358857.379,3443,89448,7.65,46.27,1250,0,0,0
2023-10-01,2101186.830,3008,77553,7.65,46.27,1250,0,0,0
2023-11-01,1581038.041,2221,66049,7.65,46.27,1250,0,0,0


In [44]:
model = smf.ols(formula="GesamtBruttoleistung ~ Strompreis + Einspeiseverguetung + Fukushima + Solardeckel", data=timeseries_df)
results = model.fit()
print(results.summary())

                             OLS Regression Results                             
Dep. Variable:     GesamtBruttoleistung   R-squared:                       0.437
Model:                              OLS   Adj. R-squared:                  0.429
Method:                   Least Squares   F-statistic:                     55.01
Date:                  Fri, 19 Apr 2024   Prob (F-statistic):           2.82e-34
Time:                          14:51:33   Log-Likelihood:                -4084.4
No. Observations:                   288   AIC:                             8179.
Df Residuals:                       283   BIC:                             8197.
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept     

In [45]:
# Verbindung zur Datenbank trennen
con.close()