# Workflow für Datenanalyse und Modellierung

## 1. Bibliotheken importieren

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import mysql.connector


## 2. Daten einlesen

In [None]:

# Example: load the dataset from a CSV file into a DataFrame.
file_path = "data.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows of the loaded data.
data.head()


## 3. Daten aufbereiten

In [None]:

# Drop rows containing missing values, then drop duplicate rows.
data = data.dropna().drop_duplicates()

# Derive a new column (example): twice the value of an existing one.
data = data.assign(new_variable=data['existing_variable'] * 2)

# Open data integration (placeholder for an API call).
# Example: enrich the dataset with records from another source.
# open_data = pd.read_csv("open_data.csv")
# data = data.merge(open_data, on='key_column', how='left')

# Preview the prepared data.
data.head()


## 4. Datenspeicherung in der MySQL-Datenbank

In [None]:

# Connect to the MySQL database (example configuration).
# NOTE: the credentials below are placeholders; real credentials should come
# from the environment or a config file rather than being hard-coded.
conn = mysql.connector.connect(
    host="localhost",
    user="user",
    password="password",
    database="database_name"
)
# try/finally guarantees the cursor and connection are released even when a
# statement in between raises.
try:
    cursor = conn.cursor()
    try:
        # Create the table and insert the data (example).
        # cursor.execute("CREATE TABLE IF NOT EXISTS table_name (columns_definition)")
        # NOTE: DataFrame.to_sql expects a SQLAlchemy connectable (or a sqlite3
        # connection), not a raw mysql.connector connection, e.g.:
        # data.to_sql('table_name', sqlalchemy_engine, if_exists='replace', index=False)
        conn.commit()
    finally:
        cursor.close()
finally:
    conn.close()


## 5. Explorative Datenanalyse (EDA)

### Nicht-grafische Analysen

In [None]:

# Show summary statistics.
print(data.describe())

# Compute pairwise correlations over the numeric columns only.
# Since pandas 2.0, corr() no longer drops non-numeric columns silently and
# raises on text columns, so the restriction is requested explicitly.
print(data.corr(numeric_only=True))


### Grafische Analysen

In [None]:

# Histograms of the columns.
data.hist(figsize=(10, 8))
plt.show()

# Heatmap of the correlation matrix.
# numeric_only=True keeps corr() from raising on non-numeric columns
# (pandas >= 2.0 no longer excludes them by default).
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.show()


## 6. Modellierung

In [None]:

# Separate the features from the target variable.
y = data['target']  # target variable
X = data.drop(columns='target')  # features

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Multiple linear regression (fit() returns the fitted estimator itself).
linear_model = LinearRegression().fit(X_train, y_train)
linear_preds = linear_model.predict(X_test)

# Regression tree
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
tree_preds = tree_model.predict(X_test)

# Random forest regression
forest_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
forest_preds = forest_model.predict(X_test)


## 7. Modellbewertung

In [None]:

# Model evaluation helper
def evaluate_model(true, preds):
    """Return the R² score and the RMSE of predictions against true values.

    Args:
        true: Ground-truth target values (array-like).
        preds: Predicted values (array-like).

    Returns:
        A ``(r2, rmse)`` tuple.
    """
    r2 = r2_score(true, preds)
    # The ``squared=False`` flag of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
    # Rooting each output's MSE before averaging matches what
    # ``squared=False`` computed, including for multi-output targets.
    per_output_mse = mean_squared_error(true, preds, multioutput="raw_values")
    rmse = float(np.mean(np.sqrt(per_output_mse)))
    return r2, rmse

# Evaluate the models: pair each display name with its test-set predictions.
model_names = ("Linear Regression", "Decision Tree", "Random Forest")
predictions = (linear_preds, tree_preds, forest_preds)
models = dict(zip(model_names, predictions))

# Print R2 and RMSE for every model, in the order listed above.
for model_name, preds in models.items():
    r2, rmse = evaluate_model(y_test, preds)
    print(f"{model_name} -> R2: {r2:.2f}, RMSE: {rmse:.2f}")
