# Diverse Dateitypen in ein DataFrame lesen

| Dateityp | Bedeutung | Anmerkung |
| --- | --- | --- |
| CSV | Comma Separated Values | Textdatei mit Trennzeichen |
| Excel | Excel XLSX Dateiformat | Dateiformat aus der Tabellenkalkulation Microsoft Excel |
| JSON | JavaScript Object Notation | Array aus JavaScript Objekten |
| XML | Extensible Markup Language | Struktur aus Elementen und Attributen, ähnlich wie HTML |
| Parquet | Apache Parquet | [What is parquet?](https://www.databricks.com/glossary/what-is-parquet) |
| Avro | Apache Avro | [Wikipedia](https://de.wikipedia.org/wiki/Apache_Avro) |
| ORC | Optimized Row Columnar | [Language Manual ORC](https://cwiki.apache.org/confluence/display/hive/languagemanual+orc) |

In [None]:
# CSV — Comma Separated Values
import pandas as pd

# Location of the CSV export (relative to the current working directory)
csv_file_path = 'data/export/population.csv'

# Parse the delimited text file into a DataFrame using pandas' default settings
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show the first rows as the cell output (5 is the head() default)
df.head(n=5)

In [None]:
# Excel
import pandas as pd

# Location of the XLSX export (relative to the current working directory)
excel_file_path = 'data/export/population.xlsx'

# Load the workbook into a DataFrame using pandas' default settings
df = pd.read_excel(io=excel_file_path)

# Show the first ten rows as the cell output
df.head(n=10)

In [None]:
# JSON – JavaScript Object Notation
import pandas as pd

# Location of the JSON export (relative to the current working directory)
json_file_path = 'data/export/population.json'

# Load the JSON document into a DataFrame using pandas' default settings
df = pd.read_json(path_or_buf=json_file_path)

# Show the first ten rows as the cell output
df.head(n=10)

In [None]:
# XML – Extensible Markup Language
import pandas as pd
import xml.etree.ElementTree as ET

# Location of the XML export (relative to the current working directory)
xml_file_path = 'data/export/population.xml'

# Parse the document and fetch its root element
tree = ET.parse(xml_file_path)
root = tree.getroot()

# Build one dictionary per direct child of the root, mapping the tag of each
# grandchild to its text. The values are element text, so they are strings
# (or None for empty elements), not numbers.
data_list = [
    {field.tag: field.text for field in record}
    for record in root
]

# Assemble the DataFrame; only the three listed columns are kept, in this order
df = pd.DataFrame(data=data_list, columns=['country', 'year', 'population'])

# Show the first ten rows as the cell output
df.head(n=10)

In [None]:
# Parquet – Apache Parquet
import pandas as pd

# Location of the Parquet export (relative to the current working directory)
# NOTE(review): the file name suggests it was written with fastparquet — confirm
parquet_file_path = 'data/export/population-fastparquet.parquet'

# Read the Parquet file into a DataFrame using pandas' default engine selection
df = pd.read_parquet(path=parquet_file_path)

# Show the first ten rows as the cell output
df.head(n=10)

In [None]:
# Avro
import pandas as pd
import fastavro

# Location of the Avro export (relative to the current working directory)
avro_file_path = 'data/export/population.avro'

# Open the file in binary mode and materialize every record while it is still
# open; the reader pulls from the file handle as it is iterated.
with open(avro_file_path, mode='rb') as avro_file:
    data = list(fastavro.reader(avro_file))

# Turn the list of records into a DataFrame
df = pd.DataFrame(data=data)

# Show the first ten rows as the cell output
df.head(n=10)

In [None]:
# ORC – Optimized Row Columnar
# No demo available yet.