# Ein DataFrame in unterschiedliche Dateiformate ausgeben

In [None]:
import pandas as pd
# Read the raw population table from disk
df = pd.read_csv('data/population_total.csv')

# Discard every row that lacks a year or a population value; the integer
# casts further down cannot represent missing values.
df = df.dropna(subset=['year', 'population'])
print(len(df))

# Cast both numeric columns to int in a single call
df = df.astype({'year': int, 'population': int})

'Data Frame geladen als df'

In [None]:
# Display pandas' descriptive summary of df as the cell output
df.describe()

In [None]:
# Export the DataFrame as CSV; the row index is left out of the file
df.to_csv(
    path_or_buf='data/export/population.csv',
    index=False,
)
'population.csv erstellt'

In [None]:
# Export the DataFrame as an Excel workbook; the row index is left out.
# openpyxl is imported up front so a missing package is reported before the export call runs.
import openpyxl
df.to_excel(
    excel_writer='data/export/population.xlsx',
    index=False,
)
'population.xlsx erstellt'

In [None]:
# Export the DataFrame as JSON; orient='records' writes a list with one object per row
df.to_json(
    path_or_buf='data/export/population.json',
    orient='records',
)
'population.json erstellt'

In [None]:
import pandas as pd

# Destination of the XML export
xml_file_path = 'data/export/population.xml'

# Serialize the rows as <record> elements nested under a single <data> root
df.to_xml(
    path_or_buffer=xml_file_path,
    row_name='record',
    root_name='data',
)
'population.xml erstellt'

In [None]:
# Export the DataFrame as Parquet via the pyarrow engine; the row index is left out
df.to_parquet(
    path='data/export/population-pyarrow.parquet',
    engine='pyarrow',
    index=False,
)
'population-pyarrow.parquet erstellt'

In [None]:
# Export the DataFrame as Parquet via the fastparquet engine; the row index is left out
df.to_parquet(
    path='data/export/population-fastparquet.parquet',
    engine='fastparquet',
    index=False,
)
'population-fastparquet.parquet erstellt'

In [None]:
# Save DataFrame to AVRO file
import pandas as pd
import fastavro

# Avro schema for the exported records. Every field is a union with 'null'
# so that a missing value can still be written.
# NOTE(review): assumes df has exactly the columns country, year and
# population -- confirm against the CSV header.
avro_schema = fastavro.parse_schema({
    'type': 'record',
    'name': 'population_data',
    'fields': [
        {'name': 'country', 'type': ['null', 'string']},
        {'name': 'year', 'type': ['null', 'int']},
        # Avro 'int' is a 32-bit signed type; population counts can exceed
        # 2,147,483,647 (e.g. aggregate rows), so the 64-bit 'long' is used.
        {'name': 'population', 'type': ['null', 'long']},
    ]
})

# Write one Avro record per DataFrame row
with open('data/export/population.avro', 'wb') as avro_file:
    fastavro.writer(avro_file, avro_schema, df.to_dict(orient='records'))

'population.avro erstellt'

## ORC benötigt eine korrekt konfigurierte Hadoop-Umgebung

Dieses Beispiel benötigt weitere Vorbereitung (u. a. eine lauffähige Spark-/Hadoop-Installation).

In [None]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("DataFrameToORC").getOrCreate()

# Specify the path to the ORC file
orc_file_path = "data/export/population.orc"

try:
    # Create a Spark DataFrame from the Pandas DataFrame
    spark_df = spark.createDataFrame(df)

    # Write the DataFrame to the ORC file, replacing any previous export
    spark_df.write.mode("overwrite").orc(orc_file_path)
finally:
    # Stop the Spark session even when the conversion or the write fails,
    # so a failed run does not leave the session running.
    spark.stop()

# Only reached when the write completed without raising
print(f"DataFrame successfully written to {orc_file_path}")