<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/1e/UNAL_Logosimbolo.svg/583px-UNAL_Logosimbolo.svg.png" alt="" width="1280" height="300" /></p>


# WAYS TO CREATE A SPARK DATAFRAME

There are multiple ways to create a DataFrame in Spark, depending on the source and structure of your data. Whether you're working with in-memory collections, external files, or structured data from databases, Spark provides flexible options to define and build DataFrames efficiently.

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Print the built-in docstring of createDataFrame as a reference for the
# data / schema combinations it accepts.
help(spark.createDataFrame)

## NATIVE PYTHON

### VARIABLE

In [0]:
# createDataFrame expects a collection of records, so the scalar is wrapped in
# a one-element list; the "integer" type string describes its single column.
age = 15
age_df = spark.createDataFrame([age], "integer")
age_df.display()

### LIST

#### WAY A

In [0]:
# Each string in the list becomes one row; the column type is given as a
# type string.
elements = [f"user_{suffix}" for suffix in "abc"]
spark.createDataFrame(elements, "string").display()

#### WAY B

In [0]:
from pyspark.sql.types import StringType

# A plain list of strings again, but the column type is passed as a DataType
# instance instead of a type string.
elements = ["user_a", "user_b", "user_c"]
string_type = StringType()
spark.createDataFrame(elements, string_type).display()



### DICT

In [0]:
# One dict per row: the keys serve as column names and, since no schema is
# passed, Spark infers the column types from the values.
info = {"users": ["user_a", "user_b", "user_c"], "ages": [1, 2, 3], "name": "test"}

# The same dict is used twice to produce a two-row DataFrame.
spark.createDataFrame([info] * 2).display()


### TUPLES

#### WAY A

In [0]:

# Rows are (id, user) tuples; the DDL-style string names and types both columns.
elements = [(user_id, f"user_{user_id}") for user_id in range(1, 4)]
spark.createDataFrame(elements, "id int, users string").display()

#### WAY B

In [0]:

# Rows as (id, name) tuples.
elements = [(1, 'user_1'), (2, 'user_2'), (3, 'user_3')]

# Explicit schema built field by field; both fields keep the default
# nullability, exactly as when StructField is called without `nullable`.
schema = StructType().add("id", IntegerType()).add("name", StringType())

spark.createDataFrame(elements, schema).display()

#### WAY C

In [0]:
# Only the column names are supplied; the column types are left for Spark to
# infer from the tuple values.
columns = ["id", "user_name"]
elements = [(idx, f"user_{idx}") for idx in (1, 2, 3)]

spark.createDataFrame(elements, columns).display()

## ROW

In [0]:
from pyspark.sql import Row

# Each Row carries its own field names, so no schema argument is needed:
# createDataFrame derives the columns (id, name) from the Row objects.
elements = [
    Row(id=1, name='user_1'),
    Row(id=2, name='user_2'),
    Row(id=3, name='user_3'),  # id 3 maps to user_3, matching the other examples
]

# Kept in a variable because the next cell inspects it with collect().
element = spark.createDataFrame(elements)
display(element)

In [0]:
# collect() returns every row to the driver as a list of Row objects; fine for
# this three-row example, but avoid it on large DataFrames.
element.collect()

## PANDAS

In [0]:
import pandas as pd

# Build a pandas DataFrame column by column, then hand it to Spark; no schema
# is passed, so column names and types come from the pandas frame.
elements = pd.DataFrame({
    "id": [1, 2, 3],
    "name": ["user_1", "user_2", "user_3"],
})
pandas_based_df = spark.createDataFrame(elements)
pandas_based_df.display()

## RDD

In [0]:
# Distribute a local list of (name, age) tuples as an RDD.
rdd = spark.sparkContext.parallelize(
    [("Alice", 25), ("Bob", 30), ("Charlie", 22)]
)

# Explicit schema matching the tuple layout; both columns are nullable.
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
    ]
)

# Apply the schema to the RDD, then print the schema and the rows.
df = spark.createDataFrame(rdd, schema)
df.printSchema()
df.show()