<a href="https://colab.research.google.com/github/AdvaySharma147/College_Experiments/blob/main/Big_Data_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment 5

**Installation**

In [None]:
!pip install pyspark



In [None]:
# Spark SQL
!pip install pyspark[sql]
# pandas API on Spark
!pip install pyspark[pandas_on_spark] plotly  # to plot your data, you can install plotly together.
# Spark Connect
!pip install pyspark[connect]



In [None]:
PYSPARK_RELEASE_MIRROR= 'http://mirror.apache-kr.org'
PYSPARK_HADOOP_VERSION=3
!pip install pyspark -v

Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)


In [None]:
!pip install 'pyarrow>=4.0.0' --prefer-binary



**Dataframe Creation**

In [None]:
from pyspark.sql import SparkSession

# Build Spark session with local master
spark = SparkSession.builder.master("local[*]").appName("SparkSessionExample").getOrCreate()


In [None]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

In [None]:
# Build a DataFrame from a list of Row objects; column types are inferred
# from the Python values. The three records hold the same values as the
# tuple-based and pandas-based constructions of this DataFrame.
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=3, b=4., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])

In [None]:
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [None]:
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')

In [None]:
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [None]:
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})

In [None]:
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [None]:
# The DataFrames created above are meant to be equivalent; print the rows and
# the schema of the most recently created one.
df.show()
df.printSchema()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



**Viewing Data**

In [None]:
df.show(1)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 1 row



In [None]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [None]:
df.show(1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
only showing top 1 row



In [None]:
df.columns

['a', 'b', 'c', 'd', 'e']

In [None]:
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [None]:
df.select("a", "b", "c").describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   NULL|
| stddev|1.0|1.0|   NULL|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [None]:
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [None]:
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [None]:
df.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


**Selecting and Accessing Data**

In [None]:
df.a

Column<'a'>

In [None]:
from pyspark.sql import Column
from pyspark.sql.functions import upper
type(df.c) == type(upper(df.c)) == type(df.c.isNull())

True

In [None]:
df.select(df.c).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [None]:
df.withColumn('upper_c', upper(df.c)).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3|
+---+---+-------+----------+-------------------+-------+



In [None]:
df.filter(df.a == 1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



**Applying a Function**

In [None]:
import pandas as pd
from pyspark.sql.functions import pandas_udf
@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Pandas UDF (declared return type 'long') that adds 1 to every element of ``series``."""
    # Vectorised addition on the whole pandas Series.
    incremented = series + 1
    return incremented
df.select(pandas_plus_one(df.a)).show()

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 4|
+------------------+



In [None]:
def pandas_filter_func(iterator):
    """Yield, for each pandas DataFrame in ``iterator``, only the rows whose ``a`` equals 1."""
    for batch in iterator:
        keep_rows = batch.a == 1  # boolean mask over the rows of this batch
        yield batch[keep_rows]
df.mapInPandas(pandas_filter_func, schema=df.schema).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



**Grouping Data**

In [None]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [None]:
df.groupby('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [None]:
def plus_mean(pandas_df):
    """Return a copy of ``pandas_df`` whose ``v1`` column is replaced by ``v1`` minus its mean.

    Despite the name, the mean of ``v1`` is subtracted from each value, not added.
    The input frame is not modified.
    """
    v1_mean = pandas_df.v1.mean()
    centered_v1 = pandas_df.v1 - v1_mean
    return pandas_df.assign(v1=centered_v1)
# Apply plus_mean to each colour group. The output schema is df.schema, and the
# displayed v1 values are whole numbers even though plus_mean yields fractional
# ones (e.g. the red group's mean is 4.8), so the fractional part is not kept.
# NOTE(review): declare v1 as double in the schema if the exact values matter.
df.groupby('color').applyInPandas(plus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



In [None]:
df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ('time', 'id', 'v1'))

In [None]:
df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2'))

In [None]:
def merge_ordered(l, r):
    """Combine the pandas DataFrames ``l`` and ``r`` with ``pd.merge_ordered`` using its default settings."""
    merged = pd.merge_ordered(left=l, right=r)
    return merged
df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
    merge_ordered, schema='time int, id int, v1 double, v2 string').show()

+--------+---+---+----+
|    time| id| v1|  v2|
+--------+---+---+----+
|20000101|  1|1.0|   x|
|20000102|  1|3.0|NULL|
|20000101|  2|2.0|   y|
|20000102|  2|4.0|NULL|
+--------+---+---+----+



**Extracting Data in Various Formats**

In [None]:
# Write the DataFrame as CSV with a header row, then read it back and display it.
df.write.csv('foo.csv', header=True, mode='overwrite')  # mode='overwrite' replaces any existing output at this path.
spark.read.csv('foo.csv', header=True).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



In [None]:
# Write the DataFrame as Parquet, then read it back and display it.
df.write.parquet('bar.parquet', mode='overwrite')  # mode='overwrite' replaces any existing output at this path.
spark.read.parquet('bar.parquet').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



In [None]:
# Write the DataFrame as ORC, then read it back and display it.
df.write.orc('zoo.orc', mode='overwrite')  # mode='overwrite' replaces any existing output at this path.
spark.read.orc('zoo.orc').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



**Working with SQL**

In [None]:
df.createOrReplaceTempView("tableA")
spark.sql("SELECT count(*) from tableA").show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



In [None]:
@pandas_udf("integer")
def add_one(s: pd.Series) -> pd.Series:
    """Pandas UDF (declared return type "integer") that adds 1 to every element of ``s``."""
    plus_one = s + 1
    return plus_one
spark.udf.register("add_one", add_one)
spark.sql("SELECT add_one(v1) FROM tableA").show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+



In [None]:
from pyspark.sql.functions import expr
df.selectExpr('add_one(v1)').show()
df.select(expr('count(*)') > 0).show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+

+--------------+
|(count(1) > 0)|
+--------------+
|          true|
+--------------+



**Pandas API on Spark**

In [None]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession



In [None]:
s = ps.Series([1, 3, 5, np.nan, 6, 8])

In [None]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [None]:
psdf = ps.DataFrame(
    {'a': [1, 2, 3, 4, 5, 6],
     'b': [100, 200, 300, 400, 500, 600],
     'c': ["one", "two", "three", "four", "five", "six"]},
    index=[10, 20, 30, 40, 50, 60])

In [None]:
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [None]:
dates = pd.date_range('20130101', periods=6)

In [None]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# 6x4 frame of standard-normal samples indexed by `dates`. A seeded generator
# is used so the data is identical on every Restart & Run All.
pdf = pd.DataFrame(np.random.default_rng(0).standard_normal((6, 4)), index=dates, columns=list('ABCD'))

In [None]:
pdf

Unnamed: 0,A,B,C,D
2013-01-01,-0.017361,-1.164056,1.145029,0.7215
2013-01-02,-1.250208,0.878592,0.514172,0.269518
2013-01-03,-0.291204,1.434118,1.073136,0.652087
2013-01-04,-1.722127,-0.238816,-1.305321,-1.476502
2013-01-05,-1.245518,-0.080849,-0.505886,0.578456
2013-01-06,0.462403,1.635805,0.069052,0.993671


In [None]:
psdf = ps.from_pandas(pdf)

In [None]:
type(psdf)

In [None]:
psdf

Unnamed: 0,A,B,C,D
2013-01-01,-0.017361,-1.164056,1.145029,0.7215
2013-01-02,-1.250208,0.878592,0.514172,0.269518
2013-01-03,-0.291204,1.434118,1.073136,0.652087
2013-01-04,-1.722127,-0.238816,-1.305321,-1.476502
2013-01-05,-1.245518,-0.080849,-0.505886,0.578456
2013-01-06,0.462403,1.635805,0.069052,0.993671


In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
sdf = spark.createDataFrame(pdf)

In [None]:
sdf.show()

+--------------------+--------------------+-------------------+-------------------+
|                   A|                   B|                  C|                  D|
+--------------------+--------------------+-------------------+-------------------+
|-0.01736131833720...| -1.1640558347311447|  1.145029382979696| 0.7214999173263702|
| -1.2502078519238178|  0.8785923262029184| 0.5141716858010741|0.26951802457773427|
|-0.29120447528183324|   1.434118183251572| 1.0731355370894498| 0.6520869351252626|
| -1.7221268937472927|   -0.23881649576441| -1.305320727533831| -1.476502372368093|
| -1.2455176630325326|-0.08084867363311464|  -0.50588557550903|   0.57845572051587|
| 0.46240265289987276|  1.6358049341440046|0.06905171810162285|  0.993671038562591|
+--------------------+--------------------+-------------------+-------------------+



In [None]:
psdf = sdf.pandas_api()

In [None]:
psdf

Unnamed: 0,A,B,C,D
0,-0.017361,-1.164056,1.145029,0.7215
1,-1.250208,0.878592,0.514172,0.269518
2,-0.291204,1.434118,1.073136,0.652087
3,-1.722127,-0.238816,-1.305321,-1.476502
4,-1.245518,-0.080849,-0.505886,0.578456
5,0.462403,1.635805,0.069052,0.993671


In [None]:
psdf.dtypes

Unnamed: 0,0
A,float64
B,float64
C,float64
D,float64


In [None]:
psdf.head()

Unnamed: 0,A,B,C,D
0,-0.017361,-1.164056,1.145029,0.7215
1,-1.250208,0.878592,0.514172,0.269518
2,-0.291204,1.434118,1.073136,0.652087
3,-1.722127,-0.238816,-1.305321,-1.476502
4,-1.245518,-0.080849,-0.505886,0.578456


In [None]:
psdf.index

Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [None]:
psdf.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
psdf.to_numpy()



array([[-0.01736132, -1.16405583,  1.14502938,  0.72149992],
       [-1.25020785,  0.87859233,  0.51417169,  0.26951802],
       [-0.29120448,  1.43411818,  1.07313554,  0.65208694],
       [-1.72212689, -0.2388165 , -1.30532073, -1.47650237],
       [-1.24551766, -0.08084867, -0.50588558,  0.57845572],
       [ 0.46240265,  1.63580493,  0.06905172,  0.99367104]])

In [None]:
psdf.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.677336,0.410799,0.16503,0.289788
std,0.851618,1.087248,0.951654,0.896282
min,-1.722127,-1.164056,-1.305321,-1.476502
25%,-1.250208,-0.238816,-0.505886,0.269518
50%,-1.245518,-0.080849,0.069052,0.578456
75%,-0.017361,1.434118,1.073136,0.7215
max,0.462403,1.635805,1.145029,0.993671


In [None]:
psdf.T

Unnamed: 0,0,1,2,3,4,5
A,-0.017361,-1.250208,-0.291204,-1.722127,-1.245518,0.462403
B,-1.164056,0.878592,1.434118,-0.238816,-0.080849,1.635805
C,1.145029,0.514172,1.073136,-1.305321,-0.505886,0.069052
D,0.7215,0.269518,0.652087,-1.476502,0.578456,0.993671


In [None]:
psdf.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
5,0.462403,1.635805,0.069052,0.993671
4,-1.245518,-0.080849,-0.505886,0.578456
3,-1.722127,-0.238816,-1.305321,-1.476502
2,-0.291204,1.434118,1.073136,0.652087
1,-1.250208,0.878592,0.514172,0.269518
0,-0.017361,-1.164056,1.145029,0.7215


In [None]:
psdf.sort_values(by='B')

Unnamed: 0,A,B,C,D
0,-0.017361,-1.164056,1.145029,0.7215
3,-1.722127,-0.238816,-1.305321,-1.476502
4,-1.245518,-0.080849,-0.505886,0.578456
1,-1.250208,0.878592,0.514172,0.269518
2,-0.291204,1.434118,1.073136,0.652087
5,0.462403,1.635805,0.069052,0.993671


In [None]:
pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns) + ['E'])

In [None]:
pdf1.loc[dates[0]:dates[1], 'E'] = 1

In [None]:
psdf1 = ps.from_pandas(pdf1)

In [None]:
psdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.017361,-1.164056,1.145029,0.7215,1.0
2013-01-02,-1.250208,0.878592,0.514172,0.269518,1.0
2013-01-03,-0.291204,1.434118,1.073136,0.652087,
2013-01-04,-1.722127,-0.238816,-1.305321,-1.476502,


In [None]:
psdf1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.017361,-1.164056,1.145029,0.7215,1.0
2013-01-02,-1.250208,0.878592,0.514172,0.269518,1.0


In [None]:
psdf1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.017361,-1.164056,1.145029,0.7215,1.0
2013-01-02,-1.250208,0.878592,0.514172,0.269518,1.0
2013-01-03,-0.291204,1.434118,1.073136,0.652087,5.0
2013-01-04,-1.722127,-0.238816,-1.305321,-1.476502,5.0


In [None]:
psdf.mean()

A   -0.677336
B    0.410799
C    0.165030
D    0.289788
dtype: float64

In [None]:
prev = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")  # Remember the current setting so it can be restored after the timing runs.
ps.set_option("compute.default_index_type", "distributed")  # Use the "distributed" default index type, chosen here to avoid index overhead.
import warnings
warnings.filterwarnings("ignore")  # Suppresses ALL warnings from here on; the intent is to hide those coming from Arrow optimizations.

In [None]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
%timeit ps.range(300000).to_pandas()

1.18 s ± 276 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", False)
%timeit ps.range(300000).to_pandas()

2.54 s ± 433 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
ps.reset_option("compute.default_index_type")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", prev)  # Set its default value back.

In [None]:
# Example frame for the group-by demos. The numeric columns C and D are drawn
# from a seeded generator so the grouped sums are reproducible across runs.
rng = np.random.default_rng(0)
psdf = ps.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': rng.standard_normal(8),
                    'D': rng.standard_normal(8)})

In [None]:
psdf

Unnamed: 0,A,B,C,D
0,foo,one,0.684383,0.212783
1,bar,one,0.272596,-1.000022
2,foo,two,-1.842401,0.788304
3,bar,three,-0.486648,0.018531
4,foo,two,0.671318,-0.561513
5,bar,two,0.865087,1.263595
6,foo,one,0.628753,-0.096941
7,foo,three,-0.612697,0.349741


In [None]:
psdf.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.651035,0.282104
foo,-0.470645,0.692374


In [None]:
psdf.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,1.313136,0.115841
foo,two,-1.171083,0.226792
bar,three,-0.486648,0.018531
bar,one,0.272596,-1.000022
foo,three,-0.612697,0.349741
bar,two,0.865087,1.263595


In [None]:
# 1000 daily standard-normal samples starting 2000-01-01. The generator is
# seeded so the plotted series is the same on every run.
pser = pd.Series(np.random.default_rng(0).standard_normal(1000),
                 index=pd.date_range('1/1/2000', periods=1000))

In [None]:
psser = ps.Series(pser)

In [None]:
psser = psser.cummax()

In [None]:
psser.plot()

In [None]:
# 1000x4 frame of standard-normal samples sharing pser's index. The generator
# is seeded so the plotted data is the same on every run.
pdf = pd.DataFrame(np.random.default_rng(0).standard_normal((1000, 4)), index=pser.index,
                   columns=['A', 'B', 'C', 'D'])

In [None]:
psdf = ps.from_pandas(pdf)

In [None]:
psdf = psdf.cummax()

In [None]:
psdf.plot()

In [None]:
psdf.to_csv('foo.csv')
ps.read_csv('foo.csv').head(10)

Unnamed: 0,A,B,C,D
0,0.600871,-0.224526,-0.740743,-1.31342
1,0.600871,0.52177,1.419803,-0.211882
2,0.600871,0.52177,1.419803,0.252133
3,0.600871,1.402552,1.419803,0.252133
4,0.600871,1.402552,1.419803,0.252133
5,0.821611,1.402552,1.419803,0.580975
6,0.821611,1.402552,1.419803,0.580975
7,0.821611,1.402552,1.419803,0.580975
8,0.821611,1.402552,1.419803,0.580975
9,0.888568,1.402552,1.95141,0.580975


In [None]:
psdf.to_parquet('bar.parquet')
ps.read_parquet('bar.parquet').head(10)

Unnamed: 0,A,B,C,D
0,0.600871,-0.224526,-0.740743,-1.31342
1,0.600871,0.52177,1.419803,-0.211882
2,0.600871,0.52177,1.419803,0.252133
3,0.600871,1.402552,1.419803,0.252133
4,0.600871,1.402552,1.419803,0.252133
5,0.821611,1.402552,1.419803,0.580975
6,0.821611,1.402552,1.419803,0.580975
7,0.821611,1.402552,1.419803,0.580975
8,0.821611,1.402552,1.419803,0.580975
9,0.888568,1.402552,1.95141,0.580975


In [None]:
psdf.to_spark_io('zoo.orc', format="orc")
ps.read_spark_io('zoo.orc', format="orc").head(10)

Unnamed: 0,A,B,C,D
0,0.600871,-0.224526,-0.740743,-1.31342
1,0.600871,0.52177,1.419803,-0.211882
2,0.600871,0.52177,1.419803,0.252133
3,0.600871,1.402552,1.419803,0.252133
4,0.600871,1.402552,1.419803,0.252133
5,0.821611,1.402552,1.419803,0.580975
6,0.821611,1.402552,1.419803,0.580975
7,0.821611,1.402552,1.419803,0.580975
8,0.821611,1.402552,1.419803,0.580975
9,0.888568,1.402552,1.95141,0.580975
