# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "marimo",
#     "matplotlib==3.10.1",
#     "numpy==1.26.4",
#     "pandas==2.2.3",
#     "plotly==6.0.1",
#     "pyarrow==20.0.0",
#     "pyspark==3.5.5",
#     "scikit-learn==1.6.1",
# ]
# ///

import marimo

__generated_with = "0.13.0"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Motivation""")
    return


@app.cell(hide_code=True)
def _():
    import warnings

    warnings.filterwarnings("ignore", category=FutureWarning)
    return


@app.cell
def _():
    import pandas as pd

    pandas_df = pd.DataFrame({"value": [1, 2, 3, 4, 5]})
    print(pandas_df["value"].mean())
    return pandas_df, pd


@app.cell
def _():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import avg

    spark = SparkSession.builder.getOrCreate()

    spark_df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["value"])
    spark_df.select(avg("value")).show()
    return (spark,)
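

@app.cell
def _(spark):
    # A minimal bridging sketch: an existing PySpark DataFrame can be wrapped
    # as a pandas-on-Spark frame with .pandas_api() (Spark >= 3.2) without
    # collecting data to the driver. spark.range(5) is just illustrative data.
    spark.range(5).pandas_api().head()
    return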


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Object Creation""")
    return


@app.cell
def _():
    import pyspark.pandas as ps

    ps_s = ps.Series([1, 3, 5, 6, 8])
    return (ps,)
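

@app.cell
def _(ps):
    # A quick sketch of Series operations: element-wise arithmetic and
    # aggregations mirror pandas while executing on Spark.
    demo_s = ps.Series([1, 3, 5, 6, 8])
    print((demo_s + 1).head())
    print(demo_s.mean())
    return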


@app.cell
def _(ps):
    import numpy as np

    # A pandas-on-Spark DataFrame (ps, not pd), so the cells below run
    # against Spark rather than in local pandas
    ps_df = ps.DataFrame(
        {"id": np.arange(1, 1_000_001), "value": np.random.randn(1_000_000)}
    )
    return (ps_df,)


@app.cell
def _(pandas_df, ps):
    ps_df_from_pandas = ps.from_pandas(pandas_df)
    return
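

@app.cell
def _(pandas_df, ps):
    # Round-trip sketch: from_pandas distributes a local frame and to_pandas
    # collects it back to the driver, so reserve to_pandas for small results.
    round_trip = ps.from_pandas(pandas_df).to_pandas()
    print(type(round_trip))
    return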


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Basic Operations""")
    return


@app.cell
def _(ps_df):
    ps_df.describe()
    return


@app.cell
def _(ps_df):
    # Display the summary of the DataFrame
    ps_df.info()
    return


@app.cell
def _(ps_df):
    ps_df.head()
    return


@app.cell
def _(ps_df):
    # Filter rows and drop any NaN values
    filtered_df = ps_df.where(ps_df.value > 0).dropna()
    filtered_df.head()
    return
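

@app.cell
def _(ps_df):
    # Sorting mirrors pandas as well; a sketch showing the largest values first
    ps_df.sort_values("value", ascending=False).head()
    return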


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## GroupBy""")
    return


@app.cell
def _(ps):
    ps_df_2 = ps.DataFrame(
        {"category": ["A", "B", "A", "C", "B"], "value": [10, 20, 15, 30, 25]}
    )
    return (ps_df_2,)


@app.cell
def _(ps_df_2):
    ps_df_2.groupby("category").value.mean()
    return
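

@app.cell
def _(ps_df_2):
    # A sketch of multiple aggregations per group using the dict form of agg
    ps_df_2.groupby("category").agg({"value": ["mean", "sum"]})
    return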


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Plotting""")
    return


@app.cell
def _(ps_df):
    ps_df["value"].plot.hist()
    return


@app.cell
def _(ps_df_2):
    ps_df_2.plot.bar(x="category", y="value")
    return
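

@app.cell
def _(ps):
    # The plots above use the configurable pandas-on-Spark plotting backend
    # (plotly by default, hence the plotly pin in the header); a sketch
    # reading the current setting
    print(ps.get_option("plotting.backend"))
    return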


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Reading And Writing Data""")
    return


@app.cell
def _(ps, ps_df):
    # Note: pandas-on-Spark writes a directory of part-files, not a single CSV
    ps_df.to_csv("output_data.csv")
    ps.read_csv("output_data.csv").head()
    return


@app.cell
def _(ps, ps_df):
    ps_df.to_parquet("output_data.parquet")
    ps.read_parquet("output_data.parquet").head()
    return
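

@app.cell
def _(ps_df):
    # A sketch of coalescing the CSV output to one part-file via num_files,
    # a pandas-on-Spark-specific parameter (this costs a repartition)
    ps_df.to_csv("output_single.csv", num_files=1)
    return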


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Using Pandas API on Spark in Conjunction with Regular Pandas""")
    return


@app.cell
def _(ps):
    from sklearn.linear_model import LinearRegression

    # Create a large Pandas API on Spark DataFrame
    large_pdf_df = ps.DataFrame(
        {
            "feature1": range(1_000_000),
            "feature2": range(1_000_000, 2_000_000),
            "target": range(500_000, 1_500_000),
        }
    )
    print(f"Length of the original DataFrame: {len(large_pdf_df):,}")

    # Aggregate the data to a smaller size
    aggregated = large_pdf_df.groupby(large_pdf_df.feature1 // 10000).mean()
    print(f"Length of the aggregated DataFrame: {len(aggregated):,}")

    # Convert to pandas DataFrame
    small_pdf = aggregated.to_pandas()

    # Train a scikit-learn model
    model = LinearRegression()
    X = small_pdf[["feature1", "feature2"]]
    y = small_pdf["target"]
    model.fit(X, y)
    return model, small_pdf
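

@app.cell
def _(model, small_pdf):
    # A sketch of using the fitted model: predict on the same aggregated
    # features (illustration only, not a proper train/test split)
    predictions = model.predict(small_pdf[["feature1", "feature2"]])
    print(predictions[:5])
    return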


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Pandas API on Spark Query Execution Model""")
    return


@app.cell
def _(pandas_df):
    pandas_df["value"] = pandas_df["value"] + 1  # Operation executes immediately
    print(pandas_df)
    return


@app.cell
def _(ps_df):
    # Using Pandas API on Spark
    updated_psdf = ps_df.assign(a=ps_df["value"] + 1)  # Lazy operation
    print(updated_psdf.head())  # Triggers actual computation
    return
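

@app.cell
def _(ps_df):
    # A sketch of inspecting the lazy plan: the .spark accessor exposes
    # explain() without triggering any computation
    ps_df.assign(a=ps_df["value"] + 1).spark.explain()
    return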


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Pandas API on Spark vs PySpark""")
    return


@app.cell
def _(spark):
    from pyspark.sql.functions import col

    pyspark_df = spark.createDataFrame([(1, 4), (2, 5), (3, 6)], ["col1", "col2"])
    pyspark_df.select((col("col1") + col("col2")).alias("sum")).show()
    return (col,)


@app.cell
def _(ps):
    pandas_spark_df = ps.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
    (pandas_spark_df["col1"] + pandas_spark_df["col2"]).head()
    return (pandas_spark_df,)


@app.cell
def _(col, pandas_spark_df):
    # Convert Pandas API on Spark DataFrame to PySpark DataFrame
    spark_native_df = pandas_spark_df.to_spark()

    # Now you can use full PySpark functionality
    spark_native_df.select((col("col1") + col("col2")).alias("sum")).show()
    return
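

@app.cell
def _(pandas_spark_df):
    # Round-trip sketch: to_spark() then pandas_api() restores the pandas-style
    # interface; pass index_col to to_spark() if the index must survive
    pandas_spark_df.to_spark().pandas_api().head()
    return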


@app.cell
def _():
    import marimo as mo

    return (mo,)


if __name__ == "__main__":
    app.run()