1
+ {
2
+ "nbformat" : 4 ,
3
+ "nbformat_minor" : 0 ,
4
+ "metadata" : {
5
+ "colab" : {
6
+ "provenance" : [],
7
+ "authorship_tag" : " ABX9TyMGUy7O01Iif+5ned4ITEW2" ,
8
+ "include_colab_link" : true
9
+ },
10
+ "kernelspec" : {
11
+ "name" : " python3" ,
12
+ "display_name" : " Python 3"
13
+ },
14
+ "language_info" : {
15
+ "name" : " python"
16
+ }
17
+ },
18
+ "cells" : [
19
+ {
20
+ "cell_type" : " markdown" ,
21
+ "metadata" : {
22
+ "id" : " view-in-github" ,
23
+ "colab_type" : " text"
24
+ },
25
+ "source" : [
26
+ " <a href=\" https://colab.research.google.com/github/cvelac4/Algorithm-and-Leetcode/blob/master/PySpark.ipynb\" target=\" _parent\" ><img src=\" https://colab.research.google.com/assets/colab-badge.svg\" alt=\" Open In Colab\" /></a>"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type" : " markdown" ,
31
+ "source" : [
32
+ " # Title and Explanation"
33
+ ],
34
+ "metadata" : {
35
+ "id" : " sGOX0xNhmvIN"
36
+ }
37
+ },
38
+ {
39
+ "cell_type" : " code" ,
40
+ "execution_count" : 1 ,
41
+ "metadata" : {
42
+ "id" : " N6iVXJp7jQME"
43
+ },
44
+ "outputs" : [],
45
+ "source" : [
46
+ " # Predictions of Superstore data using Advanced ML PySpark.\n "
47
+ ]
48
+ },
49
+ {
50
+ "cell_type" : " markdown" ,
51
+ "source" : [
52
+ " # Download Pyspark"
53
+ ],
54
+ "metadata" : {
55
+ "id" : " sJDMbsjkmhJN"
56
+ }
57
+ },
58
+ {
59
+ "cell_type" : " code" ,
60
+ "source" : [
61
+ " !pip install PySpark"
62
+ ],
63
+ "metadata" : {
64
+ "colab" : {
65
+ "base_uri" : " https://localhost:8080/"
66
+ },
67
+ "id" : " DFAon71alA3W" ,
68
+ "outputId" : " d85f7c64-1165-4048-9a15-ee1da3cf1969"
69
+ },
70
+ "execution_count" : 2 ,
71
+ "outputs" : [
72
+ {
73
+ "output_type" : " stream" ,
74
+ "name" : " stdout" ,
75
+ "text" : [
76
+ " Requirement already satisfied: PySpark in /usr/local/lib/python3.11/dist-packages (3.5.4)\n " ,
77
+ " Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.11/dist-packages (from PySpark) (0.10.9.7)\n "
78
+ ]
79
+ }
80
+ ]
81
+ },
82
+ {
83
+ "cell_type" : " markdown" ,
84
+ "source" : [
85
+ " # Create a Session"
86
+ ],
87
+ "metadata" : {
88
+ "id" : " BvzJXcs7m5JW"
89
+ }
90
+ },
91
+ {
92
+ "cell_type" : " code" ,
93
+ "source" : [
94
+ " from pyspark.sql import SparkSession\n " ,
95
+ " \n " ,
96
+ " spark = SparkSession.builder.appName('SalesForecasting').getOrCreate()"
97
+ ],
98
+ "metadata" : {
99
+ "id" : " bMpAL_cPmfH8"
100
+ },
101
+ "execution_count" : 3 ,
102
+ "outputs" : []
103
+ },
104
+ {
105
+ "cell_type" : " markdown" ,
106
+ "source" : [
107
+ " # Load and Explore the Data\n "
108
+ ],
109
+ "metadata" : {
110
+ "id" : " hcGpdLwapX6h"
111
+ }
112
+ },
113
+ {
114
+ "cell_type" : " code" ,
115
+ "source" : [
116
+ " #data\n " ,
117
+ " path = '/content/1740463998446_a2368e63c8e87f60.csv'\n " ,
118
+ " df = spark.read.csv(path, header=True, inferSchema=True )\n " ,
119
+ " \n " ,
120
+ " #Display the Schema\n " ,
121
+ " df.printSchema()\n " ,
122
+ " \n " ,
123
+ " #View the sample data\n " ,
124
+ " df.show()\n "
125
+ ],
126
+ "metadata" : {
127
+ "colab" : {
128
+ "base_uri" : " https://localhost:8080/"
129
+ },
130
+ "id" : " m_uZfJ20pfGn" ,
131
+ "outputId" : " f5911b7c-275f-42d5-9072-f0dd910b0dfa"
132
+ },
133
+ "execution_count" : 5 ,
134
+ "outputs" : [
135
+ {
136
+ "output_type" : " stream" ,
137
+ "name" : " stdout" ,
138
+ "text" : [
139
+ " root\n " ,
140
+ " |-- ID: integer (nullable = true)\n " ,
141
+ " |-- Order_id: string (nullable = true)\n " ,
142
+ " |-- Order_Date: string (nullable = true)\n " ,
143
+ " |-- Ship _Date: date (nullable = true)\n " ,
144
+ " |-- Ship_Mode: string (nullable = true)\n " ,
145
+ " |-- Customer_id: string (nullable = true)\n " ,
146
+ " |-- Customer_Name: string (nullable = true)\n " ,
147
+ " |-- Segment: string (nullable = true)\n " ,
148
+ " |-- Country: string (nullable = true)\n " ,
149
+ " |-- City: string (nullable = true)\n " ,
150
+ " |-- State: string (nullable = true)\n " ,
151
+ " |-- Postal_Code: integer (nullable = true)\n " ,
152
+ " |-- Region: string (nullable = true)\n " ,
153
+ " |-- Product_ ID: string (nullable = true)\n " ,
154
+ " |-- Category: string (nullable = true)\n " ,
155
+ " |-- Sub_Category: string (nullable = true)\n " ,
156
+ " |-- Product_Name: string (nullable = true)\n " ,
157
+ " |-- Sales: string (nullable = true)\n " ,
158
+ " |-- Quantity: string (nullable = true)\n " ,
159
+ " |-- Discount: string (nullable = true)\n " ,
160
+ " |-- Profit: double (nullable = true)\n " ,
161
+ " |-- user_id: double (nullable = true)\n " ,
162
+ " |-- state_id: double (nullable = true)\n " ,
163
+ " |-- order_s: string (nullable = true)\n " ,
164
+ " \n " ,
165
+ " +---+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+-------+--------+-------+\n " ,
166
+ " | ID| Order_id|Order_Date|Ship _Date| Ship_Mode|Customer_id| Customer_Name| Segment| Country| City| State|Postal_Code| Region| Product_ ID| Category|Sub_Category| Product_Name| Sales|Quantity|Discount| Profit|user_id|state_id|order_s|\n " ,
167
+ " +---+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+-------+--------+-------+\n " ,
168
+ " | 1|CA-2023-152156|2023-11-08|2023-11-11| Second Class| CG-12520| Claire Gute| Consumer|United States| Henderson| Kentucky| 42420| South|FUR-BO-10001798| Furniture| Bookcases|Bush Somerset Col...| 261.96| 2| 0| 41.9136| NULL| NULL| NULL|\n " ,
169
+ " | 2|CA-2023-152156|2023-11-08|2023-11-11| Second Class| CG-12520| Claire Gute| Consumer|United States| Henderson| Kentucky| 42420| South|FUR-CH-10000454| Furniture| Chairs|Hon Deluxe Fabric...| 731.94| 3| 0| 219.582| 1.0| NULL| NULL|\n " ,
170
+ " | 3|CA-2023-138688|2023-06-12|2023-06-16| Second Class| DV-13045| Darrin Van Huff| Corporate|United States| Los Angeles| California| 90036| West|OFF-LA-10000240|Office Supplies| Labels|Self-Adhesive Add...| 14.62| 2| 0| 6.8714| NULL| NULL| NULL|\n " ,
171
+ " | 4|US-2022-108966|2022-10-11|2022-10-18|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States|Fort Lauderdale| Florida| 33311| South|FUR-TA-10000577| Furniture| Tables|Bretford CR4500 S...|957.5775| 5| 0.45|-383.031| NULL| NULL| NULL|\n " ,
172
+ " | 5|US-2022-108966|2022-10-11|2022-10-18|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States|Fort Lauderdale| Florida| 33311| South|OFF-ST-10000760|Office Supplies| Storage|Eldon Fold 'N Rol...| 22.368| 2| 0.2| 2.5164| NULL| NULL| NULL|\n " ,
173
+ " | 6|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|FUR-FU-10001487| Furniture| Furnishings|Eldon Expressions...| 48.86| 7| 0| 14.1694| NULL| NULL| NULL|\n " ,
174
+ " | 7|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|OFF-AR-10002833|Office Supplies| Art| Newell 322| 7.28| 4| 0| 1.9656| NULL| 1.0| NULL|\n " ,
175
+ " | 8|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|TEC-PH-10002275| Technology| Phones|Mitel 5320 IP Pho...| 907.152| 6| 0.2| 90.7152| NULL| NULL| NULL|\n " ,
176
+ " | 9|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|OFF-BI-10003910|Office Supplies| Binders|DXL Angle-View Bi...| 18.504| 3| 0.2| 5.7825| NULL| 2.0| NULL|\n " ,
177
+ " | 10|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|OFF-AP-10002892|Office Supplies| Appliances|Belkin F5C206VTEL...| 114.9| 5| 0| 34.47| NULL| NULL| NULL|\n " ,
178
+ " | 11|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|FUR-TA-10001539| Furniture| Tables|Chromcraft Rectan...|1706.184| 9| 0.2| 85.3092| NULL| NULL| NULL|\n " ,
179
+ " | 12|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|TEC-PH-10002033| Technology| Phones|Konftel 250 Confe...| 911.424| 4| 0.2| 68.3568| NULL| NULL| NULL|\n " ,
180
+ " | 13|CA-2024-114412|2024-04-15|2024-04-20|Standard Class| AA-10480| Andrew Allen| Consumer|United States| Concord|North Carolina| 28027| South|OFF-PA-10002365|Office Supplies| Paper| Xerox 1967| 15.552| 3| 0.2| 5.4432| NULL| NULL| s|\n " ,
181
+ " | 14|CA-2023-161389|2023-12-05|2023-12-10|Standard Class| IM-15070| Irene Maddox| Consumer|United States| Seattle| Washington| 98103| West|OFF-BI-10003656|Office Supplies| Binders|Fellowes PB200 Pl...| 407.976| 3| 0.2|132.5922| NULL| NULL| NULL|\n " ,
182
+ " | 15|US-2022-118983|2022-11-22|2022-11-26|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| Fort Worth| Texas| 76106|Central|OFF-AP-10002311|Office Supplies| Appliances|Holmes Replacemen...| 68.81| 5| 0.8|-123.858| NULL| NULL| NULL|\n " ,
183
+ " | 16|US-2022-118983|2022-11-22|2022-11-26|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| Fort Worth| Texas| 76106|Central|OFF-BI-10000756|Office Supplies| Binders|Storex DuraTech R...| 2.544| 3| 0.8| -3.816| NULL| NULL| NULL|\n " ,
184
+ " | 17|CA-2021-105893|2021-11-11|2021-11-18|Standard Class| PK-19075| Pete Kriz| Consumer|United States| Madison| Wisconsin| 53711|Central|OFF-ST-10004186|Office Supplies| Storage|\" Stur-D-Stor Shel...| 665.88| 6| 0| 13.3176| NULL| NULL| NULL|\n " ,
185
+ " | 18|CA-2021-167164|2021-05-13|2021-05-15| Second Class| AG-10270| Alejandro Grove| Consumer|United States| West Jordan| Utah| 84084| West|OFF-ST-10000107|Office Supplies| Storage|Fellowes Super St...| 55.5| 2| 0| 9.99| NULL| NULL| s|\n " ,
186
+ " | 19|CA-2021-143336|2021-08-27|2021-09-01| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| San Francisco| California| 94109| West|OFF-AR-10003056|Office Supplies| Art| Newell 341| 8.56| 2| 0| 2.4824| NULL| NULL| NULL|\n " ,
187
+ " | 20|CA-2021-143336|2021-08-27|2021-09-01| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| San Francisco| California| 94109| West|TEC-PH-10001949| Technology| Phones|Cisco SPA 501G IP...| 213.48| 3| 0.2| 16.011| NULL| NULL| NULL|\n " ,
188
+ " +---+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+-------+--------+-------+\n " ,
189
+ " only showing top 20 rows\n " ,
190
+ " \n "
191
+ ]
192
+ }
193
+ ]
194
+ },
195
+ {
196
+ "cell_type" : " markdown" ,
197
+ "source" : [
198
+ " # Data Processing\n " ,
199
+ " \n " ,
200
+ " \n " ,
201
+ " \n " ,
202
+ " * Convert Data\n " ,
203
+ " * Handle Missing Values\n " ,
204
+ " * Aggregate on daily level\n " ,
205
+ " \n " ,
206
+ " \n " ,
207
+ " \n " ,
208
+ " \n " ,
209
+ " \n "
210
+ ],
211
+ "metadata" : {
212
+ "id" : " V9V9fdGZvSl4"
213
+ }
214
+ },
215
+ {
216
+ "cell_type" : " code" ,
217
+ "source" : [
218
+ " # Convert Order_Date to date type\n " ,
219
+ " from pyspark.sql.functions import to_date, col,sum, dayofmonth, month, year,lag\n " ,
220
+ " \n " ,
221
+ " df = df.withColumn('Order_Date', col('Order_Date').cast('date') )\n " ,
222
+ " \n " ,
223
+ " df.printSchema()\n " ,
224
+ " \n " ,
225
+ " #Aggregate\n " ,
226
+ " daily_sales = df.groupBy('Order_Date').agg(sum('Sales').alias('Daily_Sales'))\n " ,
227
+ " \n " ,
228
+ " daily_sales.show()"
229
+ ],
230
+ "metadata" : {
231
+ "colab" : {
232
+ "base_uri" : " https://localhost:8080/"
233
+ },
234
+ "id" : " McHKdq5sv-VT" ,
235
+ "outputId" : " 645d537b-68c9-4527-dbe5-95d7b6cbca4d"
236
+ },
237
+ "execution_count" : 8 ,
238
+ "outputs" : [
239
+ {
240
+ "output_type" : " stream" ,
241
+ "name" : " stdout" ,
242
+ "text" : [
243
+ " root\n " ,
244
+ " |-- ID: integer (nullable = true)\n " ,
245
+ " |-- Order_id: string (nullable = true)\n " ,
246
+ " |-- Order_Date: date (nullable = true)\n " ,
247
+ " |-- Ship _Date: date (nullable = true)\n " ,
248
+ " |-- Ship_Mode: string (nullable = true)\n " ,
249
+ " |-- Customer_id: string (nullable = true)\n " ,
250
+ " |-- Customer_Name: string (nullable = true)\n " ,
251
+ " |-- Segment: string (nullable = true)\n " ,
252
+ " |-- Country: string (nullable = true)\n " ,
253
+ " |-- City: string (nullable = true)\n " ,
254
+ " |-- State: string (nullable = true)\n " ,
255
+ " |-- Postal_Code: integer (nullable = true)\n " ,
256
+ " |-- Region: string (nullable = true)\n " ,
257
+ " |-- Product_ ID: string (nullable = true)\n " ,
258
+ " |-- Category: string (nullable = true)\n " ,
259
+ " |-- Sub_Category: string (nullable = true)\n " ,
260
+ " |-- Product_Name: string (nullable = true)\n " ,
261
+ " |-- Sales: string (nullable = true)\n " ,
262
+ " |-- Quantity: string (nullable = true)\n " ,
263
+ " |-- Discount: string (nullable = true)\n " ,
264
+ " |-- Profit: double (nullable = true)\n " ,
265
+ " |-- user_id: double (nullable = true)\n " ,
266
+ " |-- state_id: double (nullable = true)\n " ,
267
+ " |-- order_s: string (nullable = true)\n " ,
268
+ " \n " ,
269
+ " +----------+------------------+\n " ,
270
+ " |Order_Date| Daily_Sales|\n " ,
271
+ " +----------+------------------+\n " ,
272
+ " |2021-08-27| 2070.13|\n " ,
273
+ " |2024-09-18|1454.7299999999998|\n " ,
274
+ " |2021-06-22| 1975.498|\n " ,
275
+ " |2022-03-28| 243.344|\n " ,
276
+ " |2022-07-31| 3712.162|\n " ,
277
+ " |2023-07-15| 380.2|\n " ,
278
+ " |2021-10-11| 1381.164|\n " ,
279
+ " |2021-01-27| 426.67|\n " ,
280
+ " |2023-11-08| 993.9000000000001|\n " ,
281
+ " |2024-08-27|5992.0779999999995|\n " ,
282
+ " |2022-11-29|2760.1680000000006|\n " ,
283
+ " |2024-06-04| 279.414|\n " ,
284
+ " |2024-06-12| 1679.968|\n " ,
285
+ " |2021-11-25| 4415.695000000001|\n " ,
286
+ " |2022-12-25| 4204.968000000001|\n " ,
287
+ " |2023-05-22|1799.2000000000003|\n " ,
288
+ " |2023-09-14| 1137.338|\n " ,
289
+ " |2021-10-02| 588.736|\n " ,
290
+ " |2022-08-02| 1290.478|\n " ,
291
+ " |2022-07-27| 29.97|\n " ,
292
+ " +----------+------------------+\n " ,
293
+ " only showing top 20 rows\n " ,
294
+ " \n "
295
+ ]
296
+ }
297
+ ]
298
+ }
299
+ ]
300
+ }
0 commit comments