Skip to content

Commit 8bd5e35

Browse files
committed
Created using Colab. Using PySpark to show data from the .csv file.
1 parent 4ea4941 commit 8bd5e35

File tree

1 file changed

+300
-0
lines changed

1 file changed

+300
-0
lines changed

PySpark.ipynb

+300
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": [],
7+
"authorship_tag": "ABX9TyMGUy7O01Iif+5ned4ITEW2",
8+
"include_colab_link": true
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
},
14+
"language_info": {
15+
"name": "python"
16+
}
17+
},
18+
"cells": [
19+
{
20+
"cell_type": "markdown",
21+
"metadata": {
22+
"id": "view-in-github",
23+
"colab_type": "text"
24+
},
25+
"source": [
26+
"<a href=\"https://colab.research.google.com/github/cvelac4/Algorithm-and-Leetcode/blob/master/PySpark.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
27+
]
28+
},
29+
{
30+
"cell_type": "markdown",
31+
"source": [
32+
"# Title and Explanation"
33+
],
34+
"metadata": {
35+
"id": "sGOX0xNhmvIN"
36+
}
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 1,
41+
"metadata": {
42+
"id": "N6iVXJp7jQME"
43+
},
44+
"outputs": [],
45+
"source": [
46+
"# Predictions of Superstore data using advanced ML with PySpark.\n"
47+
]
48+
},
49+
{
50+
"cell_type": "markdown",
51+
"source": [
52+
"# Install PySpark"
53+
],
54+
"metadata": {
55+
"id": "sJDMbsjkmhJN"
56+
}
57+
},
58+
{
59+
"cell_type": "code",
60+
"source": [
61+
"!pip install PySpark"
62+
],
63+
"metadata": {
64+
"colab": {
65+
"base_uri": "https://localhost:8080/"
66+
},
67+
"id": "DFAon71alA3W",
68+
"outputId": "d85f7c64-1165-4048-9a15-ee1da3cf1969"
69+
},
70+
"execution_count": 2,
71+
"outputs": [
72+
{
73+
"output_type": "stream",
74+
"name": "stdout",
75+
"text": [
76+
"Requirement already satisfied: PySpark in /usr/local/lib/python3.11/dist-packages (3.5.4)\n",
77+
"Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.11/dist-packages (from PySpark) (0.10.9.7)\n"
78+
]
79+
}
80+
]
81+
},
82+
{
83+
"cell_type": "markdown",
84+
"source": [
85+
"# Create a Session"
86+
],
87+
"metadata": {
88+
"id": "BvzJXcs7m5JW"
89+
}
90+
},
91+
{
92+
"cell_type": "code",
93+
"source": [
94+
"from pyspark.sql import SparkSession\n",
95+
"\n",
96+
"spark = SparkSession.builder.appName('SalesForecasting').getOrCreate()"
97+
],
98+
"metadata": {
99+
"id": "bMpAL_cPmfH8"
100+
},
101+
"execution_count": 3,
102+
"outputs": []
103+
},
104+
{
105+
"cell_type": "markdown",
106+
"source": [
107+
"# Load and Explore the Data\n"
108+
],
109+
"metadata": {
110+
"id": "hcGpdLwapX6h"
111+
}
112+
},
113+
{
114+
"cell_type": "code",
115+
"source": [
116+
"#data\n",
117+
"path = '/content/1740463998446_a2368e63c8e87f60.csv'\n",
118+
"df = spark.read.csv(path, header=True, inferSchema=True )\n",
119+
"\n",
120+
"#Display the Schema\n",
121+
"df.printSchema()\n",
122+
"\n",
123+
"#View the sample data\n",
124+
"df.show()\n"
125+
],
126+
"metadata": {
127+
"colab": {
128+
"base_uri": "https://localhost:8080/"
129+
},
130+
"id": "m_uZfJ20pfGn",
131+
"outputId": "f5911b7c-275f-42d5-9072-f0dd910b0dfa"
132+
},
133+
"execution_count": 5,
134+
"outputs": [
135+
{
136+
"output_type": "stream",
137+
"name": "stdout",
138+
"text": [
139+
"root\n",
140+
" |-- ID: integer (nullable = true)\n",
141+
" |-- Order_id: string (nullable = true)\n",
142+
" |-- Order_Date: string (nullable = true)\n",
143+
" |-- Ship _Date: date (nullable = true)\n",
144+
" |-- Ship_Mode: string (nullable = true)\n",
145+
" |-- Customer_id: string (nullable = true)\n",
146+
" |-- Customer_Name: string (nullable = true)\n",
147+
" |-- Segment: string (nullable = true)\n",
148+
" |-- Country: string (nullable = true)\n",
149+
" |-- City: string (nullable = true)\n",
150+
" |-- State: string (nullable = true)\n",
151+
" |-- Postal_Code: integer (nullable = true)\n",
152+
" |-- Region: string (nullable = true)\n",
153+
" |-- Product_ ID: string (nullable = true)\n",
154+
" |-- Category: string (nullable = true)\n",
155+
" |-- Sub_Category: string (nullable = true)\n",
156+
" |-- Product_Name: string (nullable = true)\n",
157+
" |-- Sales: string (nullable = true)\n",
158+
" |-- Quantity: string (nullable = true)\n",
159+
" |-- Discount: string (nullable = true)\n",
160+
" |-- Profit: double (nullable = true)\n",
161+
" |-- user_id: double (nullable = true)\n",
162+
" |-- state_id: double (nullable = true)\n",
163+
" |-- order_s: string (nullable = true)\n",
164+
"\n",
165+
"+---+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+-------+--------+-------+\n",
166+
"| ID| Order_id|Order_Date|Ship _Date| Ship_Mode|Customer_id| Customer_Name| Segment| Country| City| State|Postal_Code| Region| Product_ ID| Category|Sub_Category| Product_Name| Sales|Quantity|Discount| Profit|user_id|state_id|order_s|\n",
167+
"+---+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+-------+--------+-------+\n",
168+
"| 1|CA-2023-152156|2023-11-08|2023-11-11| Second Class| CG-12520| Claire Gute| Consumer|United States| Henderson| Kentucky| 42420| South|FUR-BO-10001798| Furniture| Bookcases|Bush Somerset Col...| 261.96| 2| 0| 41.9136| NULL| NULL| NULL|\n",
169+
"| 2|CA-2023-152156|2023-11-08|2023-11-11| Second Class| CG-12520| Claire Gute| Consumer|United States| Henderson| Kentucky| 42420| South|FUR-CH-10000454| Furniture| Chairs|Hon Deluxe Fabric...| 731.94| 3| 0| 219.582| 1.0| NULL| NULL|\n",
170+
"| 3|CA-2023-138688|2023-06-12|2023-06-16| Second Class| DV-13045| Darrin Van Huff| Corporate|United States| Los Angeles| California| 90036| West|OFF-LA-10000240|Office Supplies| Labels|Self-Adhesive Add...| 14.62| 2| 0| 6.8714| NULL| NULL| NULL|\n",
171+
"| 4|US-2022-108966|2022-10-11|2022-10-18|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States|Fort Lauderdale| Florida| 33311| South|FUR-TA-10000577| Furniture| Tables|Bretford CR4500 S...|957.5775| 5| 0.45|-383.031| NULL| NULL| NULL|\n",
172+
"| 5|US-2022-108966|2022-10-11|2022-10-18|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States|Fort Lauderdale| Florida| 33311| South|OFF-ST-10000760|Office Supplies| Storage|Eldon Fold 'N Rol...| 22.368| 2| 0.2| 2.5164| NULL| NULL| NULL|\n",
173+
"| 6|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|FUR-FU-10001487| Furniture| Furnishings|Eldon Expressions...| 48.86| 7| 0| 14.1694| NULL| NULL| NULL|\n",
174+
"| 7|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|OFF-AR-10002833|Office Supplies| Art| Newell 322| 7.28| 4| 0| 1.9656| NULL| 1.0| NULL|\n",
175+
"| 8|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|TEC-PH-10002275| Technology| Phones|Mitel 5320 IP Pho...| 907.152| 6| 0.2| 90.7152| NULL| NULL| NULL|\n",
176+
"| 9|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|OFF-BI-10003910|Office Supplies| Binders|DXL Angle-View Bi...| 18.504| 3| 0.2| 5.7825| NULL| 2.0| NULL|\n",
177+
"| 10|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|OFF-AP-10002892|Office Supplies| Appliances|Belkin F5C206VTEL...| 114.9| 5| 0| 34.47| NULL| NULL| NULL|\n",
178+
"| 11|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|FUR-TA-10001539| Furniture| Tables|Chromcraft Rectan...|1706.184| 9| 0.2| 85.3092| NULL| NULL| NULL|\n",
179+
"| 12|CA-2021-115812|2021-06-09|2021-06-14|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| Los Angeles| California| 90032| West|TEC-PH-10002033| Technology| Phones|Konftel 250 Confe...| 911.424| 4| 0.2| 68.3568| NULL| NULL| NULL|\n",
180+
"| 13|CA-2024-114412|2024-04-15|2024-04-20|Standard Class| AA-10480| Andrew Allen| Consumer|United States| Concord|North Carolina| 28027| South|OFF-PA-10002365|Office Supplies| Paper| Xerox 1967| 15.552| 3| 0.2| 5.4432| NULL| NULL| s|\n",
181+
"| 14|CA-2023-161389|2023-12-05|2023-12-10|Standard Class| IM-15070| Irene Maddox| Consumer|United States| Seattle| Washington| 98103| West|OFF-BI-10003656|Office Supplies| Binders|Fellowes PB200 Pl...| 407.976| 3| 0.2|132.5922| NULL| NULL| NULL|\n",
182+
"| 15|US-2022-118983|2022-11-22|2022-11-26|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| Fort Worth| Texas| 76106|Central|OFF-AP-10002311|Office Supplies| Appliances|Holmes Replacemen...| 68.81| 5| 0.8|-123.858| NULL| NULL| NULL|\n",
183+
"| 16|US-2022-118983|2022-11-22|2022-11-26|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| Fort Worth| Texas| 76106|Central|OFF-BI-10000756|Office Supplies| Binders|Storex DuraTech R...| 2.544| 3| 0.8| -3.816| NULL| NULL| NULL|\n",
184+
"| 17|CA-2021-105893|2021-11-11|2021-11-18|Standard Class| PK-19075| Pete Kriz| Consumer|United States| Madison| Wisconsin| 53711|Central|OFF-ST-10004186|Office Supplies| Storage|\"Stur-D-Stor Shel...| 665.88| 6| 0| 13.3176| NULL| NULL| NULL|\n",
185+
"| 18|CA-2021-167164|2021-05-13|2021-05-15| Second Class| AG-10270| Alejandro Grove| Consumer|United States| West Jordan| Utah| 84084| West|OFF-ST-10000107|Office Supplies| Storage|Fellowes Super St...| 55.5| 2| 0| 9.99| NULL| NULL| s|\n",
186+
"| 19|CA-2021-143336|2021-08-27|2021-09-01| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| San Francisco| California| 94109| West|OFF-AR-10003056|Office Supplies| Art| Newell 341| 8.56| 2| 0| 2.4824| NULL| NULL| NULL|\n",
187+
"| 20|CA-2021-143336|2021-08-27|2021-09-01| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| San Francisco| California| 94109| West|TEC-PH-10001949| Technology| Phones|Cisco SPA 501G IP...| 213.48| 3| 0.2| 16.011| NULL| NULL| NULL|\n",
188+
"+---+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+-------+--------+-------+\n",
189+
"only showing top 20 rows\n",
190+
"\n"
191+
]
192+
}
193+
]
194+
},
195+
{
196+
"cell_type": "markdown",
197+
"source": [
198+
"# Data Processing\n",
199+
"\n",
200+
"\n",
201+
"\n",
202+
"* Convert Data\n",
203+
"* Handle missing values\n",
204+
"* Aggregate on daily level\n",
205+
"\n",
206+
"\n",
207+
"\n",
208+
"\n",
209+
"\n"
210+
],
211+
"metadata": {
212+
"id": "V9V9fdGZvSl4"
213+
}
214+
},
215+
{
216+
"cell_type": "code",
217+
"source": [
218+
"# Convert Order_Date to date type\n",
219+
"from pyspark.sql.functions import to_date, col,sum, dayofmonth, month, year,lag\n",
220+
"\n",
221+
"df = df.withColumn('Order_Date', col('Order_Date').cast('date') )\n",
222+
"\n",
223+
"df.printSchema()\n",
224+
"\n",
225+
"#Aggregate\n",
226+
"daily_sales = df.groupBy('Order_Date').agg(sum('Sales').alias('Daily_Sales'))\n",
227+
"\n",
228+
"daily_sales.show()"
229+
],
230+
"metadata": {
231+
"colab": {
232+
"base_uri": "https://localhost:8080/"
233+
},
234+
"id": "McHKdq5sv-VT",
235+
"outputId": "645d537b-68c9-4527-dbe5-95d7b6cbca4d"
236+
},
237+
"execution_count": 8,
238+
"outputs": [
239+
{
240+
"output_type": "stream",
241+
"name": "stdout",
242+
"text": [
243+
"root\n",
244+
" |-- ID: integer (nullable = true)\n",
245+
" |-- Order_id: string (nullable = true)\n",
246+
" |-- Order_Date: date (nullable = true)\n",
247+
" |-- Ship _Date: date (nullable = true)\n",
248+
" |-- Ship_Mode: string (nullable = true)\n",
249+
" |-- Customer_id: string (nullable = true)\n",
250+
" |-- Customer_Name: string (nullable = true)\n",
251+
" |-- Segment: string (nullable = true)\n",
252+
" |-- Country: string (nullable = true)\n",
253+
" |-- City: string (nullable = true)\n",
254+
" |-- State: string (nullable = true)\n",
255+
" |-- Postal_Code: integer (nullable = true)\n",
256+
" |-- Region: string (nullable = true)\n",
257+
" |-- Product_ ID: string (nullable = true)\n",
258+
" |-- Category: string (nullable = true)\n",
259+
" |-- Sub_Category: string (nullable = true)\n",
260+
" |-- Product_Name: string (nullable = true)\n",
261+
" |-- Sales: string (nullable = true)\n",
262+
" |-- Quantity: string (nullable = true)\n",
263+
" |-- Discount: string (nullable = true)\n",
264+
" |-- Profit: double (nullable = true)\n",
265+
" |-- user_id: double (nullable = true)\n",
266+
" |-- state_id: double (nullable = true)\n",
267+
" |-- order_s: string (nullable = true)\n",
268+
"\n",
269+
"+----------+------------------+\n",
270+
"|Order_Date| Daily_Sales|\n",
271+
"+----------+------------------+\n",
272+
"|2021-08-27| 2070.13|\n",
273+
"|2024-09-18|1454.7299999999998|\n",
274+
"|2021-06-22| 1975.498|\n",
275+
"|2022-03-28| 243.344|\n",
276+
"|2022-07-31| 3712.162|\n",
277+
"|2023-07-15| 380.2|\n",
278+
"|2021-10-11| 1381.164|\n",
279+
"|2021-01-27| 426.67|\n",
280+
"|2023-11-08| 993.9000000000001|\n",
281+
"|2024-08-27|5992.0779999999995|\n",
282+
"|2022-11-29|2760.1680000000006|\n",
283+
"|2024-06-04| 279.414|\n",
284+
"|2024-06-12| 1679.968|\n",
285+
"|2021-11-25| 4415.695000000001|\n",
286+
"|2022-12-25| 4204.968000000001|\n",
287+
"|2023-05-22|1799.2000000000003|\n",
288+
"|2023-09-14| 1137.338|\n",
289+
"|2021-10-02| 588.736|\n",
290+
"|2022-08-02| 1290.478|\n",
291+
"|2022-07-27| 29.97|\n",
292+
"+----------+------------------+\n",
293+
"only showing top 20 rows\n",
294+
"\n"
295+
]
296+
}
297+
]
298+
}
299+
]
300+
}

0 commit comments

Comments
 (0)