In [16]:
import findspark

# Let findspark locate the Spark installation before pyspark is imported.
findspark.init()

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Create (or reuse) the SparkSession for this notebook: local master with
# 4 threads, 4g executor memory and 2g driver memory.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Date Time Ops")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Read data

In [17]:
# Load the semicolon-separated retail CSV (with header and schema inference)
# and keep only the distinct InvoiceDate values.
df = (
    spark.read
    .options(header="True", inferSchema="True", sep=";")
    .csv("C:\\Users\\furkan\\spark\\Datasets\\OnlineRetail.csv")
    .select("InvoiceDate")
    .distinct()
)

In [18]:
# Preview the first 5 distinct InvoiceDate values.
df.show(n=5)

+----------------+
|     InvoiceDate|
+----------------+
| 3.12.2010 16:50|
| 7.12.2010 12:28|
| 8.12.2010 15:02|
|10.12.2010 09:53|
|12.12.2010 13:32|
+----------------+
only showing top 5 rows



In [19]:
# Preview a larger sample (15 rows) of the distinct InvoiceDate values.
df.show(n=15)

+----------------+
|     InvoiceDate|
+----------------+
| 3.12.2010 16:50|
| 7.12.2010 12:28|
| 8.12.2010 15:02|
|10.12.2010 09:53|
|12.12.2010 13:32|
|15.12.2010 13:21|
|16.12.2010 08:41|
|17.12.2010 09:52|
| 9.01.2011 11:43|
|11.01.2011 11:38|
|16.01.2011 15:50|
|25.01.2011 17:06|
|27.01.2011 12:10|
|28.01.2011 12:19|
|31.01.2011 12:16|
+----------------+
only showing top 15 rows



In [74]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType, DateType,DoubleType

# Hand-written schema for OnlineRetail.csv; every column is nullable.
# NOTE(review): the sample output in this notebook shows InvoiceDate values
# such as "1.12.2010 08:26", and a later cell replaces "," with "." in
# UnitPrice, so DateType / FloatType may not parse with the default CSV
# reader options — confirm before relying on this schema.
manual_schema = (
    StructType()
    .add("InvoiceNo", StringType(), True)
    .add("StockCode", StringType(), True)
    .add("Description", StringType(), True)
    .add("Quantity", IntegerType(), True)
    .add("InvoiceDate", DateType(), True)
    .add("UnitPrice", FloatType(), True)
    .add("CustomerID", IntegerType(), True)
    .add("Country", StringType(), True)
)

In [75]:
# Read the same CSV again, this time applying the explicit manual_schema
# instead of letting Spark infer the column types.
df2 = (
    spark.read
    .options(header="True", sep=";")
    .schema(manual_schema)
    .csv("C:\\Users\\furkan\\spark\\Datasets\\OnlineRetail.csv")
)

In [77]:
from pyspark.sql import functions as F

# Reload the full CSV with inferred types, then rewrite UnitPrice so that its
# comma decimal separator becomes a dot (the column stays textual here; no
# cast is applied).
df = (
    spark.read
    .options(header="True", inferSchema="True", sep=";")
    .csv("C:\\Users\\furkan\\spark\\Datasets\\OnlineRetail.csv")
    .withColumn(
        "UnitPrice",
        F.regexp_replace(F.col("UnitPrice"), ",", "."),
    )
)

In [78]:
# Display the first 20 rows, truncating long cell values (the show() defaults).
df.show(n=20, truncate=True)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|1.12.2010 08:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|1.12.2010 08:

In [79]:
# Pattern describing how InvoiceDate is written in the CSV ("mevcut" = current).
# The day uses a single "d" because the data contains unpadded days
# (e.g. "1.12.2010 08:26"); "dd" demands exactly two digits under Spark 3's
# datetime parser and would fail on those rows, while "d" accepts both
# one- and two-digit days.
mevcut_format = 'd.MM.yyyy HH:mm'

# Date/Time Operations

In [84]:
from pyspark.sql import functions as F

# Derive parsed date/timestamp columns from the textual InvoiceDate column.
dataframe2 = (
    df
    # Strip surrounding whitespace so the text lines up with the pattern.
    .withColumn("InvoiceDate", F.trim(F.col("InvoiceDate")))
    # Date-only value parsed with mevcut_format.
    .withColumn("normal_tarih", F.to_date(F.col("InvoiceDate"), mevcut_format))
    # Full timestamp parsed with the same pattern.
    .withColumn("standart_ts", F.to_timestamp(F.col("InvoiceDate"), mevcut_format))
)