In [1]:
# findspark.init() is called here, before any pyspark import in the cells below.
# NOTE(review): presumably it locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session used by every cell below (getOrCreate() returns the
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("SplitFunctionTest")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/18 17:18:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Column names, listed in the same order as the fields of each record below.
# The names are kept exactly as-is (including the space in "full name") because
# later cells look the columns up by these strings.
people_schema = (
    "id",
    "full name",
    "birth_date",
    "married",
    "anual_salary",
    "profession",
)

# Ten sample records, one tuple per person:
# (id, full name, birth date as "YYYY-MM-DD" text, married flag, salary, profession)
people_data = [
    (1, "Andrew James", "1987-12-25", True, 140000, "model"),
    (2, "Linda Scott", "1990-05-14", False, 95000, "engineer"),
    (3, "Michael Brown", "1985-11-07", True, 120000, "teacher"),
    (4, "Jessica Wilson", "1992-03-22", False, 75000, "designer"),
    (5, "John Davis", "1980-08-19", True, 160000, "lawyer"),
    (6, "Emily Clark", "1995-02-10", False, 98000, "architect"),
    (7, "David Evans", "1983-07-15", True, 130000, "doctor"),
    (8, "Sophia Harris", "1988-11-30", False, 110000, "nurse"),
    (9, "Daniel Martinez", "1991-06-01", True, 145000, "consultant"),
    (10, "Olivia Thompson", "1986-09-12", False, 102000, "scientist"),
]

In [7]:
# Build the base DataFrame from the in-memory records; column types are
# inferred from the Python values, column names come from people_schema.
df_people = spark.createDataFrame(people_data, people_schema)

In [8]:
# Print df_people as a text table to check the loaded rows and column names.
df_people.show()

                                                                                

+---+---------------+----------+-------+------------+----------+
| id|      full name|birth_date|married|anual_salary|profession|
+---+---------------+----------+-------+------------+----------+
|  1|   Andrew James|1987-12-25|   true|      140000|     model|
|  2|    Linda Scott|1990-05-14|  false|       95000|  engineer|
|  3|  Michael Brown|1985-11-07|   true|      120000|   teacher|
|  4| Jessica Wilson|1992-03-22|  false|       75000|  designer|
|  5|     John Davis|1980-08-19|   true|      160000|    lawyer|
|  6|    Emily Clark|1995-02-10|  false|       98000| architect|
|  7|    David Evans|1983-07-15|   true|      130000|    doctor|
|  8|  Sophia Harris|1988-11-30|  false|      110000|     nurse|
|  9|Daniel Martinez|1991-06-01|   true|      145000|consultant|
| 10|Olivia Thompson|1986-09-12|  false|      102000| scientist|
+---+---------------+----------+-------+------------+----------+



## First method: call `split` inline in each `withColumn`

In [9]:
from pyspark.sql.functions import split

In [22]:
# split() turns "full name" into an array of space-separated pieces;
# getItem(0) / getItem(1) pick the first and second piece respectively.
# The split expression is written out inline for each new column.
df_split_1 = (
    df_people
    .withColumn("first_name", split(df_people["full name"], ' ').getItem(0))
    .withColumn("last_name", split(df_people["full name"], ' ').getItem(1))
)

df_split_1.show()

+---+---------------+----------+-------+------------+----------+----------+---------+
| id|      full name|birth_date|married|anual_salary|profession|first_name|last_name|
+---+---------------+----------+-------+------------+----------+----------+---------+
|  1|   Andrew James|1987-12-25|   true|      140000|     model|    Andrew|    James|
|  2|    Linda Scott|1990-05-14|  false|       95000|  engineer|     Linda|    Scott|
|  3|  Michael Brown|1985-11-07|   true|      120000|   teacher|   Michael|    Brown|
|  4| Jessica Wilson|1992-03-22|  false|       75000|  designer|   Jessica|   Wilson|
|  5|     John Davis|1980-08-19|   true|      160000|    lawyer|      John|    Davis|
|  6|    Emily Clark|1995-02-10|  false|       98000| architect|     Emily|    Clark|
|  7|    David Evans|1983-07-15|   true|      130000|    doctor|     David|    Evans|
|  8|  Sophia Harris|1988-11-30|  false|      110000|     nurse|    Sophia|   Harris|
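|  9|Daniel Martinez|1991-06-01|   true|      145000|consultant|    Daniel| Martinez|
| 10|Olivia Thompson|1986-09-12|  false|      102000| scientist|    Olivia| Thompson|
+---+---------------+----------+-------+------------+----------+----------+---------+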

## Second method: build the `split` expression once and reuse it

In [17]:
# Reusable column expression: "full name" split on single spaces.
# Nothing is computed here; it is only evaluated when used in a DataFrame operation.
split_col = split(df_people["full name"], " ")

In [26]:
# Same result as the first method, but both new columns are derived from the
# single split_col expression defined in the previous cell.
df_split_2 = (
    df_people
    .withColumn("first_name", split_col.getItem(0))
    .withColumn("last_name", split_col.getItem(1))
)
df_split_2.show()

+---+---------------+----------+-------+------------+----------+----------+---------+
| id|      full name|birth_date|married|anual_salary|profession|first_name|last_name|
+---+---------------+----------+-------+------------+----------+----------+---------+
|  1|   Andrew James|1987-12-25|   true|      140000|     model|    Andrew|    James|
|  2|    Linda Scott|1990-05-14|  false|       95000|  engineer|     Linda|    Scott|
|  3|  Michael Brown|1985-11-07|   true|      120000|   teacher|   Michael|    Brown|
|  4| Jessica Wilson|1992-03-22|  false|       75000|  designer|   Jessica|   Wilson|
|  5|     John Davis|1980-08-19|   true|      160000|    lawyer|      John|    Davis|
|  6|    Emily Clark|1995-02-10|  false|       98000| architect|     Emily|    Clark|
|  7|    David Evans|1983-07-15|   true|      130000|    doctor|     David|    Evans|
|  8|  Sophia Harris|1988-11-30|  false|      110000|     nurse|    Sophia|   Harris|
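|  9|Daniel Martinez|1991-06-01|   true|      145000|consultant|    Daniel| Martinez|
| 10|Olivia Thompson|1986-09-12|  false|      102000| scientist|    Olivia| Thompson|
+---+---------------+----------+-------+------------+----------+----------+---------+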

## Third method: split `birth_date` inside `select`

In [20]:
# birth_date is stored as "YYYY-MM-DD" text, so splitting on "-" gives
# [year, month, day]; item 0 is the year (still a string, not an integer).
split_date = split(df_people["birth_date"], "-")

# select() keeps only the listed columns plus the derived "year" column.
df_split_date = df_people.select(
    "id",
    "full name",
    "married",
    split_date.getItem(0).alias("year"),
)

df_split_date.show()

+---+---------------+-------+----+
| id|      full name|married|year|
+---+---------------+-------+----+
|  1|   Andrew James|   true|1987|
|  2|    Linda Scott|  false|1990|
|  3|  Michael Brown|   true|1985|
|  4| Jessica Wilson|  false|1992|
|  5|     John Davis|   true|1980|
|  6|    Emily Clark|  false|1995|
|  7|    David Evans|   true|1983|
|  8|  Sophia Harris|  false|1988|
|  9|Daniel Martinez|   true|1991|
| 10|Olivia Thompson|  false|1986|
+---+---------------+-------+----+



## Combined method: split the name and the date in one `select`

In [32]:
# One select() that keeps id/married and derives three columns at once:
# first and last name from "full name", and the birth year from "birth_date".
df_split_combine = df_people.select(
    "id",
    "married",
    split(df_people["full name"], ' ').getItem(0).alias("first_name"),
    split(df_people["full name"], ' ').getItem(1).alias("last_name"),
    split(df_people["birth_date"], '-').getItem(0).alias("born_year"),
)

df_split_combine.show()

+---+-------+----------+---------+---------+
| id|married|first_name|last_name|born_year|
+---+-------+----------+---------+---------+
|  1|   true|    Andrew|    James|     1987|
|  2|  false|     Linda|    Scott|     1990|
|  3|   true|   Michael|    Brown|     1985|
|  4|  false|   Jessica|   Wilson|     1992|
|  5|   true|      John|    Davis|     1980|
|  6|  false|     Emily|    Clark|     1995|
|  7|   true|     David|    Evans|     1983|
|  8|  false|    Sophia|   Harris|     1988|
|  9|   true|    Daniel| Martinez|     1991|
| 10|  false|    Olivia| Thompson|     1986|
+---+-------+----------+---------+---------+



## Drop the original column after splitting

In [37]:
# Derive first_name from "full name", then remove the original "full name"
# column so only the extracted value remains.
df_split_drop = (
    df_people
    .withColumn("first_name", split(df_people["full name"], ' ').getItem(0))
    .drop(df_people["full name"])
)
df_split_drop.show()

+---+----------+-------+------------+----------+----------+
| id|birth_date|married|anual_salary|profession|first_name|
+---+----------+-------+------------+----------+----------+
|  1|1987-12-25|   true|      140000|     model|    Andrew|
|  2|1990-05-14|  false|       95000|  engineer|     Linda|
|  3|1985-11-07|   true|      120000|   teacher|   Michael|
|  4|1992-03-22|  false|       75000|  designer|   Jessica|
|  5|1980-08-19|   true|      160000|    lawyer|      John|
|  6|1995-02-10|  false|       98000| architect|     Emily|
|  7|1983-07-15|   true|      130000|    doctor|     David|
|  8|1988-11-30|  false|      110000|     nurse|    Sophia|
|  9|1991-06-01|   true|      145000|consultant|    Daniel|
| 10|1986-09-12|  false|      102000| scientist|    Olivia|
+---+----------+-------+------------+----------+----------+

