In [4]:
import findspark

In [5]:
# findspark.init() is called before pyspark is imported; per findspark's docs it
# locates the Spark installation and makes pyspark importable, so keep this order.
findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() returns the existing SparkSession if there is one, else builds a new one.
spark =  SparkSession.builder.getOrCreate()

In [8]:
# Sample rows as (ID, Name, Salary). Salary values are quoted strings, so Spark
# infers a string column for them (ID, a Python int, is inferred as long).
data = [
    (1, 'slkfkjshfdkjlhdk', '30000'),
    (2, 'kdhflkdshflkhdlkjf', '8987'),
    (3, 'udgfiuiewrui', '876876'),
]

# Only column names are given; column types are inferred from the row values.
schema = ['ID', 'Name', 'Salary']

df = spark.createDataFrame(data, schema)

# Print up to three rows without truncating long cell values.
df.show(3, truncate=False)

+---+------------------+------+
|ID |Name              |Salary|
+---+------------------+------+
|1  |slkfkjshfdkjlhdk  |30000 |
|2  |kdhflkdshflkhdlkjf|8987  |
|3  |udgfiuiewrui      |876876|
+---+------------------+------+



In [9]:
# Inspect the inferred schema: ID comes out as long, while Salary is a string
# because the sample rows supply it as quoted text.
df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)



In [10]:
# Print the docstring for DataFrame.withColumn (adds a column, or replaces an
# existing column of the same name) before using it in the cells below.
help(df.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName, col) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to add multiple columns can generate big
    plans which can cause performance issues and even `StackOverflowException`.
    To avoid this, use :func:`select` with the multiple columns at once.
    
    Examples
    

In [11]:
from pyspark.sql.functions import col

# Replace the string 'Salary' column with an integer cast of itself; reusing the
# existing column name overwrites that column instead of adding a new one.
df1 = df.withColumn('Salary', col('Salary').cast('Integer'))

df1.show()

# The schema should now report Salary as integer rather than string.
df1.printSchema()

+---+------------------+------+
| ID|              Name|Salary|
+---+------------------+------+
|  1|  slkfkjshfdkjlhdk| 30000|
|  2|kdhflkdshflkhdlkjf|  8987|
|  3|      udgfiuiewrui|876876|
+---+------------------+------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [12]:
# Double every salary, writing the result back into the same 'Salary' column.
df2 = df1.withColumn(
    'Salary',
    col('Salary') * 2,
)

df2.show()

+---+------------------+-------+
| ID|              Name| Salary|
+---+------------------+-------+
|  1|  slkfkjshfdkjlhdk|  60000|
|  2|kdhflkdshflkhdlkjf|  17974|
|  3|      udgfiuiewrui|1753752|
+---+------------------+-------+



In [14]:
from pyspark.sql.functions import lit

# lit() turns a Python value into a constant Column, so every row gets 'India'
# in the newly added 'Country' column.
df3 = df2.withColumn('Country', lit('India'))

df3.show()

+---+------------------+-------+-------+
| ID|              Name| Salary|Country|
+---+------------------+-------+-------+
|  1|  slkfkjshfdkjlhdk|  60000|  India|
|  2|kdhflkdshflkhdlkjf|  17974|  India|
|  3|      udgfiuiewrui|1753752|  India|
+---+------------------+-------+-------+



In [15]:
# Add 'copiedSalary' as a duplicate of the current 'Salary' values.
df4 = df3.withColumn(
    'copiedSalary',
    col('Salary'),
)

df4.show()

+---+------------------+-------+-------+------------+
| ID|              Name| Salary|Country|copiedSalary|
+---+------------------+-------+-------+------------+
|  1|  slkfkjshfdkjlhdk|  60000|  India|       60000|
|  2|kdhflkdshflkhdlkjf|  17974|  India|       17974|
|  3|      udgfiuiewrui|1753752|  India|     1753752|
+---+------------------+-------+-------+------------+



In [16]:
# Print the docstring for DataFrame.withColumnRenamed (existing name -> new name).
help(df.withColumnRenamed)

Help on method withColumnRenamed in module pyspark.sql.dataframe:

withColumnRenamed(existing, new) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by renaming an existing column.
    This is a no-op if schema doesn't contain the given column name.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    existing : str
        string, name of the existing column to rename.
    new : str
        string, new name of the column.
    
    Examples
    --------
    >>> df.withColumnRenamed('age', 'age2').collect()
    [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')]



In [17]:
# Re-check the schema of the original `df`: the earlier withColumn results were
# assigned to df1..df4, so `df` itself still has Salary as a string.
df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)



In [18]:
# withColumnRenamed returns a new DataFrame rather than modifying `df`; the
# result is not assigned here, so `df` keeps its 'Salary' column.
df.withColumnRenamed('Salary','Salary_Amount')

# Still shows the original column name 'Salary'.
df.show()

+---+------------------+------+
| ID|              Name|Salary|
+---+------------------+------+
|  1|  slkfkjshfdkjlhdk| 30000|
|  2|kdhflkdshflkhdlkjf|  8987|
|  3|      udgfiuiewrui|876876|
+---+------------------+------+



In [19]:
# Keep the DataFrame returned by the rename in its own variable so the new
# column name 'Salary_Amount' is actually visible when shown.
df5 = df.withColumnRenamed(existing='Salary', new='Salary_Amount')

df5.show()

+---+------------------+-------------+
| ID|              Name|Salary_Amount|
+---+------------------+-------------+
|  1|  slkfkjshfdkjlhdk|        30000|
|  2|kdhflkdshflkhdlkjf|         8987|
|  3|      udgfiuiewrui|       876876|
+---+------------------+-------------+

