In [1]:
import pyspark.sql
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.Builder().appName('Sparky').getOrCreate()

In [3]:
df = spark.read.csv(r'C:\Users\muham\Downloads\cars2.csv', header= True)

In [4]:
# show the netire dataset description
df.show()

+---+------------+-------------+------------+
| cd|cars_per_cap|      country|drives_right|
+---+------------+-------------+------------+
| US|         809|United States|        TRUE|
|AUS|         731|    Australia|       FALSE|
|JAP|         588|        Japan|       FALSE|
| IN|          18|        India|       FALSE|
| RU|         200|       Russia|        TRUE|
|MOR|          70|      Morocco|        TRUE|
| EG|          45|        Egypt|        TRUE|
+---+------------+-------------+------------+



In [5]:
# we find the structure insight of the content
df.printSchema()

root
 |-- cd: string (nullable = true)
 |-- cars_per_cap: string (nullable = true)
 |-- country: string (nullable = true)
 |-- drives_right: string (nullable = true)



In [105]:
# we create a square root function to be used later
def sqre(x):
    return x**2

In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, FloatType

In [107]:
# we convert cars_per_cap to integer
df= df.withColumn('cars_per_cap', df['cars_per_cap'].cast(IntegerType()))

In [108]:
df.printSchema()

root
 |-- cd: string (nullable = true)
 |-- cars_per_cap: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- drives_right: string (nullable = true)



In [109]:
df['cd','country'].show()

+---+-------------+
| cd|      country|
+---+-------------+
| US|United States|
|AUS|    Australia|
|JAP|        Japan|
| IN|        India|
| RU|       Russia|
|MOR|      Morocco|
| EG|        Egypt|
+---+-------------+



In [110]:
df.filter(df.cars_per_cap > 500).show()

+---+------------+-------------+------------+
| cd|cars_per_cap|      country|drives_right|
+---+------------+-------------+------------+
| US|         809|United States|        TRUE|
|AUS|         731|    Australia|       FALSE|
|JAP|         588|        Japan|       FALSE|
+---+------------+-------------+------------+



In [111]:
df.select(df.country, df.cars_per_cap, sqre(df.cars_per_cap).alias('square_root')).show()

+-------------+------------+-----------+
|      country|cars_per_cap|square_root|
+-------------+------------+-----------+
|United States|         809|   654481.0|
|    Australia|         731|   534361.0|
|        Japan|         588|   345744.0|
|        India|          18|      324.0|
|       Russia|         200|    40000.0|
|      Morocco|          70|     4900.0|
|        Egypt|          45|     2025.0|
+-------------+------------+-----------+



In [112]:
def car_intervals(z):
    if z > 700:
        x = 'Cars are alot'
    elif z > 500:
        x = 'Cars are fairly in large numbers'
    elif z > 300:
        x = 'Cars are fair in numbers'
    else:
        x = 'Cars per cap are very less to be considered'
    
    return x

In [116]:
cars = int(input('Enter your cars: ') )
score = car_intervals(cars)
print('Thus its shown %d of %s' % (cars,score))

Enter your cars:  5


Thus its shown 5 of Cars per cap are very less to be considered


In [117]:
def catg(x):
    if x >= 700:
         r = 'too high'
    elif x >= 600:
        r = 'mediocre'
    elif x >= 500:
        r = 'substantial'
    elif x >= 400:
        r = 'can do better'
    elif x >= 200:
        r = 'Average'
    else:
        r = 'needs more work'
    return r

In [118]:
catg1 = udf(lambda x: catg(x), StringType())

In [119]:
df.select('country', df.cars_per_cap, catg1(df.cars_per_cap).alias('atg1')).show()

+-------------+------------+---------------+
|      country|cars_per_cap|           atg1|
+-------------+------------+---------------+
|United States|         809|       too high|
|    Australia|         731|       too high|
|        Japan|         588|    substantial|
|        India|          18|needs more work|
|       Russia|         200|        Average|
|      Morocco|          70|needs more work|
|        Egypt|          45|needs more work|
+-------------+------------+---------------+



In [134]:
# we can use lamda to create a single line function without issues
category = udf(lambda s: 'high' if s>= 600 else('Medium' if s >= 400 else 'low'), StringType())

In [133]:
df.select(df.country, df.cars_per_cap, catg1(df.cars_per_cap).alias('Sqre root'),category(df.cars_per_cap).alias('rank')).show()

+-------------+------------+---------------+------+
|      country|cars_per_cap|      Sqre root|  rank|
+-------------+------------+---------------+------+
|United States|         809|       too high|  high|
|    Australia|         731|       too high|  high|
|        Japan|         588|    substantial|Medium|
|        India|          18|needs more work|   low|
|       Russia|         200|        Average|   low|
|      Morocco|          70|needs more work|   low|
|        Egypt|          45|needs more work|   low|
+-------------+------------+---------------+------+



In [136]:
df = df.withColumn('cars_per-cap', df['cars_per_cap'].cast(IntegerType()))

In [137]:
df.show()

+---+------------+-------------+------------+------------+
| cd|cars_per_cap|      country|drives_right|cars_per-cap|
+---+------------+-------------+------------+------------+
| US|         809|United States|        TRUE|         809|
|AUS|         731|    Australia|       FALSE|         731|
|JAP|         588|        Japan|       FALSE|         588|
| IN|          18|        India|       FALSE|          18|
| RU|         200|       Russia|        TRUE|         200|
|MOR|          70|      Morocco|        TRUE|          70|
| EG|          45|        Egypt|        TRUE|          45|
+---+------------+-------------+------------+------------+



In [138]:
df.dtypes

[('cd', 'string'),
 ('cars_per_cap', 'int'),
 ('country', 'string'),
 ('drives_right', 'string'),
 ('cars_per-cap', 'int')]