In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("OddEvenCount").getOrCreate()

In [0]:
# Read the integer.txt file
# Adjust the path according to your file location in Databricks
df = spark.read.text("/FileStore/tables/integer.txt")

In [0]:
# Show the content of the dataframe
df.show()

+-----+
|value|
+-----+
|45687|
| 5463|
|34443|
|  223|
|  456|
| 4667|
|  234|
| 9008|
| 1234|
| 2597|
| 6253|
|10399|
| 3312|
| 2175|
|20087|
| 6698|
|31512|
|  618|
|11698|
|13969|
+-----+
only showing top 20 rows



In [0]:
# Convert the dataframe to RDD and extract the integers
numbers_rdd = df.rdd.map(lambda row: int(row[0]))

In [0]:
numbers_rdd.collect()

Out[12]: [45687,
 5463,
 34443,
 223,
 456,
 4667,
 234,
 9008,
 1234,
 2597,
 6253,
 10399,
 3312,
 2175,
 20087,
 6698,
 31512,
 618,
 11698,
 13969,
 6575,
 24846,
 1313,
 30977,
 20927,
 24771,
 20293,
 24210,
 26005,
 9680,
 15297,
 4099,
 29625,
 9374,
 7645,
 7660,
 7025,
 15755,
 927,
 22795,
 7948,
 29343,
 23398,
 5755,
 27339,
 19416,
 3686,
 21687,
 16513,
 32643,
 10047,
 11831,
 14036,
 11079,
 23852,
 11767,
 18502,
 28848,
 21754,
 4309,
 4104,
 4656,
 18641,
 20317,
 29760,
 8784,
 20048,
 26629,
 16134,
 15865,
 26432,
 22918,
 20931,
 15958,
 17856,
 13488,
 771,
 28089,
 15865,
 22489,
 5740,
 12871,
 28999,
 9373,
 31617,
 8185,
 19684,
 25075,
 17319,
 20320,
 21116,
 32064,
 15681,
 11990,
 12927,
 31383,
 2469,
 21500,
 4933,
 21212,
 12974,
 367,
 19773,
 7553,
 16500,
 17984,
 23355,
 10587,
 19628,
 31895,
 16349,
 27796,
 7806,
 13608,
 2427,
 8765,
 6128,
 20983,
 24368,
 3012,
 4519,
 7670,
 16815,
 26142,
 5599,
 13534,
 6320,
 30598,
 18053,
 26646,
 748

In [0]:

# Function to determine if a number is odd or even
def odd_even(num):
    if num % 2 == 0:
        return ("Even", 1)
    else:
        return ("Odd", 1)

In [0]:
# Map the numbers to odd/even and reduce by key to count them
count_rdd = numbers_rdd.map(odd_even).reduceByKey(lambda a, b: a + b)

In [0]:
# Collect the result
result = count_rdd.collect()


In [0]:
# Print the result
for key, count in result:
    print(f"{key}: {count}")

Odd: 496
Even: 514


In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, sum as spark_sum

In [0]:
# Create Spark session
spark = SparkSession.builder.appName("SalarySumPerDepartment").getOrCreate()

In [0]:
# Read the salary.txt file
# Adjust the path according to your file location in Databricks
df = spark.read.text("/FileStore/tables/salary.txt")

In [0]:
# Split the lines into department and salary
split_col = split(df['value'], ' ')
df = df.withColumn('department', split_col.getItem(0))
df = df.withColumn('salary', split_col.getItem(1).cast('float'))

In [0]:
display(df)

value,department,salary
Sales 9136,Sales,9136.0
Research 13391,Research,13391.0
Developer 22220,Developer,22220.0
QA 31888,QA,31888.0
Marketing 22215,Marketing,22215.0
Sales 45567,Sales,45567.0
Research 4023,Research,4023.0
Developer 7262,Developer,7262.0
QA 5243,QA,5243.0
Marketing 11425,Marketing,11425.0


In [0]:
# Group by department and calculate the sum of salaries
result = df.groupBy('department').agg(spark_sum('salary').alias('salary_sum'))

In [0]:
result.show()

+----------+----------+
|department|salary_sum|
+----------+----------+
|     Sales| 3488491.0|
| Developer| 3221394.0|
|  Research| 3328284.0|
| Marketing| 3158450.0|
|        QA| 3360624.0|
+----------+----------+



In [0]:
display(result)

department,salary_sum
Sales,3488491.0
Developer,3221394.0
Research,3328284.0
Marketing,3158450.0
QA,3360624.0


In [0]:
# Create Spark session
spark = SparkSession.builder.appName("WordCount").getOrCreate()

# Read the shakespeare.txt file
# Adjust the path according to your file location in Databricks
df = spark.read.text("/FileStore/tables/shakespeare_1.txt")

In [0]:
display(df)

value
This eBook is for the use of anyone anywhere at no cost and with
"almost no restrictions whatsoever. You may copy it, give it away or"
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
"** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **"
** Please follow the copyright guidelines in this file. **
Title: The Complete Works of William Shakespeare
Author: William Shakespeare
"Posting Date: September 1, 2011 [EBook #100]"
"Release Date: January, 1994"


In [0]:
# Convert the dataframe to RDD
lines_rdd = df.rdd.map(lambda row: row[0])

In [0]:
lines_rdd.collect()

Out[25]: ['This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org',
 '',
 '** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **',
 '**     Please follow the copyright guidelines in this file.     **',
 '',
 'Title: The Complete Works of William Shakespeare',
 '',
 'Author: William Shakespeare',
 '',
 'Posting Date: September 1, 2011 [EBook #100]',
 'Release Date: January, 1994',
 '',
 'Language: English',
 '',
 'Character set encoding: ASCII',
 '',
 '*** START OF THIS PROJECT GUTENBERG EBOOK COMPLETE WORKS--WILLIAM SHAKESPEARE ***',
 '',
 '',
 '',
 '',
 'Produced by World Library, Inc., from their Library of the Future',
 '',
 '',
 '',
 '',
 'This is the 100th Etext file presented by Project Gutenberg, and',
 'is presented in cooperation with World Library, Inc., from t

In [0]:
# List of words to count
words_to_count = ["Shakespeare", "When", "Lord", "Library", "GUTENBERG", "WILLIAM", "COLLEGE", "WORLD"]

# Function to count specified words in a line
def word_count(line, words):
    line_words = line.split()
    word_counts = []
    for word in words:
        count = line_words.count(word)
        if count > 0:
            word_counts.append((word, count))
    return word_counts

In [0]:
# FlatMap the lines RDD to count occurrences of the specified words
word_counts_rdd = lines_rdd.flatMap(lambda line: word_count(line, words_to_count))

In [0]:
# Reduce by key to sum the counts for each word
word_counts = word_counts_rdd.reduceByKey(lambda a, b: a + b)

In [0]:

# Collect the result
result = word_counts.collect()

In [0]:

# Print the result
for word, count in result:
    print(f"{word}: {count}")

Shakespeare: 22
GUTENBERG: 99
Library: 2
WILLIAM: 115
WORLD: 98
COLLEGE: 98
When: 393
Lord: 341


In [0]:
from pyspark.sql.functions import explode, split, col
from pyspark.sql.types import StringType

In [0]:
# Create Spark session
spark = SparkSession.builder.appName("TopBottomWordCount").getOrCreate()

In [0]:
# Split lines into words
words_df = df.select(explode(split(col("value"), "\\s+")).alias("word"))

In [0]:
# Remove any empty strings resulting from multiple spaces
words_df = words_df.filter(words_df.word != "")

In [0]:
display(words_df)

word
This
eBook
is
for
the
use
of
anyone
anywhere
at


In [0]:
# Count the occurrences of each word
word_counts_df = words_df.groupBy("word").count()

In [0]:
# Sort the words by count in descending order for the top 15
top_15_words = word_counts_df.orderBy(col("count").desc()).limit(15)

# Sort the words by count in ascending order for the bottom 15
bottom_15_words = word_counts_df.orderBy(col("count").asc()).limit(15)

In [0]:
# Show the top 15 words
print("Top 15 words:")
top_15_words.show()


Top 15 words:
+----+-----+
|word|count|
+----+-----+
| the|11397|
| and| 8777|
|   I| 8556|
|  of| 7873|
|  to| 7421|
|   a| 5672|
|  my| 4913|
|  in| 4600|
| you| 4060|
| And| 3547|
|that| 3522|
|  is| 3481|
| his| 3226|
|with| 3175|
| not| 3129|
+----+-----+



In [0]:
# Show the bottom 15 words
print("Bottom 15 words:")
bottom_15_words.show()

Bottom 15 words:
+----------+-----+
|      word|count|
+----------+-----+
| soundness|    1|
|    spoke;|    1|
|    Paris?|    1|
|      AWAY|    1|
|occidental|    1|
|    pluck,|    1|
|commanders|    1|
|     lust.|    1|
|   'Demand|    1|
|  commits.|    1|
|     inner|    1|
|   gav'st,|    1|
| DERCETAS,|    1|
|    online|    1|
|  Acquaint|    1|
+----------+-----+

