In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=6e7f390b960372864f979332db45f6e158342ee61a25a4d14fb33e0d7136206c
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import numpy as np
from pyspark import SparkContext

# Setting the seed for reproducibility
np.random.seed(10)

# Generating 100 random numbers between 0 and 10
random_numbers = np.random.randint(0, 11, 100)

# Creating a Spark context (assuming Spark is installed and configured)
sc = SparkContext.getOrCreate()

# Creating an RDD using the generated random numbers
rdd = sc.parallelize(random_numbers)

# Calculating the frequency of each number
frequency = rdd.countByValue()

# Sorting the results for better readability
sorted_frequency = dict(sorted(frequency.items()))

print(sorted_frequency)


{0: 12, 1: 11, 2: 8, 3: 6, 4: 8, 5: 5, 6: 11, 7: 5, 8: 14, 9: 12, 10: 8}


In [4]:
from pyspark import SparkContext

# Creating a Spark context
sc = SparkContext.getOrCreate()

# Loading the text8 dataset into an RDD
text8_rdd = sc.textFile("/content/text8")

# Splitting the dataset into words, mapping each word to (word, 1), reducing by key, and filtering words containing 'a'
word_counts = (text8_rdd.flatMap(lambda line: line.split())
                         .map(lambda word: (word, 1))
                         .reduceByKey(lambda a, b: a + b)
                         .filter(lambda word_count: 'a' in word_count[0]))

# Collecting the result to output
word_counts_with_a = word_counts.collect()

# Showing a subset of the results due to potential size
word_counts_with_a[:]


[('anarchism', 159),
 ('as', 17970),
 ('against', 1291),
 ('class', 384),
 ('radicals', 33),
 ('sans', 8),
 ('way', 832),
 ('act', 588),
 ('means', 562),
 ('anarchists', 113),
 ('political', 743),
 ('are', 10090),
 ('unnecessary', 25),
 ('abolished', 43),
 ('although', 1288),
 ('interpretations', 50),
 ('related', 422),
 ('social', 503),
 ('advocate', 43),
 ('particularly', 400),
 ('state', 1773),
 ('chaos', 30),
 ('rather', 646),
 ('anti', 812),
 ('regarded', 229),
 ('based', 1069),
 ('voluntary', 49),
 ('autonomous', 75),
 ('individuals', 255),
 ('mutual', 36),
 ('governance', 16),
 ('easily', 159),
 ('an', 9450),
 ('anarchist', 122),
 ('considerably', 72),
 ('especially', 520),
 ('human', 867),
 ('was', 16664),
 ('gatherer', 6),
 ('bands', 60),
 ('egalitarian', 4),
 ('labour', 84),
 ('accumulated', 19),
 ('wealth', 120),
 ('law', 927),
 ('equal', 184),
 ('taoism', 12),
 ('ancient', 704),
 ('similar', 642),
 ('individual', 321),
 ('bertrand', 24),
 ('at', 7926),
 ('arrive', 31),
 ('m

In [5]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("IrisDataAnalysis").getOrCreate()

# Load the iris JSON data into a Spark DataFrame
iris_df = spark.read.json("/content/iris.json")


#Calculate the Pearson correlation coefficient between petalLength and petalWidth using the corr function from the DataFrame API.

petal_length_width_corr = iris_df.stat.corr("petalLength", "petalWidth")
print(f"Pearson Correlation coefficient between petalLength and petalWidth: {petal_length_width_corr}")


#Filter the rows where petalLength is greater than or equal to 1.4 and select only the sepalLength, sepalWidth, and species columns to display.

filtered_df = iris_df.filter(iris_df.petalLength >= 1.4).select("sepalLength", "sepalWidth", "species")
filtered_df.show()


spark.stop()

Pearson Correlation coefficient between petalLength and petalWidth: 0.9626417223780231
+-----------+----------+-------+
|sepalLength|sepalWidth|species|
+-----------+----------+-------+
|        5.1|       3.5| setosa|
|        4.9|       3.0| setosa|
|        4.6|       3.1| setosa|
|        5.0|       3.6| setosa|
|        5.4|       3.9| setosa|
|        4.6|       3.4| setosa|
|        5.0|       3.4| setosa|
|        4.4|       2.9| setosa|
|        4.9|       3.1| setosa|
|        5.4|       3.7| setosa|
|        4.8|       3.4| setosa|
|        4.8|       3.0| setosa|
|        5.7|       4.4| setosa|
|        5.1|       3.5| setosa|
|        5.7|       3.8| setosa|
|        5.1|       3.8| setosa|
|        5.4|       3.4| setosa|
|        5.1|       3.7| setosa|
|        5.1|       3.3| setosa|
|        4.8|       3.4| setosa|
+-----------+----------+-------+
only showing top 20 rows

