1) How to convert the index of a PySpark DataFrame into a column?

```python
# Input: Assuming df is your DataFrame
df = spark.createDataFrame([
("Alice", 1),
("Bob", 2),
("Charlie", 3),
], ["Name", "Value"])

df.show()

+-------+-----+
| Name|Value|
+-------+-----+
| Alice| 1|
| Bob| 2|
|Charlie| 3|
+-------+-----+

# Output:
+-------+-----+-----+
| Name|Value|index|
+-------+-----+-----+
| Alice| 1| 0|
| Bob| 2| 1|
|Charlie| 3| 2|
+-------+-----+-----+
```

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=03f253645719b7bc043098db7ae7a0b571c52ce8acdc4ed5334e1aea65ed0f4e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("IndexColumnExample") \
    .getOrCreate()

# Assuming df is your DataFrame
df = spark.createDataFrame([
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
], ["Name", "Value"])

# Add a new column with index
df = df.withColumn("index", monotonically_increasing_id())

# Show the DataFrame
df.show()

# Stop SparkSession
spark.stop()


+-------+-----+----------+
|   Name|Value|     index|
+-------+-----+----------+
|  Alice|    1|         0|
|    Bob|    2|8589934592|
|Charlie|    3|8589934593|
+-------+-----+----------+



2) How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?

Compute the minimum, 25th percentile, median, 75th, and maximum of column `Age`

```python
# Create a sample DataFrame
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])
```

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("NumericColumnSummary") \
    .getOrCreate()

# Create a sample DataFrame
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])

# Compute summary statistics
summary = df.select("Age").summary("min", "25%", "50%", "75%", "max")

# Extract values from the summary DataFrame
min_age = summary.filter(col("summary") == "min").select("Age").collect()[0][0]
p25_age = summary.filter(col("summary") == "25%").select("Age").collect()[0][0]
median_age = summary.filter(col("summary") == "50%").select("Age").collect()[0][0]
p75_age = summary.filter(col("summary") == "75%").select("Age").collect()[0][0]
max_age = summary.filter(col("summary") == "max").select("Age").collect()[0][0]

# Display the results
print("Minimum Age:", min_age)
print("25th Percentile Age:", p25_age)
print("Median Age:", median_age)
print("75th Percentile Age:", p75_age)
print("Maximum Age:", max_age)

# Stop SparkSession
spark.stop()


Minimum Age: 10
25th Percentile Age: 20
Median Age: 30
75th Percentile Age: 50
Maximum Age: 86


3) Calculte the frequency counts of each unique value

```python
from pyspark.sql import Row

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)
```

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("ValueFrequencyCounts") \
    .getOrCreate()

# Sample data
data = [
    Row(name='John', job='Engineer'),
    Row(name='John', job='Engineer'),
    Row(name='Mary', job='Scientist'),
    Row(name='Bob', job='Engineer'),
    Row(name='Bob', job='Engineer'),
    Row(name='Bob', job='Scientist'),
    Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)

# Calculate frequency counts
freq_counts = df.groupBy("name", "job").count()

# Show the frequency counts
freq_counts.show()

# Stop SparkSession
spark.stop()


+----+---------+-----+
|name|      job|count|
+----+---------+-----+
|Mary|Scientist|    1|
|John| Engineer|    2|
| Sam|   Doctor|    1|
| Bob| Engineer|    2|
| Bob|Scientist|    1|
+----+---------+-----+



4) How to keep only top 2 most frequent values as it is and replace everything else as `Other`?

```python
from pyspark.sql import Row

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)
```

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, desc, when

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Top2FrequentValues") \
    .getOrCreate()

# Sample data
data = [
    Row(name='John', job='Engineer'),
    Row(name='John', job='Engineer'),
    Row(name='Mary', job='Scientist'),
    Row(name='Bob', job='Engineer'),
    Row(name='Bob', job='Engineer'),
    Row(name='Bob', job='Scientist'),
    Row(name='Sam', job='Doctor'),
]

# Create DataFrame
df = spark.createDataFrame(data)

# Calculate frequency counts
freq_counts = df.groupBy("job").count()

# Identify the top 2 most frequent values
top2 = freq_counts.orderBy(desc("count")).limit(2).select("job").rdd.flatMap(lambda x: x).collect()

# Replace everything else with "Other"
df = df.withColumn("job", when(col("job").isin(top2), col("job")).otherwise("Other"))

# Show the modified DataFrame
df.show()

# Stop SparkSession
spark.stop()


+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|    Other|
+----+---------+



5) How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names?

```python
# suppose you have the following DataFrame
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# old column names
old_names = ["col1", "col2", "col3"]

# new column names
new_names = ["new_col1", "new_col2", "new_col3"]
```

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("ColumnRenameExample") \
    .getOrCreate()

# Create DataFrame
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# Old column names
old_names = ["col1", "col2", "col3"]

# New column names
new_names = ["new_col1", "new_col2", "new_col3"]

# Rename columns
df_renamed = df.select([col(old).alias(new) for old, new in zip(old_names, new_names)])

# Show the renamed DataFrame
df_renamed.show()

# Stop SparkSession
spark.stop()


+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



6) How to bin a numeric list to 10 groups of equal size?

```python
from pyspark.sql.functions import rand
from pyspark.ml.feature import Bucketizer

# Create a DataFrame with a single column "values" filled with random numbers
num_items = 100
df = spark.range(num_items).select(rand(seed=42).alias("values"))
```

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from pyspark.ml.feature import Bucketizer
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("NumericBinningExample") \
    .getOrCreate()

# Create a DataFrame with a single column "values" filled with random numbers
num_items = 100
df = spark.range(num_items).select(rand(seed=42).alias("values"))

# Define the number of bins
num_bins = 10

# Calculate bin boundaries
min_value = df.agg({"values": "min"}).collect()[0][0]
max_value = df.agg({"values": "max"}).collect()[0][0]
bin_size = (max_value - min_value) / num_bins
bin_boundaries = [min_value + i * bin_size for i in range(num_bins + 1)]

# Create Bucketizer transformer
bucketizer = Bucketizer(splits=bin_boundaries, inputCol="values", outputCol="bin")

# Apply Bucketizer to the DataFrame
df_binned = bucketizer.transform(df)

# Show the binned DataFrame
df_binned.select("values", "bin").show()

# Stop SparkSession
spark.stop()


+--------------------+---+
|              values|bin|
+--------------------+---+
|   0.619189370225301|6.0|
|  0.5096018842446481|5.0|
|  0.8325259388871524|8.0|
| 0.26322809041172357|2.0|
|  0.6702867696264135|6.0|
|  0.5173283545794627|5.0|
|  0.9991441647585968|9.0|
| 0.06993233728279169|0.0|
|  0.9696695610826327|9.0|
|  0.7959575617927873|7.0|
|  0.4484250584033179|4.0|
|  0.6793959570375868|6.0|
|  0.3724113862805264|3.0|
|   0.832609472539921|8.0|
|  0.7479557402720448|7.0|
|  0.7216183163402288|7.0|
|0.016051221049720343|0.0|
|  0.6307120027798567|6.0|
|    0.07537082371587|0.0|
|   0.838930558220017|8.0|
+--------------------+---+
only showing top 20 rows

