In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from datetime import datetime
from pyspark.sql.window import Window

# --- Spark session ----------------------------------------------------------
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not spawn a second session.
spark = (
    SparkSession.builder
    .appName("CompanyUsers")
    .getOrCreate()
)

# --- Schema -----------------------------------------------------------------
# Three columns, each declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("company_id", IntegerType()),
        ("user_id", IntegerType()),
        ("language", StringType()),
    )
])

# --- Sample rows ------------------------------------------------------------
# Each tuple is (company_id, user_id, language); a user appears once per
# language listed for them.
data = [
    # company 1
    (1, 1, 'English'),
    (1, 1, 'German'),
    (1, 2, 'English'),
    (1, 3, 'German'),
    (1, 3, 'English'),
    (1, 4, 'English'),
    # company 2
    (2, 5, 'English'),
    (2, 5, 'German'),
    (2, 5, 'Spanish'),
    (2, 6, 'German'),
    (2, 6, 'Spanish'),
    (2, 7, 'English'),
]

# --- DataFrame + SQL view ---------------------------------------------------
company_users_df = spark.createDataFrame(data, schema=schema)

# Print the rows as a sanity check of what was loaded.
company_users_df.show()

# Register the frame under the name "users" so spark.sql() queries can read it.
company_users_df.createOrReplaceTempView("users")



+----------+-------+--------+
|company_id|user_id|language|
+----------+-------+--------+
|         1|      1| English|
|         1|      1|  German|
|         1|      2| English|
|         1|      3|  German|
|         1|      3| English|
|         1|      4| English|
|         2|      5| English|
|         2|      5|  German|
|         2|      5| Spanish|
|         2|      6|  German|
|         2|      6| Spanish|
|         2|      7| English|
+----------+-------+--------+



In [9]:
# Companies that have at least two users who speak BOTH English and German.
#
# Inner query: restrict to English/German rows and keep the users that have
# both languages. COUNT(DISTINCT language) is used rather than a raw row
# count so that a repeated (user_id, language) row can neither make a
# single-language user qualify nor push a genuinely bilingual user past the
# "= 2" check.
# Outer query: count the qualifying users per company and keep companies
# with two or more. The outer aggregate is intentionally left unaliased so
# the displayed column header remains `count(1)`.
spark.sql(
"""
    select company_id, count(1) from (
    select company_id, user_id, count(distinct language) as languages_spoken
    from users
    where language in ('English', 'German')
    group by company_id, user_id
    having count(distinct language) = 2) as bilingual_users
    group by company_id
    having count(1) >= 2
""").show()

+----------+--------+
|company_id|count(1)|
+----------+--------+
|         1|       2|
+----------+--------+

