In [1]:
from pyspark.sql import SparkSession
import os

# Environment setup: both PySpark interpreter variables are set to the same
# virtual-environment Python executable.
venv_python = r"C:\Users\Eivinas\DDA\ld2\venv\Scripts\python.exe"
os.environ["PYSPARK_PYTHON"] = venv_python
os.environ["PYSPARK_DRIVER_PYTHON"] = venv_python

# Create (or reuse) the session with a local master, 4g of driver memory
# and an explicit Spark scratch directory.
spark = (
    SparkSession.builder
    .appName("DDA LD2")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.local.dir", "C:/tmp/spark-temp")
    .getOrCreate()
)

sc = spark.sparkContext

# Display the session object to confirm that it started
spark

In [2]:
# Helper functions
def parse_line(line):
    """Parse one raw text line into a list of record dictionaries.

    A line holds zero or more entries, each introduced by "{{". Inside an
    entry the fields are separated by "}{" and every field has the form
    "key=value".

    Args:
        line: One line of the input file.

    Returns:
        A list with one dict per non-empty entry, mapping stripped keys to
        stripped values. Fields with no "=", an empty key or an empty value
        are skipped; an entry whose fields are all skipped yields an empty
        dict.
    """
    all_entries = []
    for entry in line.strip().split("{{"):
        if not entry:
            continue
        entry_dict = {}
        for field in entry.split("}{"):
            # Drop the closing braces left over at the end of the entry.
            field = field.strip("}")
            # partition() splits on the first "=" only, so values that
            # contain "=" stay intact and fields without "=" do not raise.
            key, _, value = field.partition("=")
            key, value = key.strip(), value.strip()
            if not key or not value:
                continue
            entry_dict[key] = value
        all_entries.append(entry_dict)
    return all_entries

# For the first task
def calculate_weight_group(weights):
    """Compute the minimum, maximum and average of a collection of weights.

    Args:
        weights: Iterable of numeric weights. ``None`` entries (for example
            values that failed conversion) are ignored.

    Returns:
        A dict with the keys "min", "max" and "avg". When no usable weight
        is present, "min" and "max" are ``None`` and "avg" is 0.
    """
    # Materialise once: the input may be a one-shot iterable.
    weights = [w for w in weights if w is not None]
    if not weights:
        # min()/max() raise ValueError on an empty sequence.
        return {"min": None, "max": None, "avg": 0}
    return {
        "min": min(weights),
        "max": max(weights),
        "avg": sum(weights) / len(weights),
    }

# Safely convert a value to any data type
def convert(val, type_name):
    """Convert ``val`` with ``type_name``, returning ``None`` on failure.

    Args:
        val: The value to convert (may be ``None``).
        type_name: A callable such as ``int`` or ``float`` that performs the
            conversion.

    Returns:
        ``type_name(val)``, or ``None`` if the conversion raised an ordinary
        exception.
    """
    try:
        return type_name(val)
    except Exception:
        # Deliberately broad so any converter can be used, but unlike a bare
        # "except:" this lets KeyboardInterrupt / SystemExit propagate.
        return None

In [3]:
# Read the input file as an RDD of text lines
rdd = sc.textFile("duom_full.txt")

# Parse each line into record dicts. The result is cached because the cells
# below run several independent actions on parsed_rdd; without caching, every
# action would re-read and re-parse the whole file.
parsed_rdd = rdd.flatMap(parse_line).cache()

In [4]:
# Task 1: minimum, maximum and average weight per weight group.
# The weight is converted BEFORE filtering so that records whose weight is
# missing or cannot be parsed as a float (convert returns None) are dropped
# instead of reaching min()/max() as None.
weights_rdd = (
    parsed_rdd
    .map(lambda x: (x.get("svorio grupe"), convert(x.get("svoris"), float)))
    .filter(lambda x: x[0] is not None and x[1] is not None)
)

results = weights_rdd.groupByKey().mapValues(calculate_weight_group).collect()
results

[('<50', {'min': 0.0, 'max': 50.0, 'avg': 5.964402623894628}),
 ('>300', {'min': 300.05, 'max': 6896.65, 'avg': 759.2116686954745}),
 ('<300', {'min': 50.05, 'max': 300.0, 'avg': 110.03327885846451})]

In [None]:
# Pair every record's route with its geographic zone; records missing either
# field are dropped.
routes_rdd = (
    parsed_rdd
    .map(lambda rec: (rec.get("marsrutas"), rec.get("geografine zona")))
    .filter(lambda pair: not (pair[0] is None or pair[1] is None))
)

# Number of distinct routes that have at least one zone recorded.
total_routes = routes_rdd.keys().distinct().count()

# Keep only the routes whose records span two or more distinct zones.
route_by_zone = (
    routes_rdd
    .groupByKey()
    .filter(lambda route_zones: len(set(route_zones[1])) >= 2)
)

routes_count = route_by_zone.count()
print(f"Routes with 2+ zones: {routes_count}/{total_routes} {routes_count/total_routes*100:.2f}%")

Routes with 2+ zones: 396/424 93.40%


In [None]:
# Key every record by (route, stop date) with its geographic zone as the
# value; records missing any of the three fields are dropped.
routes_rdd = (
    parsed_rdd
    .map(lambda rec: (
        (rec.get("marsrutas"), rec.get("sustojimo data")),
        rec.get("geografine zona"),
    ))
    .filter(lambda kv: all(part is not None for part in (kv[0][0], kv[0][1], kv[1])))
)

# (route, date) pairs that cover two or more distinct zones ...
multi_zone_route_days = (
    routes_rdd
    .groupByKey()
    .filter(lambda kv: len(set(kv[1])) >= 2)
)
# ... reduced to the distinct routes that have at least one such day.
routes_with_multi_zone_days = (
    multi_zone_route_days
    .keys()
    .map(lambda route_day: route_day[0])
    .distinct()
)

# total_routes comes from the preceding cell.
num_routes = routes_with_multi_zone_days.count()
print(f"Routes with 2+ zones on same day: {num_routes}/{total_routes} {num_routes/total_routes*100:.2f}%")

Routes with 2+ zones on same day: 394/424 92.92%


In [None]:
# Per (geographic zone, weekday of the stop): total parcel count and total
# client count. Records with a missing key part or a count that does not
# convert to int are dropped.
zone_day = (
    parsed_rdd
    .map(lambda x: (
        (x.get("geografine zona"), x.get('sustojimo savaites diena')),
        (convert(x.get("siuntu skaicius"), int), convert(x.get("Sustojimo klientu skaicius"), int)),
    ))
    .filter(lambda x: None not in x[0] and None not in x[1])
)

# Sum both counters element-wise per key, then order by (zone, weekday).
zone_day_sums = (
    zone_day
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .sortByKey()
)

results = zone_day_sums.collect()
# Header columns are tab-separated like the rows below, so they line up.
print("Zona\tDiena\tKlientai\tSiuntos")
for record in results:
    # Values are stored as (packages, clients) but printed clients-first to
    # match the header order.
    (zone, day), (total_packages, total_clients) = record
    print(f"{zone}\t{day}\t{total_clients}\t{total_packages}")

Zona	Diena	Klientai Siuntos
Z1	1	41580	86464
Z1	2	60167	123938
Z1	3	62241	126872
Z1	4	48430	98156
Z1	5	45999	88663
Z1	6	797	1754
Z2	1	11751	19525
Z2	2	18499	31377
Z2	3	18545	30715
Z2	4	14190	23727
Z2	5	14027	23883
Z2	6	72	136
Z3	1	11002	17149
Z3	2	17184	25642
Z3	3	17246	25785
Z3	4	12887	19499
Z3	5	12304	18423
Z3	6	23	26
