## Inserting Trimmed Data into MongoDB

In [6]:
import json
from pymongo import MongoClient
import csv

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["Project"]

# Drop old collections if they exist
# db["business"].drop()


# === Trim and load only first 1000 rows of Bike data ===
with open("project_file.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f) 
    BikeData = [row for _, row in zip(range(1000), reader)]
db["Project"].insert_many(BikeData)
print(f"✅ Inserted {len(BikeData)} rows of data")


✅ Inserted 1000 rows of data


## Starting Spark Session

In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Bike Data Analysis").getOrCreate()

## Loading data from MongoDB via PyMongo

In [8]:
# Read documents, exclude '_id'
BikeData_docs = list(db["Project"].find({}, {"_id": 0}))

# Convert to Spark DataFrames
BikeData = spark.createDataFrame(BikeData_docs)

BikeData.show(10)

+--------------------+--------------------+----------------------+---------------+-------------------+-----------------+-------------------+-------------------+
|Covered distance (m)|Departure station id|Departure station name|Duration (sec.)|             Return|Return station id|Return station name|         ﻿Departure|
+--------------------+--------------------+----------------------+---------------+-------------------+-----------------+-------------------+-------------------+
|                1885|                 016|       Liisanpuistikko|            499|2021-05-01T00:08:15|              002|    Laivasillankatu|2021-04-30T23:59:54|
|                3359|                 020|     Kaisaniemenpuisto|            984|2021-05-01T00:16:16|              112| Rautatieläisenkatu|2021-04-30T23:59:52|
|                1633|                 004|             Viiskulma|            824|2021-05-01T00:13:40|              033|  Kauppakorkeakoulu|2021-04-30T23:59:52|
|                6459|            

In [None]:
from pyspark.sql.functions import avg
# Select only necessary fields

data_selected = BikeData.select(
    "Covered distance (m)",
    "Departure station name",
    "Return station name",
    "Duration (sec.)"
)
# Compute average
df_avg = data_selected.groupBy("business_id", "name", "categories") \
    .agg(avg("review_stars").alias("avg_rating")) \
    .orderBy("avg_rating", ascending=False)

df_avg.show(10, truncate=False)

In [None]:
from pyspark.sql.functions import avg

data_df = spark.createDataFrame(BikeData_docs)
# Valitaan olennaiset sarakkeet
data_selected = data_df.select(
    "Covered distance (m)",
    "Departure station name",
    "Return station name",
    "Duration (sec.)"
)

# Lasketaan keskiarvot lähtö- ja paluuaseman mukaan
df_avg = data_selected.groupBy(
    "Departure station name",
    "Return station name"
).agg(
    avg("Covered distance (m)").alias("avg_distance_m"),
    avg("Duration (sec.)").alias("avg_duration_sec")
).orderBy("avg_distance_m", ascending=False)

# Näytetään 10 pisimmän keskimatkan asemaparia
df_avg.show(10, truncate=False)
