In [2]:
# Install the required packages: the OpenJDK 8 JDK, pyspark and findspark
!apt-get install openjdk-8-jdk -y
!pip install pyspark
!pip install findspark

# Импорт библиотек и инициализация Spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, FloatType, DateType
import random
from datetime import datetime, timedelta
import os
import shutil
from google.colab import files

# Obtain the SparkSession for this notebook (reuses an existing one if present)
spark = (
    SparkSession.builder
    .appName("EcommerceDataGeneration")
    .getOrCreate()
)

# Data generation parameters
# Number of rows in the generated data set
num_rows = 1000
# Product names to sample from
products = [
    "Laptop",
    "Smartphone",
    "Tablet",
    "Headphones",
    "Smartwatch",
]
# (min, max) bounds for the quantity of each order
quantity_range = (1, 10)
# (min, max) bounds for the price of each order
price_range = (50, 1000)

# Random order data generator
def generate_order_data(num_rows):
    """Return a list of ``num_rows`` randomly generated order tuples.

    Each tuple is ``(date, user_id, product, quantity, price)``:

    * ``date`` -- a ``datetime`` equal to "now minus 365 days" plus a random
      whole number of days between 0 and 365 (inclusive);
    * ``user_id`` -- a random integer between 1 and 1000 (inclusive);
    * ``product`` -- a random element of the module-level ``products``;
    * ``quantity`` -- a random integer within the module-level
      ``quantity_range`` bounds (inclusive);
    * ``price`` -- a random float within the module-level ``price_range``
      bounds, rounded to two decimal places.
    """
    # Start of the one-year window ending at the moment of the call
    window_start = datetime.now() - timedelta(days=365)

    def make_row():
        # The draws happen in a fixed order: date, user, product, quantity, price
        order_date = window_start + timedelta(days=random.randint(0, 365))
        buyer_id = random.randint(1, 1000)
        item = random.choice(products)
        units = random.randint(*quantity_range)
        unit_price = round(random.uniform(*price_range), 2)
        return (order_date, buyer_id, item, units, unit_price)

    return [make_row() for _ in range(num_rows)]

# Build the Spark DataFrame from the generated rows
order_data = generate_order_data(num_rows)
# Column names, in the same order as the fields of each generated tuple
columns = [
    "Date",
    "UserID",
    "Product",
    "Quantity",
    "Price",
]
orders_df = spark.createDataFrame(order_data, schema=columns)

# Output location for the Spark CSV writer. Despite the ".csv" suffix this
# path is listed as a directory below: the data lands in a "part-*.csv" file
# inside it.
single_file_path = "/content/ecommerce_orders_single.csv"
# Final, conveniently named CSV file offered for download
final_csv_path = "/content/ecommerce_orders.csv"

# Write the data as a single partition so that one part file is expected
orders_df.coalesce(1).write.csv(single_file_path, header=True, mode="overwrite")

# Locate the part file written by Spark
part_files = [
    file_name
    for file_name in os.listdir(single_file_path)
    if file_name.startswith("part-") and file_name.endswith(".csv")
]
if not part_files:
    # Fail loudly here instead of letting the download step fail later or
    # silently serve a stale file left over from a previous run.
    raise FileNotFoundError(
        f"No part-*.csv file found in {single_file_path}"
    )

# Rename the part file to "ecommerce_orders.csv" for a convenient download
shutil.move(os.path.join(single_file_path, part_files[0]), final_csv_path)

# Download the file through the Colab browser helper
files.download(final_csv_path)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java libatk-wrapper-java-jni libfontenc1
  libgail-common libgail18 libgtk2.0-0 libgtk2.0-bin libgtk2.0-common libice-dev librsvg2-common
  libsm-dev libxkbfile1 libxt-dev libxtst6 libxxf86dga1 openjdk-8-jdk-headless openjdk-8-jre
  openjdk-8-jre-headless x11-utils
Suggested packages:
  gvfs libice-doc libsm-doc libxt-doc openjdk-8-demo openjdk-8-source visualvm libnss-mdns
  fonts-nanum fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei
  fonts-indic mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java libatk-wrapper-java-jni libfontenc1
  libgail-common libgail18 libgtk2.0-0 libgtk2.0-bin libgtk2.0-common libice-dev librsvg2-common
  libsm-dev libxkbfile1 libxt-dev libxtst6 libxxf86dga1 openjdk-

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>