# **Preparing Data**

In [15]:
import os

# Install the Kaggle API token (kaggle.json, expected next to the notebook)
# into the default credentials folder ~/.kaggle.
#
# Pure-Python file operations are used instead of shelling out to `cp` / `chmod`:
# they work on every platform, cope with paths containing spaces, and raise on
# failure instead of silently reporting success.
import shutil

json_path = os.path.join(os.getcwd(), "kaggle.json")

kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_dest = os.path.join(kaggle_dir, "kaggle.json")

if os.path.exists(json_path):
    shutil.copy(json_path, kaggle_json_dest)
elif not os.path.exists(kaggle_json_dest):
    # Neither the source token nor a previously installed copy is available.
    raise FileNotFoundError(
        f"kaggle.json not found at {json_path} and no existing copy in {kaggle_dir}"
    )

# Restrict the token to the current user (owner read/write only).
os.chmod(kaggle_json_dest, 0o600)

print("Kaggle authentication set up successfully!")



Kaggle authentication set up successfully!


In [17]:
import os
import shutil

# Keep the Kaggle token in a .kaggle directory inside the working directory.
new_dir = os.path.join(os.getcwd(), ".kaggle")

# Create the kaggle directory if it doesn't exist
os.makedirs(new_dir, exist_ok=True)

kaggle_json_path = os.path.join(new_dir, "kaggle.json")

# Move kaggle.json only when it is still next to the notebook, so re-running
# this cell after a successful move does not fail.
if os.path.exists("kaggle.json"):
    shutil.move("kaggle.json", kaggle_json_path)
elif not os.path.exists(kaggle_json_path):
    raise FileNotFoundError(
        f"kaggle.json not found in {os.getcwd()} and not already present in {new_dir}"
    )

# Set the correct permissions (owner read/write only)
os.chmod(kaggle_json_path, 0o600)

# The Kaggle API client looks in ~/.kaggle by default; KAGGLE_CONFIG_DIR points
# it at this custom location instead.
os.environ["KAGGLE_CONFIG_DIR"] = new_dir

print(f"Kaggle authentication file moved to: {new_dir}")


Kaggle authentication file moved to: g:\BIGDATA\TP3\.kaggle


In [9]:
!pip install kaggle



In [11]:
import kagglehub

# Download the latest version of the dataset; kagglehub returns the local
# directory that holds the downloaded files.
dataset_handle = "mkechinov/ecommerce-behavior-data-from-multi-category-store"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mkechinov/ecommerce-behavior-data-from-multi-category-store?dataset_version_number=8...


 26%|██▋       | 1.13G/4.29G [14:57<41:57, 1.35MB/s] 


KeyboardInterrupt: 

In [None]:
import os

# Locate the downloaded dataset inside the kagglehub cache. The cache root is
# taken from KAGGLEHUB_CACHE when set, otherwise from ~/.cache/kagglehub, so the
# path is not tied to one machine's home directory (e.g. /root on a hosted VM).
kagglehub_cache = os.environ.get(
    "KAGGLEHUB_CACHE",
    os.path.join(os.path.expanduser("~"), ".cache", "kagglehub"),
)
dataset_path = os.path.join(
    kagglehub_cache,
    "datasets",
    "mkechinov",
    "ecommerce-behavior-data-from-multi-category-store",
    "versions",
    "8",
)

# os.walk yields nothing for a missing directory, so say so explicitly instead
# of printing an empty listing.
if not os.path.isdir(dataset_path):
    print(f"Dataset directory not found: {dataset_path}")

# List all files
for root, _dirs, files in os.walk(dataset_path):
    for file in files:
        print(os.path.join(root, file))

/root/.cache/kagglehub/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store/versions/8/2019-Oct.csv
/root/.cache/kagglehub/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store/versions/8/2019-Nov.csv


# **Preparing Dependencies**

In [None]:
!pip install dask memory_profiler pandas pyarrow fastparquet

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: memory_profiler, fastparquet
Successfully installed fastparquet-2024.11.0 memory_profiler-0.61.0


In [None]:
import pandas as pd
import dask.dataframe as dd
import time
from memory_profiler import memory_usage
import gc

# **Cleaning The Data**

In [None]:
import os
import glob
import datetime

# Collect every CSV file located directly inside the dataset folder
csv_pattern = os.path.join(dataset_path, "*.csv")
csv_files = glob.glob(csv_pattern)

def extract_date(filename):
    """Turn a 'YYYY-Mon.csv' file name into a comparable datetime.

    The directory part and the '.csv' suffix are discarded, the remainder is
    split on '-' into a year and a month abbreviation (e.g. 'Oct'), and the
    first day of that month is returned.

    Args:
        filename: Path or bare file name such as '.../2019-Oct.csv'.

    Returns:
        datetime.datetime for the first day of the encoded month.

    Raises:
        ValueError: If the name does not split into exactly two parts on '-'
            or the parts cannot be parsed as a year / month abbreviation.
    """
    stem = os.path.basename(filename).replace(".csv", "")
    year_text, month_text = stem.split("-")
    # '%b' parses the abbreviated month name ('Oct' -> 10)
    parsed_month = datetime.datetime.strptime(month_text, "%b")
    return datetime.datetime(year=int(year_text), month=parsed_month.month, day=1)

# Pick the most recent month using the date encoded in the file name
# ('YYYY-Mon.csv', e.g. '2019-Nov.csv'). File creation times are not used: they
# reflect when the files were downloaded/extracted, not which month they cover.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {dataset_path}")

latest_file = max(csv_files, key=extract_date)

print(f"Processing latest file: {latest_file}")


In [None]:
# Draw a reproducible 10% random sample of the latest monthly file, reading it
# in chunks so the full CSV never has to fit in memory.
chunk_size = 500_000
sample_fraction = 0.1  # 10% of data
base_seed = 42

chunks = []
for chunk_index, chunk in enumerate(pd.read_csv(latest_file, chunksize=chunk_size)):
    # A distinct seed per chunk keeps the run reproducible while avoiding the
    # same row offsets being selected in every equal-sized chunk, which is what
    # a single fixed seed reused for each chunk would do.
    sampled_chunk = chunk.sample(
        frac=sample_fraction, random_state=base_seed + chunk_index
    )
    chunks.append(sampled_chunk)

sampled_data = pd.concat(chunks, ignore_index=True)
sampled_data.to_csv("sampled_ecommerce_data.csv", index=False)

print("Sampled dataset saved.")

In [None]:
import os
import pandas as pd
import gc

# Clean the sampled data chunk by chunk and stream the result to a new CSV.
file_path = os.path.join('sampled_ecommerce_data.csv')
chunk_size = 500_000  # Rows per chunk; tune to the available memory
output_file = "cleaned_ecommerce_data.csv"

first_chunk = True  # The first write creates the file and emits the header
event_counter = 0  # Next event_id to hand out; carried across chunks

for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    # Parse timestamps; values that cannot be parsed become NaT
    chunk["event_time"] = pd.to_datetime(chunk["event_time"], errors="coerce")

    # De-duplicate (within this chunk only), then discard category_code
    chunk = chunk.drop_duplicates().drop(columns=["category_code"])

    # Replace missing brand values with a placeholder label
    chunk = chunk.assign(brand=chunk["brand"].fillna("Unavailable"))

    # Number the surviving rows consecutively, continuing from earlier chunks
    rows_in_chunk = len(chunk)
    chunk["event_id"] = range(event_counter, event_counter + rows_in_chunk)
    event_counter += rows_in_chunk

    # Overwrite on the first chunk, append (without header) afterwards
    write_mode = "w" if first_chunk else "a"
    chunk.to_csv(output_file, mode=write_mode, header=first_chunk, index=False)
    first_chunk = False

    # Release the chunk before reading the next one
    del chunk
    gc.collect()



# **Exploring The Data**

In [None]:
# Load a reproducible 10% sample of the cleaned data (selected columns only)
# and print a quick overview of it.
chunk_size = 500_000
selected_cols = ["event_time", "event_type", "product_id", "brand", "price"]
explore_seed = 42  # Fixed seed so the exploration numbers are repeatable

chunks = []
for chunk_index, chunk in enumerate(
    pd.read_csv("cleaned_ecommerce_data.csv", usecols=selected_cols, chunksize=chunk_size)
):
    # Load only 10% of each chunk (adjust as needed); the seed varies per chunk
    # so different chunks do not all sample the same row offsets.
    chunks.append(chunk.sample(frac=0.1, random_state=explore_seed + chunk_index))

df = pd.concat(chunks, ignore_index=True)


# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()  # Check data types & missing values
print(df.describe())  # Summary stats for numerical columns
print(df["event_type"].value_counts())  # Count of each event type


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6740146 entries, 0 to 6740145
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   event_time  object 
 1   event_type  object 
 2   product_id  int64  
 3   brand       object 
 4   price       float64
dtypes: float64(1), int64(1), object(3)
memory usage: 257.1+ MB
None
         product_id         price
count  6.740146e+06  6.740146e+06
mean   1.251059e+07  2.924528e+02
std    1.724944e+07  3.555930e+02
min    1.000978e+06  0.000000e+00
25%    1.305996e+06  6.924000e+01
50%    5.100572e+06  1.657700e+02
75%    1.730075e+07  3.603700e+02
max    1.000286e+08  2.574070e+03
event_type
view        6355885
cart         292489
purchase      91772
Name: count, dtype: int64
