In [2]:
# First, let's install/upgrade required packages
!pip install --upgrade seaborn matplotlib pandas numpy

# Now import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plotting style - using a different style that's definitely available
plt.style.use('default')  # This is safer than 'seaborn' style
%matplotlib inline

# Configure seaborn style directly
sns.set_theme()  # This is the better way to set seaborn styling

print('Libraries imported successfully!')

Libraries imported successfully!


In [5]:
# Read the raw event log from disk and take a first look at it
events_df = pd.read_csv('../data/events.csv')

# Headline numbers: table dimensions and the column names
n_rows, n_cols = events_df.shape
print("Dataset Overview:")
print("-" * 50)
print(f"Number of rows: {n_rows:,}")
print(f"Number of columns: {n_cols}")
print("\nColumns in the dataset:")
print(events_df.columns.tolist())

# Preview the top of the table
print("\nFirst few rows of the data:")
display(events_df.head())

# Per-column count of missing entries
print("\nMissing values in the dataset:")
display(events_df.isna().sum())

# Frequency of each value in the 'event' column: raw counts first,
# then the same counts expressed as a percentage of all rows
print("\nEvent type distribution:")
event_counts = events_df['event'].value_counts()
display(event_counts)
print("\nPercentage distribution:")
display(event_counts.div(n_rows).mul(100))

# Interpret 'timestamp' as milliseconds since the epoch and store it as
# datetimes, then report the earliest and latest event in the data
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'], unit='ms')
event_times = events_df['timestamp']
print("\nTimestamp range:")
print(f"Start: {event_times.min()}")
print(f"End: {event_times.max()}")

Dataset Overview:
--------------------------------------------------
Number of rows: 2,756,101
Number of columns: 5

Columns in the dataset:
['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']

First few rows of the data:


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,



Missing values in the dataset:


timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
dtype: int64


Event type distribution:


event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64


Percentage distribution:


event
view           96.669607
addtocart       2.515583
transaction     0.814810
Name: count, dtype: float64


Timestamp range:
Start: 2015-05-03 03:00:04.384000
End: 2015-09-18 02:59:47.788000
