In [6]:
from pathlib import Path

import pandas as pd

# Folder holding the monthly CSV exports. It is resolved from the current
# user's home directory instead of a hard-coded absolute path, so the notebook
# is not tied to a single machine (presumably the same folder as before on the
# original author's account -- confirm if the data lives elsewhere).
DATA_DIR = Path.home() / 'Downloads' / 'ecommerce_data'

# Paths to the CSV files (kept as plain strings, as in the original cell)
nov_csv_path = str(DATA_DIR / 'ecommerce_nov_19.csv')
oct_csv_path = str(DATA_DIR / 'ecommerce_oct_19.csv')

# Layout of the "event_time" strings, e.g. "2019-11-01 00:00:00 UTC"
EVENT_TIME_FORMAT = '%Y-%m-%d %H:%M:%S %Z'


def load_monthly_events(csv_path):
    """Read one monthly CSV export and tag every row with its month name.

    Parameters
    ----------
    csv_path : str or path-like
        Location of a CSV file that contains an "event_time" column.

    Returns
    -------
    pandas.DataFrame
        The rows of the file plus a "month" column (e.g. "November")
        derived from "event_time".

    Raises
    ------
    FileNotFoundError
        If ``csv_path`` does not exist (raised by ``pd.read_csv``).
    ValueError
        If an "event_time" value does not match ``EVENT_TIME_FORMAT``.
    """
    events = pd.read_csv(csv_path)
    # Passing the format explicitly means pandas does not have to infer the
    # timestamp layout, and malformed rows fail loudly instead of being guessed.
    event_time = pd.to_datetime(events['event_time'], format=EVENT_TIME_FORMAT)
    events['month'] = event_time.dt.month_name()
    return events


# Load each month through the same helper so both frames are built identically
nov_df = load_monthly_events(nov_csv_path)
oct_df = load_monthly_events(oct_csv_path)

# Stack November on top of October and renumber the rows from 0
combined_df = pd.concat([nov_df, oct_df], ignore_index=True)

# Display the first few rows of the combined DataFrame
print("Combined DataFrame:")
print(combined_df.head())

Combined DataFrame:
                event_time event_type  product_id          category_id  \
0  2019-11-01 00:00:00 UTC       view     1003461  2053013555631882655   
1  2019-11-01 00:00:00 UTC       view     5000088  2053013566100866035   
2  2019-11-01 00:00:01 UTC       view    17302664  2053013553853497655   
3  2019-11-01 00:00:01 UTC       view     3601530  2053013563810775923   
4  2019-11-01 00:00:01 UTC       view     1004775  2053013555631882655   

               category_code   brand   price    user_id  \
0     electronics.smartphone  xiaomi  489.07  520088904   
1  appliances.sewing_machine  janome  293.65  530496790   
2                        NaN   creed   28.31  561587266   
3  appliances.kitchen.washer      lg  712.87  518085591   
4     electronics.smartphone  xiaomi  183.27  558856683   

                           user_session     month  
0  4d3b30da-a5e4-49df-b1a8-ba5943f1dd33  November  
1  8e5f4f83-366c-4f70-860e-ca7417414283  November  
2  755422e7-9040-477b-9b

In [14]:
# Count the missing entries in every column to see which fields are incomplete.
missing_per_column = combined_df.isna().sum()

# Heading first (preceded by a blank line), then the per-column counts.
print("\nMissing values in the DataFrame:")
print(missing_per_column)



Missing values in the DataFrame:
event_time              0
event_type              0
product_id              0
category_id             0
category_code    35413780
brand            15341158
price                   0
user_id                 0
user_session           12
month                   0
dtype: int64


In [16]:
# category_code and brand are missing for roughly 35.4M and 15.3M rows
# respectively and are not needed for this analysis, so both columns are removed.
#
# The result is reassigned instead of dropped in place, and errors='ignore' is
# passed, so this cell can be re-run after the columns are already gone without
# raising a KeyError.
combined_df = combined_df.drop(columns=['category_code', 'brand'], errors='ignore')

# Display the updated DataFrame
print("Updated DataFrame after dropping columns:")
print(combined_df.head())

Updated DataFrame after dropping columns:
                event_time event_type  product_id          category_id  \
0  2019-11-01 00:00:00 UTC       view     1003461  2053013555631882655   
1  2019-11-01 00:00:00 UTC       view     5000088  2053013566100866035   
2  2019-11-01 00:00:01 UTC       view    17302664  2053013553853497655   
3  2019-11-01 00:00:01 UTC       view     3601530  2053013563810775923   
4  2019-11-01 00:00:01 UTC       view     1004775  2053013555631882655   

    price    user_id                          user_session     month  
0  489.07  520088904  4d3b30da-a5e4-49df-b1a8-ba5943f1dd33  November  
1  293.65  530496790  8e5f4f83-366c-4f70-860e-ca7417414283  November  
2   28.31  561587266  755422e7-9040-477b-9bd2-6a6e8fd97387  November  
3  712.87  518085591  3bfb58cd-7892-48cc-8020-2f17e6de6e7f  November  
4  183.27  558856683  313628f1-68b8-460d-84f6-cec7a8796ef2  November  


In [18]:
# Find out which kinds of user actions the log records by collecting the
# distinct values of the 'event_type' column.
event_types = combined_df.loc[:, 'event_type'].unique()

# Print the heading and the values on separate lines.
print("Unique event types:", event_types, sep="\n")

# Values seen in this dataset: 'view', 'cart', 'purchase'

Unique event types:
['view' 'cart' 'purchase']
