In [6]:
import pandas as pd

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Display basic information about each dataset
customers_info = customers.info()
products_info = products.info()
transactions_info = transactions.info()

# Display the first few rows of each dataset
customers_head = customers.head()
products_head = products.head()
transactions_head = transactions.head()

customers_info, customers_head, products_info, products_head, transactions_info, transactions_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  


(None,
   CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
 None,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
 None,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  20

In [7]:
# Check for missing values and duplicates
missing_values = {
    "Customers": customers.isnull().sum(),
    "Products": products.isnull().sum(),
    "Transactions": transactions.isnull().sum()
}

duplicates = {
    "Customers": customers.duplicated().sum(),
    "Products": products.duplicated().sum(),
    "Transactions": transactions.duplicated().sum()
}

# Analyze regional distribution of customers
regional_distribution = customers['Region'].value_counts()

# Analyze product category distribution
category_distribution = products['Category'].value_counts()

# Analyze transaction trends
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
transactions['TransactionMonth'] = transactions['TransactionDate'].dt.to_period('M')
monthly_revenue = transactions.groupby('TransactionMonth')['TotalValue'].sum()

# Identify top customers by total spending
top_customers = transactions.groupby('CustomerID')['TotalValue'].sum().sort_values(ascending=False).head(10)

# Identify most popular products by quantity sold
popular_products = transactions.groupby('ProductID')['Quantity'].sum().sort_values(ascending=False).head(10)

missing_values, duplicates, regional_distribution, category_distribution, monthly_revenue.head(), top_customers, popular_products


({'Customers': CustomerID      0
  CustomerName    0
  Region          0
  SignupDate      0
  dtype: int64,
  'Products': ProductID      0
  ProductName    0
  Category       0
  Price          0
  dtype: int64,
  'Transactions': TransactionID      0
  CustomerID         0
  ProductID          0
  TransactionDate    0
  Quantity           0
  TotalValue         0
  Price              0
  dtype: int64},
 {'Customers': 0, 'Products': 0, 'Transactions': 0},
 Region
 South America    59
 Europe           50
 North America    46
 Asia             45
 Name: count, dtype: int64,
 Category
 Books          26
 Electronics    26
 Clothing       25
 Home Decor     23
 Name: count, dtype: int64,
 TransactionMonth
 2023-12     3769.52
 2024-01    66376.39
 2024-02    51459.27
 2024-03    47828.73
 2024-04    57519.06
 Freq: M, Name: TotalValue, dtype: float64,
 CustomerID
 C0141    10673.87
 C0054     8040.39
 C0065     7663.70
 C0156     7634.45
 C0082     7572.91
 C0188     7111.32
 C0059     70