# Task 1: Pandas Fundamentals & Exploration

## Part A: Data Loading

In [1]:
import pandas as pd
import numpy as np

customers_df = pd.read_csv('customers.csv')
products_df = pd.read_csv('products.csv')  
transactions_df = pd.read_csv('transactions.csv')


## Part B: Data Exploration

### 1. Basic Information

In [2]:
# first 5 rows of each dataframe

print(customers_df.head())
print("__"  *40)
print(products_df.head())
print("__"  *40)
print(transactions_df.head())
print("==" *40)

# last 5 rows of each dataframe

print(customers_df.tail())
print("__"  *40)
print(products_df.tail())
print("__"  *40)
print(transactions_df.tail())
print("==" *40)

# shape of each dataframe
print("Customers shape:", customers_df.shape)
print("Products shape:", products_df.shape)
print("Transactions shape:", transactions_df.shape)
print("==" *40)


# column names, data types and memory usage of each dataframe
print("Customers info:")
print(customers_df.info())
print("__"  *40)
print("Products info:")
print(products_df.info())
print("__"  *40)
print("Transactions info:")
print(transactions_df   .info())
print("==" *40)






  customer_id            name                     email registration_date  \
0        C001     Logan Brown                       NaN        2024-01-01   
1        C002  John Rodriguez   emma.johnson1@email.com        2024-01-02   
2        C003       Ava Davis                       NaN        2024-01-04   
3        C004   William Brown                       NaN        2024-01-06   
4        C005   Abigail Moore  william.jones4@email.com        2024-01-08   

     country age  
0     Canada  39  
1     France  28  
2  Australia  65  
3      Italy  33  
4     Canada  50  
________________________________________________________________________________
  product_id   product_name     category   price  stock
0       P001        Speaker  Electronics  353.96     15
1       P002   Science Book        Books   34.88  11246
2       P003        Sweater     Clothing   23.53     97
3       P004     Smartphone  Electronics   56.05     86
4       P005  Running Shoes       sports -339.29     50
______

### 2. Statistical Summary

In [3]:
# describing numerical columns in each dataframe


# customers dataframe has no numerical columns to describe
# we could have used describe(include='number') to get only numerical columns
print("Products statistical summary:")
print(products_df[['price','stock']].describe())
print("__"  *40)
print("Transactions statistical summary:")
print(transactions_df['quantity'].describe())
print("==" *40)

# describing categorical columns in each dataframe
print("Customers categorical summary:")
print(customers_df.describe(include='object'))
print("__"  *40)
print("Products categorical summary:")
print(products_df.describe(include='object'))
print("__"  *40)
print("Transactions categorical summary:")
print(transactions_df.describe(include='object'))
print("=="*40)

# identifying unique value counts for categorical columns in each dataframe
print("Customers unique value counts:")
print(customers_df.select_dtypes(include='object').nunique())
print("__"  *40)
print("Products unique value counts:")
print(products_df.select_dtypes(include='object').nunique())
print("__"  *40)
print("Transactions unique value counts:")
print(transactions_df.select_dtypes(include='object').nunique())
print("==" *40)





Products statistical summary:
            price         stock
count   47.000000     50.000000
mean   165.770638    615.700000
std    200.448722   2328.207581
min   -488.170000      0.000000
25%     41.880000     21.000000
50%    179.690000     61.000000
75%    298.270000     89.750000
max    491.100000  11246.000000
________________________________________________________________________________
Transactions statistical summary:
count    492.000000
mean       3.002033
std        1.410607
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max        5.000000
Name: quantity, dtype: float64
Customers categorical summary:
       customer_id            name                      email  \
count          205             205                        185   
unique         200             160                        180   
top           C115  Noah Rodriguez  emily.johnson47@email.com   
freq             2               5                          2   

       registration

### 3. Data Quality Check

In [4]:
# checking for missing values in each dataframe

print("Customers missing values:")
print(customers_df.isnull().sum())
print("__"  *40)
print("Products missing values:")
print(products_df.isnull().sum())
print("__" * 40)
print("Transactions missing values:")
print(transactions_df.isnull().sum())
print("==" *40)

# checking for duplicate rows in each dataframe

print("Customers duplicate rows:", customers_df.duplicated().sum())
print("Products duplicate rows:", products_df.duplicated().sum())
print("Transactions duplicate rows:", transactions_df.duplicated().sum())
print("==" *40)

# identifying unusual values or patterns

# checking that age is between 0 and 100 in customers dataframe and registration_date is not in the future
# as ages column is of object type, we need to check if every entry is numeric and between 0 and 100
invalid_ages = customers_df[(customers_df['age'] < '0') | (customers_df['age'] > '100')]
print("Invalid ages in customers dataframe:")
print(invalid_ages)



Customers missing values:
customer_id           0
name                  0
email                20
registration_date     0
country               0
age                   0
dtype: int64
________________________________________________________________________________
Products missing values:
product_id      0
product_name    0
category        0
price           3
stock           0
dtype: int64
________________________________________________________________________________
Transactions missing values:
transaction_id       0
customer_id          0
product_id           0
quantity            16
transaction_date     0
payment_method       0
dtype: int64
Customers duplicate rows: 4
Products duplicate rows: 0
Transactions duplicate rows: 6
Invalid ages in customers dataframe:
    customer_id            name                           email  \
0          C001     Logan Brown                             NaN   
1          C002  John Rodriguez         emma.johnson1@email.com   
2          C003       A