# A set of practice problems
## Note 1: These questions are for your practice and are not 100% representative of the topics we have covered in class.
## Note 2: If you want to practice merging dataframes, please refer to the practice problems I have posted under week 7.

# Consider the following dictionaries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pingouin as pg

# Raw inputs for the exercises. Each dictionary maps a column name to a list
# of five values, so it can be handed straight to pd.DataFrame.

# One record per order.
sales_data = dict(
    Order_ID=[101, 102, 103, 104, 105],
    Product=['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Tablet'],
    Category=['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
    Quantity=[1, 2, 1, 3, 1],
    Order_Date=['2023-01-15', '2023-02-20', '2023-03-10', '2023-04-18', '2023-05-05'],
    Price=[1000, 25, 45, 200, 500],
)

# One record per customer; None marks a deliberately missing value.
customer_data = dict(
    Customer_ID=[1, 2, 3, 4, 5],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Country=['US', 'Canada', 'US', 'UK', None],
    Age=[25, 34, None, 29, 22],
    Loyalty_Score=[4.5, None, 3.8, 4.0, 4.7],
)

# One record per shipment; only Order_IDs 101 and 102 also occur in sales_data.
shipment_data = dict(
    Order_ID=[101, 102, 106, 107, 108],
    Shipment_Date=['2023-01-17', '2023-02-22', '2023-06-15', '2023-07-20', '2023-08-25'],
    Shipping_Cost=[15.5, 12.0, 18.0, 22.5, 10.0],
    Carrier=['FedEx', 'UPS', 'DHL', 'FedEx', 'UPS'],
)


### Create a DataFrame `df_sales` from the `sales_data` dictionary. Display the first three rows.

In [None]:
# Turn the raw dictionary into a table: each key becomes a column.
df_sales = pd.DataFrame(data=sales_data)
# Preview only the first three rows.
df_sales.head(n=3)

### Display the summary information of the `df_sales` DataFrame.

In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and
# memory usage. info() prints its report and returns None.
df_sales.info()

### Use the `describe` method to display statistical summary information for the numerical columns in `df_sales`.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments only the numeric columns are summarised.
df_sales.describe()

### Convert the `Order_Date` column in `df_sales` to a pandas datetime format. Verify the updated data type.

In [None]:
# Parse the date strings into real datetime values, then store them back
# in the same column.
parsed_order_dates = pd.to_datetime(df_sales['Order_Date'])
df_sales['Order_Date'] = parsed_order_dates
# Verification: Order_Date should now be listed with a datetime64 dtype.
df_sales.dtypes

### Rename the column `Price` in `df_sales` to `Unit_Price`.

In [None]:
# Rename in place; the cells below refer to the column as Unit_Price.
df_sales.rename({'Price': 'Unit_Price'}, axis='columns', inplace=True)
df_sales

### Transpose the `df_sales` DataFrame and display the result.

In [None]:
# Swap rows and columns; this returns a new frame and leaves df_sales as is.
df_sales.transpose()

### Find the number of unique values in the `Category` column of `df_sales`.

In [None]:
# Count how many distinct categories appear in the column.
category_column = df_sales['Category']
category_column.nunique()

### Count the frequency of each product in the `Product` column of `df_sales`.

In [None]:
# Tally how many times each product name occurs.
product_column = df_sales['Product']
product_column.value_counts()

### Group the `df_sales` DataFrame by `Category` and calculate the total `Quantity` for each category.

In [None]:
# Split the rows by category, then add up the quantities inside each group.
df_sales.groupby('Category')['Quantity'].agg('sum')

### Create a pivot table to calculate the average `Price` (renamed to `Unit_Price` above) for each `Category` and `Product`.

In [None]:
# Categories become rows, products become columns, and each cell holds the
# mean unit price. Category/product pairs that never occur show up as NaN.
pd.pivot_table(
    df_sales,
    values='Unit_Price',
    index='Category',
    columns='Product',
    aggfunc='mean',
)

### Filter rows in `df_sales` where the `Quantity` is greater than 1.

In [None]:
# Boolean mask: True for the orders with more than one unit.
more_than_one_unit = df_sales['Quantity'].gt(1)
df_sales.loc[more_than_one_unit]

### Create a bar plot showing total `Price` for each `Category`.

In [None]:
# Total unit price per category, drawn as one bar per category.
price_by_category = df_sales.groupby('Category')['Unit_Price'].sum()
price_by_category.plot.bar()

### Use the `plot` function in pandas to create a line plot showing `Quantity` over `Order_Date`.

In [None]:
# Moving Order_Date into the index puts the dates on the x-axis.
quantity_by_date = df_sales.set_index('Order_Date')['Quantity']
quantity_by_date.plot.line()

### Use seaborn to create a scatter plot showing the relationship between `Price` and `Quantity` in `df_sales`.

In [None]:
# One point per order: unit price on the x-axis, quantity on the y-axis.
sns.scatterplot(data=df_sales, x='Unit_Price', y='Quantity')

### Create dummy variables for the `Category` column in `df_sales`.

In [None]:
# Replace Category with one indicator column per category value. The result
# is only displayed here; df_sales itself is left unchanged.
pd.get_dummies(data=df_sales, columns=['Category'])

### Merge `df_sales` and `shipment_data` on the `Order_ID` column and display the resulting DataFrame.

In [None]:
df_shipment = pd.DataFrame(data=shipment_data)
# Left join: every sales order is kept. Orders without a shipment record
# (103, 104 and 105 in this data) get NaN in the shipment columns.
pd.merge(
    df_sales,
    df_shipment,
    how='left',
    on='Order_ID',
    validate='1:1',
)
# validate='1:1' makes pandas raise an error if an Order_ID is repeated in
# either table. That holds for this small dataset; in practice one order
# may have several shipments, and the relationship would be more complicated.


### Concatenate `df_sales` and `customer_data` along the rows axis and display the resulting DataFrame. Explore the resulting dataframe and explain whether this is a reasonable concatenation.

In [None]:
df_customer = pd.DataFrame(data=customer_data)
# Stack the two tables on top of each other (axis=0 is the rows axis).
# Is this reasonable? No: the tables have no column in common, so the sales
# rows are NaN in every customer column and the customer rows are NaN in
# every sales column. The row labels 0-4 also appear twice, because
# ignore_index is not set. The two tables describe different entities and
# have no shared key.
pd.concat([df_sales, df_customer], axis=0)

### Filter rows in `df_sales` using a query to select orders with `Price` greater than 100 and `Category` as 'Electronics'.

In [None]:
# Both conditions are written as one query expression; the inner double
# quotes mark "Electronics" as a string literal rather than a column name.
expensive_electronics = 'Unit_Price > 100 and Category == "Electronics"'
df_sales.query(expensive_electronics)

### Identify columns with missing values in the `customer_data` DataFrame.

In [None]:
# A first one-liner attempt, kept commented out as a tracing exercise:
#df_customer[df_customer.isna().sum().reset_index()[0].ge(1)]
# It is a bit difficult to understand at first. To see how it works, run
# the following pieces one at a time and look at the output of each step:
  #df_customer.isna().sum()
  #df_customer.isna().sum().reset_index()
  #df_customer.isna().sum().reset_index()[0]
  # df_customer.isna().sum().reset_index()[0].ge(1)
# Caveat: the last step gives one True/False per *column*, but indexing the
# frame with it filters *rows*. It only runs because this frame happens to
# have as many rows as columns, and it does not return column names.

# An easier (sort of) solution: count the missing values per column, then
# keep the labels of the columns whose count is at least one.
temp = df_customer.isna().sum()
temp.loc[temp >= 1].index.to_list()

### Fill missing values in the `Age` column of `customer_data` with the mean age.

In [None]:
# Replace the missing ages with the mean of the known ages (mean() skips
# NaN by default).
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on df_customer['Age']. That chained in-place
# call works on an intermediate Series: pandas 2.x warns about it, and with
# Copy-on-Write enabled it leaves df_customer unchanged.
mean_age = df_customer['Age'].mean()
df_customer['Age'] = df_customer['Age'].fillna(mean_age)
df_customer

### Drop rows in `customer_data` where the `Country` column has missing values.

In [None]:
# Remove, in place, every customer row whose Country is missing. `subset`
# limits the check to that column, so rows that are missing only other
# values are kept.
df_customer.dropna(subset=['Country'], inplace=True)
df_customer

### Sort the `df_sales` DataFrame by the `Price` column in ascending order.

In [None]:
# Cheapest order first. This returns a sorted copy; df_sales keeps its order.
df_sales.sort_values(by=['Unit_Price'], ascending=[True])

### Drop duplicate rows in `df_sales` based on the `Product` column.

In [None]:
# Keep the first row for each product and leave out any later repeats.
first_occurrence = ~df_sales.duplicated(subset=['Product'])
df_sales.loc[first_occurrence]

### Sort the `customer_data` DataFrame by the `Loyalty_Score` column in descending order.

In [None]:
# Highest loyalty score first; rows with a missing score are placed last.
df_customer.sort_values(by=['Loyalty_Score'], ascending=[False])

### Calculate the total `Price` in the `df_sales` DataFrame.

In [None]:
# Add up the unit prices of all orders. Quantity is not taken into account
# here, so this is not the total revenue.
df_sales['Unit_Price'].sum()

### Compute the average `Age` in `customer_data`.

In [None]:
# Average customer age; mean() ignores missing values by default.
# NOTE: the result reflects df_customer as it is when this cell runs, i.e.
# after any earlier cells that filled or dropped rows.
df_customer['Age'].mean()

### Calculate the standard deviation of the `Shipping_Cost` column in `shipment_data`.

In [None]:
# Standard deviation of the shipping costs, shown with two decimals.
shipping_cost_std = df_shipment['Shipping_Cost'].std()
round(shipping_cost_std, 2)

### Add a new column `Total_Cost` to `df_sales` as the product of `Price` and `Quantity`.

In [None]:
# Row-by-row product: what each order costs in total.
df_sales['Total_Cost'] = df_sales['Unit_Price'].mul(df_sales['Quantity'])
df_sales

### Use the `duplicated` method to check for duplicate rows in `shipment_data`.

In [None]:
# One boolean per row: True when the row repeats an earlier row in every
# column. No subset is given, so all columns are compared.
df_shipment.duplicated()

### Drop rows from `shipment_data` that are duplicates based on the `Order_ID` column.

In [None]:
# Drop, in place, rows whose Order_ID already appeared in an earlier row.
# The Order_IDs in shipment_data are all different, so nothing is removed
# for this dataset.
df_shipment.drop_duplicates(subset=['Order_ID'], inplace=True)
df_shipment

### Create a new DataFrame `df_orders` containing only the rows from `df_sales` with an `Order_Date` after '2023-03-01'.

In [None]:
# True for orders placed after 1 March 2023; the date string is compared
# against the Order_Date values.
placed_after_cutoff = df_sales['Order_Date'] > '2023-03-01'
df_orders = df_sales.loc[placed_after_cutoff]
df_orders

### Plot a histogram of the `Age` column in `customer_data` using matplotlib.

In [None]:
# pandas passes the drawing on to matplotlib.
df_customer['Age'].plot.hist()
# With only a handful of customers the histogram is necessarily very coarse.


### Create a box plot using seaborn to visualize the distribution of `Price` in `df_sales` grouped by `Category`.

In [None]:
# One box per category, showing the spread of unit prices within it.
sns.boxplot(data=df_sales, x='Category', y='Unit_Price')

### Use the `query` method to select rows from `df_sales` where the `Order_Date` is in January 2023.

In [None]:
# The .dt accessor exposes the month and year of each date inside the
# query expression; both must match to select January 2023.
placed_in_january_2023 = 'Order_Date.dt.month == 1 and Order_Date.dt.year == 2023'
df_sales.query(placed_in_january_2023)

### Calculate the sum of `Shipping_Cost` grouped by `Carrier` in `shipment_data`.

In [None]:
# Total shipping cost charged by each carrier.
df_shipment.groupby('Carrier')['Shipping_Cost'].agg('sum')

### Merge `df_sales` and `shipment_data` on the `Order_ID` column and calculate the total `Shipping_Cost` for each `Category`.

In [None]:
# Inner join: only orders that appear in both tables survive, so orders
# without a shipment record add nothing to the totals below.
merged = df_sales.merge(
    df_shipment,
    how='inner',
    on='Order_ID',
    validate='1:1',
)

# Add up the shipping costs of the matched orders within each category.
shipping_by_category = merged.groupby('Category')['Shipping_Cost']
shipping_by_category.sum()

### Perform a linear regression using `pingouin` to predict `Price` (now `Unit_Price`) based on `Quantity` in `df_sales`. Interpret the results.

In [None]:
import pingouin as pg
# Predictor first, response second: Unit_Price is regressed on Quantity.
pg.linear_regression(X=df_sales['Quantity'], y=df_sales['Unit_Price'])
# How to read the output table (check the numbers against what is printed):
# - the Quantity coefficient is the expected change in Unit_Price for one
#   extra unit ordered; for this data it should come out at roughly -199,
#   i.e. orders with more units tend to have cheaper products;
# - r2 should be roughly 0.19, so Quantity explains little of the variation
#   in Unit_Price;
# - with only five observations the p-value is large, so the slope is not
#   statistically distinguishable from zero.

### Use `query` to select rows in `df_sales` where `Order_Date` is between '2023-01-01' and '2023-04-01', and `Price` (now `Unit_Price`) is in the top 10% of all prices.

In [None]:
# Make sure Order_Date holds datetimes before comparing it with date strings.
df_sales['Order_Date'] = pd.to_datetime(df_sales['Order_Date'])

# Prices at or above the 90th percentile count as "top 10%".
top_10_percent_price = df_sales['Unit_Price'].quantile(q=0.9)
# Inside query, the @ prefix refers to the Python variable defined above.
window_and_price_filter = (
    "Order_Date >= '2023-01-01' and Order_Date <= '2023-04-01' "
    "and Unit_Price >= @top_10_percent_price"
)
result = df_sales.query(window_and_price_filter)

display(result)


### Identify outliers in the `Shipping_Cost` column of `shipment_data` using the IQR method.

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile or above the third quartile.
Q1 = df_shipment['Shipping_Cost'].quantile(0.25)
Q3 = df_shipment['Shipping_Cost'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
shipping_cost = df_shipment['Shipping_Cost']
outliers = df_shipment[(shipping_cost < lower_fence) | (shipping_cost > upper_fence)]
# Display the answer. For the sample data the fences are 3.0 and 27.0, so
# no shipping cost falls outside them and an empty frame is expected.
outliers

