In [1]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Generate sample sales data
dates = pd.date_range('2023-01-01', periods=100, freq='D')
products = ['Laptop', 'Phone', 'Tablet', 'Headphones', 'Monitor']
categories = ['Electronics', 'Electronics', 'Electronics', 'Accessories', 'Electronics']
sales_data = {
    'Date': np.random.choice(dates, 100),
    'Product': np.random.choice(products, 100),
    'Category': [categories[products.index(p)] for p in np.random.choice(products, 100)],
    'Sales': np.random.randint(100, 1000, 100),
    'Quantity': np.random.randint(1, 10, 100),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], 100)
}
df_sales = pd.DataFrame(sales_data)

print("Sales Dataset:")
print(df_sales.head())
print("\nData types:")
print(df_sales.dtypes)

Sales Dataset:
        Date     Product     Category  Sales  Quantity Region
0 2023-02-21       Phone  Electronics    982         3  North
1 2023-04-03      Laptop  Electronics    569         7   East
2 2023-01-15  Headphones  Electronics    474         2   West
3 2023-03-13  Headphones  Electronics    121         2  South
4 2023-03-02  Headphones  Electronics    849         7  North

Data types:
Date        datetime64[ns]
Product             object
Category            object
Sales                int64
Quantity             int64
Region              object
dtype: object


## Exercise 1: Creating Series and DataFrames
Create a Pandas Series from the 'Sales' column. Then create a new DataFrame with only 'Product' and 'Sales' columns.

```python
# Your code here
# sales_series = 
# df_subset = 
```

**Hint:** Use `pd.Series()` and `df[['col1', 'col2']]`.

In [None]:
# Your code here
# sales_series = 
# df_subset = 

## Exercise 2: Reading and Writing Data
Save the DataFrame to a CSV file named 'sales_data.csv'. Then read it back into a new DataFrame.

```python
# Your code here
# df_sales.to_csv('sales_data.csv', index=False)
# df_loaded = 
```

**Hint:** Use `to_csv()` and `pd.read_csv()`.

In [None]:
# Your code here
# df_sales.to_csv('sales_data.csv', index=False)
# df_loaded = 

## Exercise 3: Data Inspection
Inspect the DataFrame: print the shape, info, describe statistics, and first/last 5 rows.

```python
# Your code here
```

**Hint:** Use `shape`, `info()`, `describe()`, `head()`, `tail()`.

In [None]:
# Your code here

## Exercise 4: Indexing and Selection
- Select the 'Sales' column.
- Select rows where Region is 'North'.
- Use .loc to select the first 10 rows and 'Product' and 'Sales' columns.

```python
# Your code here
```

**Hint:** Use `df['col']`, boolean indexing, and `.loc[row_indexer, col_indexer]`.

In [None]:
# Your code here

## Exercise 5: Data Cleaning
Introduce some NaN values in the 'Sales' column. Then fill them with the mean and drop rows with NaN in 'Quantity'.

```python
# Your code here
# df_sales.loc[0:4, 'Sales'] = np.nan
# df_filled = 
# df_dropped = 
```

**Hint:** Use `fillna()` and `dropna()`.

In [None]:
# Your code here
# df_sales.loc[0:4, 'Sales'] = np.nan
# df_filled = 
# df_dropped = 

## Exercise 6: Data Operations
Add a 'Total' column (Sales * Quantity). Compute value counts for 'Product' and sum of 'Sales'.

```python
# Your code here
# df_sales['Total'] = 
# product_counts = 
# total_sales = 
```

**Hint:** Use vectorized operations and `value_counts()`, `sum()`.

In [None]:
# Your code here
# df_sales['Total'] = 
# product_counts = 
# total_sales = 

## Exercise 7: Grouping and Aggregation
Group by 'Category' and compute mean 'Sales'. Group by 'Region' and sum 'Quantity'.

```python
# Your code here
# category_mean_sales = 
# region_total_quantity = 
```

**Hint:** Use `groupby()` with `mean()` and `sum()`.

In [None]:
# Your code here
# category_mean_sales = 
# region_total_quantity = 

## Exercise 8: Merging and Joining
Create a small DataFrame with product prices. Merge it with df_sales on 'Product'.

```python
# Your code here
# prices_df = pd.DataFrame({'Product': ['Laptop', 'Phone'], 'Price': [1000, 500]})
# df_merged = 
```

**Hint:** Use `pd.merge()` with `on='Product'`.

In [None]:
# Your code here
# prices_df = pd.DataFrame({'Product': ['Laptop', 'Phone'], 'Price': [1000, 500]})
# df_merged = 

## Exercise 9: Time Series
Set 'Date' as index and resample monthly to sum 'Sales'. Find the date with max sales.

```python
# Your code here
# df_ts = df_sales.set_index('Date')
# monthly_sales = 
# max_date = 
```

**Hint:** Use `set_index()`, `resample('M').sum()`, and `idxmax()`.

In [None]:
# Your code here
# df_ts = df_sales.set_index('Date')
# monthly_sales = 
# max_date = 

## Exercise 10: Visualization
Plot a bar chart of total sales by category and a line plot of sales over time.

```python
# Your code here
# df_sales.groupby('Category')['Sales'].sum().plot(kind='bar')
# df_ts['Sales'].plot()
```

**Hint:** Use `plot(kind='bar')` and `plot()` on Series.

In [None]:
# Your code here
# df_sales.groupby('Category')['Sales'].sum().plot(kind='bar')
# df_ts['Sales'].plot()

## Exercise 11: Performance Tips
Compare vectorized sum vs. loop for summing 'Sales'. Use a larger DataFrame if needed.

```python
# Your code here
```

**Hint:** Time with `time.time()` and use vectorized operations.

In [None]:
# Your code here

## Cleanup
Remove any temporary files created.

```python
import os

for f in ["sales_data.csv"]:
    if os.path.exists(f):
        os.remove(f)
        print(f"Removed: {f}")

print("\nCleanup complete!")
```

In [None]:
import os

for f in ["sales_data.csv"]:
    if os.path.exists(f):
        os.remove(f)
        print(f"Removed: {f}")

print("\nCleanup complete!")