In [83]:
import pandas as pd
import numpy as np

# Setup
np.random.seed(42)  # fixed seed for the legacy global RNG, so every draw below is reproducible

companies = ['AlphaCorp', 'BetaLtd', 'GammaInc', 'DeltaTech', 'EpsilonCo']
dates = pd.date_range(start='2023-01-01', periods=20, freq='B')  # 20 business days


def _draw_prices(n_rows):
    """Build a synthetic price table with ``n_rows`` rows from the global NumPy RNG.

    Columns are drawn one after another in the order Company, Date, Open,
    High, Low, Close, Volume. Each price column is sampled independently,
    so nothing forces High to lie above Low.
    """
    cols = {}
    cols['Company'] = np.random.choice(companies, n_rows)
    cols['Date'] = np.random.choice(dates, n_rows)
    # (lower, upper) bounds of the uniform draw for each price column.
    price_bounds = {
        'Open': (100, 500),
        'High': (105, 510),
        'Low': (95, 495),
        'Close': (100, 500),
    }
    for label, (lower, upper) in price_bounds.items():
        cols[label] = np.random.uniform(lower, upper, n_rows).round(2)
    cols['Volume'] = np.random.randint(10000, 100000, n_rows)
    return pd.DataFrame(cols)


df = _draw_prices(100)

In [84]:
# Inject missing values & duplicates
# Row labels are drawn from the global RNG: first for Close, then for Company.
close_gaps = np.random.choice(df.index, 8)
df.loc[close_gaps, 'Close'] = np.nan
company_gaps = np.random.choice(df.index, 5)
df.loc[company_gaps, 'Company'] = np.nan

# Add duplicates: re-sample 5 existing rows and append them under a fresh 0..n-1 index.
repeated_rows = df.sample(5, random_state=1)
df = pd.concat([df, repeated_rows], ignore_index=True)

# Display sample
df.head()

Unnamed: 0,Company,Date,Open,High,Low,Close,Volume
0,DeltaTech,2023-01-10,434.12,405.01,249.44,249.95,65393
1,EpsilonCo,2023-01-12,228.31,387.29,479.48,214.28,83553
2,GammaInc,2023-01-11,174.61,389.51,457.14,447.44,16941
3,EpsilonCo,2023-01-17,116.31,250.59,173.32,189.44,35934
4,EpsilonCo,2023-01-03,336.36,223.9,122.74,485.29,33386


# 1. Data Cleaning Exercises

## 1. Identify all missing values.

In [85]:
# Per-column count of missing entries in the working frame.
missing_values = df.isnull().sum(axis=0)
missing_values

Unnamed: 0,0
Company,5
Date,0
Open,0
High,0
Low,0
Close,8
Volume,0


## 2. Fill missing 'Close' values with forward fill grouped by Company.

In [86]:
# Forward-fill Close within each company in chronological order.
# The rows are not stored by date (Date was drawn at random), so filling in
# storage order could carry a price backwards in time. Sort a temporary view
# by Date first; the filled Series keeps the original row labels, so the
# assignment aligns back onto df without changing its row order.
# A gap on a company's earliest date has nothing before it and stays NaN.
df['Close'] = (
    df.sort_values('Date', kind='stable')
      .groupby('Company')['Close']
      .ffill()
)

## 3. Drop rows where 'Company' is missing.

In [87]:
# Keep only the rows that carry a Company label.
has_company = df['Company'].notna()
df = df.loc[has_company]

## 4. Detect and remove exact duplicate rows.

In [88]:
# Remove rows that repeat an earlier row in every column; the first occurrence stays.
df = df[~df.duplicated()]

## 5. Create a daily_range = High - Low column and check for negative or zero values.

In [89]:
# Spread between the High and Low quoted on each row.
df['daily_range'] = df['High'].sub(df['Low'])
# Rows whose spread is zero or negative.
non_positive_range = df['daily_range'].le(0)
invalid_range = df.loc[non_positive_range]

# 2. MultiIndex Exercises

## 1. Create a MultiIndex on ['Company', 'Date'].

In [90]:
# Key the frame by (Company, Date), then sort it so label slicing works.
df_mi = df.set_index(keys=['Company', 'Date'])
df_mi = df_mi.sort_index()

## 2. Slice all records for 'DeltaTech'.

In [91]:
# Cross-section of one company; the Company level is dropped, leaving Date as the index.
delta_df = df_mi.xs('DeltaTech', level='Company')

## 3. Slice data for 'BetaLtd' between 2023-01-05 and 2023-01-15.

In [92]:
# Label slice on the sorted (Company, Date) index between two endpoint keys.
beta_window = slice(('BetaLtd', '2023-01-05'), ('BetaLtd', '2023-01-15'))
beta_df = df_mi.loc[beta_window]

## 4. Use .xs() to extract data for a specific date across all companies.

In [93]:
# Cross-section on the Date level: every company's rows for one day.
target_day = '2023-01-10'
specific_date = df_mi.xs(key=target_day, level='Date')

## 5. Swap index levels and sort by Date first, then Company.

In [94]:
# Exchange the two index levels so Date leads, then sort by (Date, Company).
swapped = df_mi.swaplevel('Company', 'Date')
swapped = swapped.sort_index()

# 3. Index Manipulation Exercises

## 1. Set 'Date' as index, keeping it as a column too.

In [95]:
# Index the rows by their Date values while leaving the Date column in place.
# After this, the index and a column both carry the name 'Date'.
df = df.set_index(df['Date'])

## 2. Reset index completely after filtering outliers.

In [96]:
# Keep rows with Volume above 50000 and replace the index with a fresh 0..n-1 range.
heavy_volume = df['Volume'].gt(50000)
filtered = df.loc[heavy_volume].reset_index(drop=True)

## 3. Use .reindex() to align to full trading calendar for January.

In [97]:
# Business-day calendar spanning January 2023.
full_calendar = pd.date_range('2023-01-01', '2023-01-31', freq='B')

# reindex requires unique labels: drop repeated rows, then keep only the first
# row seen for each index value (other rows sharing that label are discarded).
df_re = (
    df.drop_duplicates()
      .pipe(lambda frame: frame[~frame.index.duplicated(keep='first')])
)

# Align onto the calendar; days without a row become all-NaN rows.
df_reindexed = df_re.reindex(full_calendar)

## 4. Sort your index and use .loc[] to slice by date.

In [100]:
# Order the rows by index label, then take an inclusive label slice.
df = df.sort_index(axis=0)
date_window = slice('2023-01-05', '2023-01-10')
jan_5_to_10 = df.loc[date_window]

## 5. Rebuild MultiIndex with ['Date', 'Company'] and forward-fill missing Close.

In [101]:
# Discard the current index (the Date column is still present) and rebuild a
# sorted (Date, Company) MultiIndex from the columns.
df_reset = df.reset_index(drop=True)
df_mi2 = df_reset.set_index(['Date', 'Company']).sort_index()

# Forward-fill Close separately for each company. With rows ordered by
# (Date, Company), a plain ffill() would copy the previous row's price even
# when that row belongs to a different company. Grouping on the Company
# level keeps each fill inside one company, in date order.
df_mi2['Close'] = df_mi2.groupby(level='Company')['Close'].ffill()

# 4. Aggregation with .agg()

## 1. For each company, compute mean Open, max High, min Low.

In [102]:
# One row per company: mean of Open, maximum of High, minimum of Low.
summary = df.groupby('Company').agg(
    Open=('Open', 'mean'),
    High=('High', 'max'),
    Low=('Low', 'min'),
)

## 2. Create a custom function that returns (High + Low) / 2 and apply it across the DataFrame.

In [103]:
def midpoint(row):
    """Return the average of the 'High' and 'Low' entries of ``row``.

    ``row`` is anything that supports lookup by those two keys, such as
    a DataFrame row.
    """
    high = row['High']
    low = row['Low']
    return (high + low) / 2
# midpoint only does key lookups and arithmetic, so it can be given the whole
# frame at once. The column-wise result equals the row-by-row
# df.apply(midpoint, axis=1) but avoids one Python call per row.
df['Mid'] = midpoint(df)

## 3. Group by Company and get Volume sum and Close std using .agg().

In [104]:
# One row per company: total Volume and the standard deviation of Close.
vol_close = df.groupby('Company').agg(
    Volume=('Volume', 'sum'),
    Close=('Close', 'std'),
)

## 4. Calculate average volume per company per day.

In [106]:
# 'Date' is both the index name and a column at this point, which makes
# grouping by 'Date' ambiguous. Drop the column so the name refers only to
# the index. errors='ignore' lets the cell be re-run after the column is
# already gone instead of raising KeyError.
df = df.drop(columns='Date', errors='ignore')

# Mean Volume for every (Company, Date) pair; 'Date' resolves to the index level.
avg_vol = df.groupby(['Company', 'Date'])['Volume'].mean()

## 5. Aggregate Company-wise trading ranges using a lambda: High - Low.

In [109]:
# Per company: largest High minus smallest High, reported as 'trading_range'.
# NOTE(review): this measures the spread of the High column only. The section
# heading asks for High - Low; confirm whether max(High) - min(Low) was intended
# (changing it would also change the recorded output below).
range_agg = df.groupby('Company').agg(
    trading_range=pd.NamedAgg(
        column='High',
        aggfunc=lambda highs: highs.max() - highs.min(),
    )
)
range_agg

Unnamed: 0_level_0,trading_range
Company,Unnamed: 1_level_1
AlphaCorp,370.55
BetaLtd,375.31
DeltaTech,376.59
EpsilonCo,336.19
GammaInc,368.44


# 5. GroupBy Exercises

## 1. Total Volume per Company.

In [111]:
# Sum of Volume for each company.
volume_by_company = df.groupby('Company')['Volume']
total_vol = volume_by_company.agg('sum')

## 2. Average Close price per trading day.

In [112]:
# Mean Close for each date ('Date' is resolved by name, as in the original call).
close_by_date = df.groupby('Date')['Close']
avg_close = close_by_date.agg('mean')

## 3. For each Company, show top trading day by Volume.

In [113]:
# One row per company: the row with that company's largest Volume.
# df is indexed by Date, and those labels repeat. idxmax() would return a Date
# label, and df.loc[label] would then return every row on that date across all
# companies. Selecting by order avoids label lookup: sort by Volume
# (descending), take the first row of each company, then order by Company.
top_day = (
    df.sort_values('Volume', ascending=False, kind='stable')
      .groupby('Company', sort=False)
      .head(1)
      .sort_values('Company', kind='stable')
)

## 4. Count trading days per company with Close over 300.

In [115]:
# Number of rows per company whose Close is above 300.
close_above_300 = df['Close'].gt(300)
count_high_close = df.loc[close_above_300].groupby('Company').size()

## 5. Group by Date, get company with lowest Close each day.

In [116]:
# One row per date: the row (and so the company) with the lowest Close that day.
# Grouping by Date and calling idxmin() returns the Date label itself, because
# Date is the index. df.loc on those repeated labels then returns every row
# instead of one per day. Selecting by order avoids that: drop rows with no
# Close, sort by Close (ascending), take the first row of each date, then
# restore date order. Dates whose Close values are all missing are omitted.
lowest_close = (
    df.dropna(subset=['Close'])
      .sort_values('Close', kind='stable')
      .groupby('Date', sort=False)
      .head(1)
      .sort_index()
)

# 6. Filtering with .filter() and Conditions

## 1. Companies with average Close > 400.

In [117]:
# Keep all rows of the companies whose mean Close is above 400.
company_groups = df.groupby('Company')
high_avg_close = company_groups.filter(lambda grp: 400 < grp['Close'].mean())

## 2. Keep only trading days where total volume > 300,000 across all companies.

In [118]:
# Keep all rows of the dates whose summed Volume is above 300000.
date_groups = df.groupby('Date')
volume_days = date_groups.filter(lambda day: 300000 < day['Volume'].sum())

## 3. Filter out companies with fewer than 3 trading records.

In [119]:
# Keep all rows of the companies that have at least 3 rows.
company_groups = df.groupby('Company')
active_companies = company_groups.filter(lambda grp: not len(grp) < 3)

## 4. Find days where Close exceeds Open by at least 10% (bullish days).

In [120]:
# Rows whose Close is at least 1.1 times the Open.
bullish_threshold = 1.1 * df['Open']
bullish = df.loc[df['Close'].ge(bullish_threshold)]

## 5. Use groupby().filter() to find companies where High exceeds 500 on any day.

In [121]:
# Keep all rows of the companies that have at least one High above 500.
company_groups = df.groupby('Company')
high_spike = company_groups.filter(lambda grp: grp['High'].gt(500).any())

# 7. Slicing Exercises

## 1. Using .loc[] slice rows for 'AlphaCorp' from '2023-01-03' to '2023-01-10'.

In [124]:
# Turn the Date index back into a column, then key by (Company, Date) and sort.
df_reset = df.reset_index()
df_mi = df_reset.set_index(['Company', 'Date'])
df_mi = df_mi.sort_index()

# Inclusive label slice between two (Company, Date) endpoint keys.
alpha_window = slice(('AlphaCorp', '2023-01-03'), ('AlphaCorp', '2023-01-10'))
alpha_slice = df_mi.loc[alpha_window]

## 2. Use .iloc[] to get the last 10 rows of the cleaned dataset.

In [125]:
# Positional slice: the final 10 rows (fewer if the frame is shorter).
tail_positions = slice(-10, None)
last_10 = df.iloc[tail_positions]

## 3. Extract all rows where Close is in top 10%.

In [126]:
# Rows whose Close is at or above the 0.9 quantile of Close.
close_p90 = df['Close'].quantile(0.9)
top_10_percent = df.loc[df['Close'].ge(close_p90)]

## 4. Slice Close prices only for 'GammaInc' using .xs() from MultiIndex.

In [127]:
# Close prices of one company, taken as a cross-section of the Company level;
# the result is indexed by Date.
gamma_close = df_mi['Close'].xs('GammaInc', level='Company')

## 5. Chain slicing + filtering to find days where range (High - Low) > 100 and Volume > 80,000.

In [128]:
# Rows that satisfy both conditions: High - Low above 100 and Volume above 80000.
wide_range = df['High'].sub(df['Low']).gt(100)
heavy_volume = df['Volume'].gt(80000)
big_days = df.loc[wide_range & heavy_volume]