# DATA 304 – Module 3, Session 1 Demo
Flat files, paths, CSV/Excel, compression, and large-file strategies.

## Raw strings

In [None]:
# NOTE: this cell is expected to fail. In a normal (non-raw) string literal,
# "\U" starts an 8-hex-digit Unicode escape, so "\Users" is a SyntaxError.
# The next two cells show the fixes: doubled backslashes and a raw string.
my_path = "C:\Users\data\file.cs"
print(my_path)

In [None]:
# Fix 1: double every backslash so each pair is stored as one literal "\".
my_path_fixed = "C:\\Users\\data\\file.cs"
print(my_path_fixed)

In [None]:
# Fix 2: the r prefix makes a raw string, so backslashes are kept as written.
my_path_raw = r"C:\Users\data\file.cs"
print(my_path_raw)

## Paths with `pathlib`
- Use relative paths for portability
- Avoid hard-coded OS-specific separators
- Prefer `Path` arithmetic

In [None]:
from pathlib import Path
# Relative anchors: ".." is the parent of the working directory and "." is
# the working directory itself. Nothing is checked against the disk here.
ROOT_DIR = Path("..")
# The "/" operator joins components with the correct separator for the OS.
# NOTE(review): CUR_DIR1 and CUR_DIR2 name the same folder only if the
# notebook is run from inside Module03 — confirm.
CUR_DIR1 = ROOT_DIR / "Module03"
CUR_DIR2 = Path(".")
DATA_DIR = CUR_DIR1 / "data"

In [None]:
# Show each path constant exactly as written (still relative, unresolved).
for _label, _location in (
    ("Root directory is:\t", ROOT_DIR),
    ("Current directory is:\t", CUR_DIR1),
    ("Current directory is:\t", CUR_DIR2),
    ("Data directory is:\t", DATA_DIR),
):
    print(_label, _location)

In [None]:
# Show the concrete class of each path constant, one per line.
print(
    *(type(location) for location in (ROOT_DIR, CUR_DIR1, CUR_DIR2, DATA_DIR)),
    sep="\n",
)

In [None]:
# resolve() turns each relative path into an absolute one, so the output
# shows where the constants actually point on this machine.
for _label, _location in (
    ("Root directory is:\t", ROOT_DIR),
    ("Current directory is:\t", CUR_DIR1),
    ("Current directory is:\t", CUR_DIR2),
    ("Data directory is:\t", DATA_DIR),
):
    print(_label, _location.resolve())

In [None]:
# Inspect files
# iterdir() yields entries in arbitrary order; sorting gives the same
# listing on every run and on every platform.
sorted(CUR_DIR1.iterdir())

## Reading a clean CSV

In [None]:
import pandas as pd
# A well-formed, comma-separated file: pandas' defaults are enough.
clean_path = DATA_DIR / "clean_sales.csv"
df_clean = pd.read_csv(clean_path)
# Preview the first rows to confirm the header and columns look right.
df_clean.head()

In [None]:
# Summary of columns, non-null counts and dtypes for the loaded frame.
df_clean.info()

## CSV with semicolon delimiter and European decimals
- Use `sep=';'`
- Replace comma decimals and coerce to numeric
- Map NA tokens

In [None]:
messy_path = DATA_DIR / "messy_semicolon.csv"
# First attempt with the default comma delimiter. The file is
# semicolon-delimited, so expect a mis-parsed result here; a later cell
# re-reads it with sep=';'.
df_messy = pd.read_csv(messy_path)
df_messy.head()

In [None]:
# Peek at the raw text with the shell to see the real delimiter.
# NOTE(review): this path is relative to the notebook's working directory,
# not built from DATA_DIR — confirm both point at the same folder.
! head data/messy_semicolon.csv

In [None]:
# Re-read with the correct delimiter and treat 'NA' and '--' as missing.
# ('NA' is already one of pandas' default missing markers; '--' is the
# token that has to be added.)
df_messy = pd.read_csv(messy_path, sep=';', na_values=['NA','--'])
df_messy.head()

In [None]:
# Check which dtype each column was parsed as before any conversion.
df_messy.dtypes

In [None]:
# Convert 'amount' from '19,99' style to float
# Swap the decimal comma for a point, then parse. errors='coerce' turns any
# token that is still not a number into NaN instead of aborting the cell with
# a ValueError. The final astype keeps the column float even when every value
# happens to be a whole number.
df_messy['amount'] = pd.to_numeric(
    df_messy['amount']
      .astype(str)
      .str.replace(',', '.', regex=False),
    errors='coerce',
).astype(float)
df_messy

In [None]:
# Re-check dtypes after the conversion of 'amount'.
df_messy.dtypes

## Quoting and multiline fields
- Use `quotechar` and let pandas handle embedded commas and newlines

In [None]:
# Build the path outside the try block: later cells reuse quoted_path, and
# only the read itself is expected to fail.
quoted_path = DATA_DIR / "multiline_quotes.csv"
try:
    # Default settings first, to see how the parser reacts to this file.
    df_quotes = pd.read_csv(quoted_path)
except Exception as e:
    # NOTE(review): kept broad, presumably so the demo prints whichever
    # error the default parser raises — narrow it once the type is known.
    print("Error:", e)

In [None]:
# Look at the raw lines to see what the file really contains.
# (Path is relative to the notebook's working directory.)
! head data/multiline_quotes.csv

In [None]:
# Skip the first physical line of the file before parsing; quoting is still
# left at pandas' default (double quote) in this step.
df_quotes = pd.read_csv(quoted_path, skiprows=1)
df_quotes

In [None]:
# Same read, but declare the single quote as the quoting character so that
# delimiters and line breaks inside '...' stay within one field.
df_quotes = pd.read_csv(quoted_path, skiprows=1, quotechar="'")
df_quotes

## Excel with multiple sheets and junk rows
- Identify sheets
- Skip metadata rows
- Fix headers if needed

In [None]:
xls_path = DATA_DIR / "report.xlsx"
# ExcelFile opens the workbook once; later cells reuse `xe` to parse sheets.
# NOTE(review): `xe` is never closed in the visible cells — consider calling
# xe.close() once the Excel section is finished.
xe = pd.ExcelFile(xls_path)
# List the sheet names so the following cells can address them.
xe.sheet_names

In [None]:
# Parse with defaults first to see the sheet before any rows are skipped.
df_summary = xe.parse("Summary")
df_summary

In [None]:
# Skip the top 3 rows so the header is taken from the first row after them.
df_summary = xe.parse("Summary", skiprows=3)
df_summary

In [None]:
# Read 'Summary' sheet, skipping top 3 junk rows
# One-step alternative: read_excel opens the file from its path instead of
# going through the already-open ExcelFile handle.
df_summary = pd.read_excel(xls_path, sheet_name="Summary", skiprows=3)
df_summary

In [None]:
# An integer selects a sheet by zero-based position, so 1 is the second sheet.
df_raw = xe.parse(1)
df_raw

## Reading compressed CSV (gzip)
- Read directly from `.csv.gz` without extracting

In [None]:
# head reads the file as-is; for a .gz file that means compressed bytes,
# not readable CSV text. The next cell decompresses it first.
! head data/events.csv.gz

In [None]:
! gzcat data/events.csv.gz 

In [None]:
gz_path = DATA_DIR / "events.csv.gz"
# compression='gzip' is explicit here; pandas' default ('infer') would also
# select gzip from the ".gz" suffix.
df_gz = pd.read_csv(gz_path, compression='gzip')
df_gz.head()

## Large file strategy with `chunksize`
- Stream rows in chunks
- Filter early to reduce memory
- Aggregate incrementally

In [None]:
# Count the lines to get a feel for the file size before loading it.
# (Path is relative to the notebook's working directory.)
! wc -l data/large_synthetic.csv

In [None]:
large_path = DATA_DIR / "large_synthetic.csv"
# With chunksize set, read_csv returns a lazy reader instead of a DataFrame;
# each iteration yields a DataFrame of up to 30000 rows.
iter_chunks = pd.read_csv(large_path, chunksize=30000)
type(iter_chunks)

In [None]:
# Walk the reader once and show the shape of every chunk.
# This consumes iter_chunks: the reader can only be iterated a single time.
for chunk in iter_chunks:
    df = chunk
    print(df.shape)

In [None]:
# NOTE: when the cells run in order, iter_chunks was already exhausted by
# the previous loop, so this body never executes and nothing is printed.
# The next cell recreates the reader before filtering.
for chunk in iter_chunks:
    df = chunk[chunk["flag"] == "A"]
    print(df.shape)

In [None]:
# Start from a fresh reader, then keep only the flag "A" rows of each chunk
# and report how many survived the filter.
iter_chunks = pd.read_csv(large_path, chunksize=30000)
for chunk in iter_chunks:
    df = chunk.loc[chunk["flag"].eq("A")]
    print(df.shape)

In [None]:
# Stream the file, keep only the flag "A" rows of each chunk, and combine
# the matches into one DataFrame.
iter_chunks = pd.read_csv(large_path, chunksize=30000)
filtered_parts = []
rows_so_far = 0
for chunk in iter_chunks:
    matches = chunk[chunk["flag"] == "A"]
    filtered_parts.append(matches)
    rows_so_far += len(matches)
    # Same running (rows, columns) progress line as printing the shape of a
    # frame that grows on every iteration.
    print((rows_so_far, matches.shape[1]))

# A single concat at the end copies each row once. Concatenating onto the
# growing frame inside the loop would recopy every earlier row per chunk,
# and would also need an empty DataFrame() as the starting value.
df = pd.concat(filtered_parts) if filtered_parts else pd.DataFrame()

## Memory inspection and dtype optimization
- Identify heavy columns
- Downcast numeric types
- Convert repeated strings to `category`

In [None]:
# Load the whole file (no chunking) to measure its in-memory footprint.
df_sample = pd.read_csv(large_path)
# deep=True also counts the Python objects held by object-dtype columns,
# rather than only the pointers to them.
df_sample.memory_usage(deep=True)

In [None]:
# Optimize
# Work on a copy so df_sample stays available for the before/after comparison.
df_opt = df_sample.copy()
# category stores each distinct value once plus a small integer code per row.
df_opt['flag'] = df_opt['flag'].astype('category')
df_opt.memory_usage(deep=True)

In [None]:
# downcast='unsigned' picks the smallest unsigned integer dtype that fits.
# NOTE(review): assumes user_id holds no negative or missing values — confirm.
df_opt['user_id'] = pd.to_numeric(df_opt['user_id'], downcast='unsigned')
df_opt.memory_usage(deep=True)

In [None]:
# downcast='float' picks the smallest float dtype that fits (float32 at the
# smallest), trading some precision for half the memory of float64.
df_opt['value'] = pd.to_numeric(df_opt['value'], downcast='float')
df_opt.memory_usage(deep=True)

In [None]:
# Total footprint of both frames in MiB (bytes / 2**20), then the relative
# saving of the optimized frame as a percentage rounded to two decimals.
before = df_sample.memory_usage(deep=True).sum() / 2**20
after = df_opt.memory_usage(deep=True).sum() / 2**20
reduction = round((1 - after / before) * 100, 2)
print(
    f"Size before optimization:\t{before:.2f} MB",
    f"Size after optimization:\t{after:.2f} MB",
    f"Reduction in size: \t\t{reduction:.2f}%",
    sep="\n",
)