# DATA 304 – Module 3, Session 2 DEMO
Encodings, malformed CSV rows, debugging workflow, and Excel pitfalls.
_Data folder: `data/` (referenced in the code as `DATA_DIR`)_

In [1]:
from pathlib import Path
import pandas as pd
# Folder that holds every sample file used in this demo.
DATA_DIR = Path("data")
# Show what is available; entries come back in whatever order the filesystem yields them.
[entry for entry in DATA_DIR.iterdir()]

[PosixPath('data/large_synthetic.csv'),
 PosixPath('data/clean_sales.csv'),
 PosixPath('data/latin1_file.csv'),
 PosixPath('data/report.xlsx'),
 PosixPath('data/messy_semicolon.csv'),
 PosixPath('data/messy_excel.xlsx'),
 PosixPath('data/events.csv.gz'),
 PosixPath('data/multiline_quotes.csv'),
 PosixPath('data/utf8_file.csv'),
 PosixPath('data/broken_rows.csv')]

## 1. Character Encodings

In [2]:
# Deliberately decode a UTF-8 file as ASCII to show what a decoding failure looks like.
utf8_path = DATA_DIR / "utf8_file.csv"
try:
    df_utf8 = pd.read_csv(utf8_path, encoding="ascii")
except UnicodeDecodeError as e:
    # Catch only the decoding failure being demonstrated; anything else
    # (missing file, parser error, ...) should surface as a real traceback.
    print("Error:", e)
else:
    print("No errors!")

Error: 'ascii' codec can't decode byte 0xc3 in position 18: ordinal not in range(128)


In [3]:
# Re-read the same file, this time naming the encoding explicitly as UTF-8.
df_utf8 = pd.read_csv(
    filepath_or_buffer=utf8_path,
    encoding="utf-8",
)
df_utf8

Unnamed: 0,id,word,note
0,1,café,UTF-8 sample
1,2,jalapeño,contains accent
2,3,北京,Chinese chars


In [4]:
# No encoding is passed here, so pandas falls back to its default codec;
# the point of the cell is to display the resulting decode error.
latin1_path = DATA_DIR.joinpath("latin1_file.csv")
try:
    df_latin1 = pd.read_csv(latin1_path)
except UnicodeDecodeError as decode_err:
    print("Error:", decode_err)
else:
    print("No errors!")

Error: 'utf-8' codec can't decode byte 0xfc in position 19: invalid start byte


In [5]:
# Same file again, now decoded with the Latin-1 codec.
df_latin1 = pd.read_csv(
    filepath_or_buffer=latin1_path,
    encoding="latin1",
)
df_latin1

Unnamed: 0,id,city,comment
0,1,Zürich,naïve use
1,2,Málaga,piñata
2,3,São Paulo,façade


## 2. Malformed CSV Rows

In [6]:
# Try a plain read of a CSV whose rows do not all have the same number of fields.
broken_path = DATA_DIR / "broken_rows.csv"
try:
    df_broken = pd.read_csv(broken_path)
except pd.errors.ParserError as e:
    # Only the tokenizer failure is expected here; other exceptions
    # (e.g. a missing file) should not be disguised as a parsing problem.
    print("Error:", e)
else:
    print("No errors!")

Error: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4



In [7]:
# Peek at the raw text of the file. IPython expands {broken_path}, so the
# command keeps working if the data folder is changed.
!head {broken_path}

id,name,amount
1,Alice,10.5
2,Bob,9.25,EXTRA
3,"Charlie, Jr.",7.10
4,Donna
5,"Eve says ""hello""",12.00


In [8]:
# Tally how many comma-separated fields awk sees on each line (NF is awk's
# per-line field count, so the "Row count" label really reports fields per row).
# NOTE: awk splits on every comma, including commas inside quoted values, so a
# row such as `3,"Charlie, Jr.",7.10` is counted as having 4 fields.
! awk -F"," '{print "Row count is: " NF}' data/broken_rows.csv | sort | uniq -c

      1 Row count is: 2
      3 Row count is: 3
      2 Row count is: 4


In [9]:
# Read the raw lines inside a context manager so the file handle is closed
# as soon as the text has been loaded.
with open(broken_path) as fh:
    lines = fh.read().splitlines()

# (1-based line number, raw text, number of commas). This is a naive count:
# commas inside quoted fields are included.
[(line_no, text, text.count(",")) for line_no, text in enumerate(lines, start=1)]

[(1, 'id,name,amount', 2),
 (2, '1,Alice,10.5', 2),
 (3, '2,Bob,9.25,EXTRA', 3),
 (4, '3,"Charlie, Jr.",7.10', 3),
 (5, '4,Donna', 1),
 (6, '5,"Eve says ""hello""",12.00', 2)]

In [10]:
from collections import Counter

# Histogram of "commas per line" across the file. The context manager
# guarantees the handle is closed once counting is done.
with open(broken_path) as fh:
    comma_counts = Counter(line.count(",") for line in fh)
comma_counts

Counter({2: 3, 3: 2, 1: 1})

In [11]:
# Ask the parser to skip lines it cannot tokenize instead of raising.
# Skipped lines are not reported, so compare the result with the raw file.
read_options = {"on_bad_lines": "skip"}
df_fixed = pd.read_csv(broken_path, **read_options)
df_fixed

Unnamed: 0,id,name,amount
0,1,Alice,10.5
1,3,"Charlie, Jr.",7.1
2,4,Donna,
3,5,"Eve says ""hello""",12.0


## 3. Excel Pitfalls

In [12]:
# Open the workbook once and list its sheets before parsing any of them.
xls_path = DATA_DIR.joinpath("messy_excel.xlsx")
xe = pd.ExcelFile(xls_path)
xe.sheet_names

['Quarterly', 'Notes']

In [13]:
# First attempt: parse the sheet with default settings (first row taken as the header).
df_quarterly = xe.parse(sheet_name="Quarterly")
df_quarterly

Unnamed: 0,Sales,Unnamed: 1,Unnamed: 2,city,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Q1,Q2,Q3,MÃ¡laga,SÃ£o Paulo,ZÃ¼rich,naÃ¯ve
1,120.5,130,125.25,,,,
2,95,,110,,,,
3,88,92.5,,,,,
4,101,error,,,,,
5,,,,,,,
6,Report total,,,,,,


In [14]:
# Second attempt: use the sheet's second row (index 1) as the column header.
df_quarterly = xe.parse(sheet_name="Quarterly", header=1)
df_quarterly

Unnamed: 0,Q1,Q2,Q3,MÃ¡laga,SÃ£o Paulo,ZÃ¼rich,naÃ¯ve
0,120.5,130,125.25,,,,
1,95,,110.0,,,,
2,88,92.5,,,,,
3,101,error,,,,,
4,,,,,,,
5,Report total,,,,,,


In [15]:
# Remove rows in which every cell is missing; partially filled rows are kept.
df_quarterly = df_quarterly.dropna(axis="index", how="all")
df_quarterly

Unnamed: 0,Q1,Q2,Q3,MÃ¡laga,SÃ£o Paulo,ZÃ¼rich,naÃ¯ve
0,120.5,130,125.25,,,,
1,95,,110.0,,,,
2,88,92.5,,,,,
3,101,error,,,,,
5,Report total,,,,,,


In [16]:
def _is_footer_row(row):
    """Return True when any cell in the row reads "report total" (ignoring case and outer whitespace)."""
    normalized = row.str.strip().str.lower()
    return normalized.eq("report total").any()


# Compare on a string view of the frame so numeric and missing cells can be tested too,
# then keep every row that is not the footer.
mask_footer = df_quarterly.astype(str).apply(_is_footer_row, axis=1)
df_quarterly = df_quarterly.loc[~mask_footer].copy()
df_quarterly

Unnamed: 0,Q1,Q2,Q3,MÃ¡laga,SÃ£o Paulo,ZÃ¼rich,naÃ¯ve
0,120.5,130,125.25,,,,
1,95.0,,110.0,,,,
2,88.0,92.5,,,,,
3,101.0,error,,,,,


In [17]:
# Convert the first three columns to numbers in one pass; cells that cannot be
# parsed as numbers become NaN.
quarter_cols = df_quarterly.columns[:3]
df_quarterly[quarter_cols] = df_quarterly[quarter_cols].apply(pd.to_numeric, errors="coerce")

df_quarterly

Unnamed: 0,Q1,Q2,Q3,MÃ¡laga,SÃ£o Paulo,ZÃ¼rich,naÃ¯ve
0,120.5,130.0,125.25,,,,
1,95.0,,110.0,,,,
2,88.0,92.5,,,,,
3,101.0,,,,,,


In [18]:
# bad header names that should be a single column
bad_cols = list(df_quarterly.columns[3:])

# pair those names with rows by position; names beyond the last row are dropped
cities = bad_cols[:len(df_quarterly)]

# Attach as a proper column. The Series is indexed by only as many row labels
# as there are names, so if there are fewer names than rows the remaining rows
# are filled with NaN instead of raising a length-mismatch ValueError.
df_quarterly["city"] = pd.Series(
    cities, index=df_quarterly.index[:len(cities)], dtype="object"
)

# drop the bogus all-NaN columns
df_quarterly = df_quarterly.drop(columns=bad_cols)

df_quarterly
 

Unnamed: 0,Q1,Q2,Q3,city
0,120.5,130.0,125.25,MÃ¡laga
1,95.0,,110.0,SÃ£o Paulo
2,88.0,92.5,,ZÃ¼rich
3,101.0,,,naÃ¯ve


In [19]:
def _fix_mojibake(value):
    """Undo UTF-8 text that was mis-decoded as Latin-1 (e.g. "MÃ¡laga" -> "Málaga").

    Non-string values (such as NaN) are returned untouched. A string that does
    not survive a strict Latin-1 -> UTF-8 round trip is assumed to be correct
    already and is returned unchanged. With errors="ignore" such a string would
    silently lose characters ("Zürich" would become "Zrich").
    """
    if not isinstance(value, str):
        return value
    try:
        return value.encode("latin1").decode("utf-8")
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError.
        return value


df_quarterly["city"] = df_quarterly["city"].map(_fix_mojibake).str.strip()
df_quarterly

Unnamed: 0,Q1,Q2,Q3,city
0,120.5,130.0,125.25,Málaga
1,95.0,,110.0,São Paulo
2,88.0,92.5,,Zürich
3,101.0,,,naïve
