# 04 — File Handling + CSV (1DownLabs)

This notebook covers:
- file paths
- reading/writing text files
- reading/writing CSVs with pandas
- basic data validation
- generating a simple KPI report

In [1]:
import sys
from pathlib import Path

project_root = Path("..").resolve()
sys.path.append(str(project_root))

import pandas as pd

## Paths

We'll use `pathlib` so paths work reliably.

In [2]:
from pathlib import Path

raw_dir = project_root / "data" / "raw"
processed_dir = project_root / "data" / "processed"

raw_dir, processed_dir

(PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/raw'),
 PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/processed'))

In [3]:
raw_dir.mkdir(parents=True, exist_ok=True)
processed_dir.mkdir(parents=True, exist_ok=True)

In [4]:
notes_path = processed_dir / "notes.txt"

notes_path.write_text("1DownLabs — Notebook 04: file handling and CSV.\n")
print(notes_path.read_text())


1DownLabs — Notebook 04: file handling and CSV.



In [5]:
sales_data = [
    {"date": "2026-01-01", "region": "East", "sales": 1200, "orders": 45},
    {"date": "2026-01-01", "region": "West", "sales": 900, "orders": 35},
    {"date": "2026-01-02", "region": "East", "sales": 1400, "orders": 52},
    {"date": "2026-01-02", "region": "West", "sales": 800, "orders": 30},
    {"date": "2026-01-03", "region": "East", "sales": 1100, "orders": 40},
    {"date": "2026-01-03", "region": "West", "sales": 950, "orders": 38},
]

df = pd.DataFrame(sales_data)
df

Unnamed: 0,date,region,sales,orders
0,2026-01-01,East,1200,45
1,2026-01-01,West,900,35
2,2026-01-02,East,1400,52
3,2026-01-02,West,800,30
4,2026-01-03,East,1100,40
5,2026-01-03,West,950,38


In [6]:
raw_csv_path = raw_dir / "sales_sample.csv"
df.to_csv(raw_csv_path, index=False)
raw_csv_path

PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/raw/sales_sample.csv')

In [8]:
df2 = pd.read_csv(raw_csv_path, parse_dates=["date"])
df2.dtypes

date      datetime64[us]
region               str
sales              int64
orders             int64
dtype: object

In [9]:
# Check for missing values
print(df2.isna().sum())

# Check for duplicates
print("Duplicates:", df2.duplicated().sum())

# Ensure expected columns exist
expected_cols = {"date", "region", "sales", "orders"}
print("All expected cols present:", expected_cols.issubset(set(df2.columns)))

date      0
region    0
sales     0
orders    0
dtype: int64
Duplicates: 0
All expected cols present: True


In [15]:
# Add average order value
df2["aov"] = df2["sales"] / df2["orders"]

# Sort by date
df2 = df2.sort_values(["date", "region"]).reset_index(drop=True)

df2

Unnamed: 0,index,date,region,sales,orders,aov
0,0,2026-01-01,East,1200,45,26.666667
1,1,2026-01-01,West,900,35,25.714286
2,2,2026-01-02,East,1400,52,26.923077
3,3,2026-01-02,West,800,30,26.666667
4,4,2026-01-03,East,1100,40,27.5
5,5,2026-01-03,West,950,38,25.0


In [16]:
summary = (
    df2.groupby("region")
      .agg(
          total_sales=("sales", "sum"),
          total_orders=("orders", "sum"),
          avg_aov=("aov", "mean")
      )
      .reset_index()
)

summary

Unnamed: 0,region,total_sales,total_orders,avg_aov
0,East,3700,137,27.029915
1,West,2650,103,25.793651


In [17]:
processed_csv_path = processed_dir / "sales_summary_by_region.csv"
summary.to_csv(processed_csv_path, index=False)
processed_csv_path

PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/processed/sales_summary_by_region.csv')

In [18]:
report_path = processed_dir / "kpi_report.txt"

lines = []
lines.append("1DownLabs KPI Report\n")
lines.append("=" * 22 + "\n")

for _, row in summary.iterrows():
    lines.append(f"Region: {row['region']}\n")
    lines.append(f"  Total Sales : {row['total_sales']:.0f}\n")
    lines.append(f"  Total Orders: {row['total_orders']:.0f}\n")
    lines.append(f"  Avg AOV     : {row['avg_aov']:.2f}\n")
    lines.append("\n")

report_path.write_text("".join(lines))
print(report_path.read_text())


1DownLabs KPI Report
Region: East
  Total Sales : 3700
  Total Orders: 137
  Avg AOV     : 27.03

Region: West
  Total Sales : 2650
  Total Orders: 103
  Avg AOV     : 25.79




Downloading to /Users/arpitshukla/.cache/kagglehub/datasets/gregorut/videogamesales/2.archive...


100%|██████████| 381k/381k [00:00<00:00, 8.21MB/s]

Extracting files...
Path to dataset files: /Users/arpitshukla/.cache/kagglehub/datasets/gregorut/videogamesales/versions/2





In [24]:
# Read the CSV into a DataFrame (parse dates if present):

df = pd.read_csv("/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/raw/vgsales.csv")
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [25]:
df.shape

(16598, 11)

In [35]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  str    
 2   Platform      16598 non-null  str    
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  str    
 5   Publisher     16540 non-null  str    
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), str(4)
memory usage: 1.4 MB


In [43]:
'''Q2. Missing data checks

For each column:

Print number of missing values

Print percentage of missing values'''


df.isna().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [60]:
print ("Duplicated: ",df.duplicated().sum())

Duplicated:  0


In [64]:
'''Q4. Standardize column names

Convert all column names to lowercase

Replace spaces with underscores'''



AttributeError: 'DataFrame' object has no attribute 'lower'