# 04 — File Handling + CSV (1DownLabs)

This notebook covers:
- file paths
- reading/writing text files
- reading/writing CSVs with pandas
- basic data validation
- generating a simple KPI report

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

project_root = Path("..").resolve()
sys.path.append(str(project_root))

## Paths

We'll use `pathlib` so paths are built the same way on every operating system instead of being assembled from hand-written strings.

In [2]:
# Input and output folders both live under <project_root>/data.
raw_dir, processed_dir = (
    project_root / "data" / leaf for leaf in ("raw", "processed")
)

raw_dir, processed_dir

(PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/raw'),
 PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/processed'))

In [3]:
# Create both data folders (and any missing parents); a no-op when they already exist.
for data_dir in (raw_dir, processed_dir):
    data_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Round-trip a small text file through pathlib.
# The encoding is pinned to UTF-8 because the text contains a non-ASCII
# em dash and the platform default encoding is not the same everywhere.
notes_path = processed_dir / "notes.txt"

notes_path.write_text(
    "1DownLabs — Notebook 04: file handling and CSV.\n",
    encoding="utf-8",
)
print(notes_path.read_text(encoding="utf-8"))


1DownLabs — Notebook 04: file handling and CSV.



In [5]:
# Six sample records: one per (date, region) pair, each with sales and order counts.
sales_data = [
    dict(zip(("date", "region", "sales", "orders"), record))
    for record in (
        ("2026-01-01", "East", 1200, 45),
        ("2026-01-01", "West", 900, 35),
        ("2026-01-02", "East", 1400, 52),
        ("2026-01-02", "West", 800, 30),
        ("2026-01-03", "East", 1100, 40),
        ("2026-01-03", "West", 950, 38),
    )
]

# One DataFrame row per record; columns follow the key order of the dicts.
df = pd.DataFrame(sales_data)
df

Unnamed: 0,date,region,sales,orders
0,2026-01-01,East,1200,45
1,2026-01-01,West,900,35
2,2026-01-02,East,1400,52
3,2026-01-02,West,800,30
4,2026-01-03,East,1100,40
5,2026-01-03,West,950,38


In [6]:
# Persist the sample frame as CSV; index=False keeps the row index out of the file.
raw_csv_path = raw_dir.joinpath("sales_sample.csv")
df.to_csv(path_or_buf=raw_csv_path, index=False)
raw_csv_path

PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/raw/sales_sample.csv')

In [8]:
# Load the CSV back, asking pandas to parse the "date" column as datetimes,
# then show the resulting column dtypes.
df2 = pd.read_csv(filepath_or_buffer=raw_csv_path, parse_dates=["date"])
df2.dtypes

date      datetime64[us]
region               str
sales              int64
orders             int64
dtype: object

In [9]:
# Missing values: number of NaN cells per column.
print(df2.isnull().sum())

# Duplicates: number of rows that exactly repeat an earlier row.
print(f"Duplicates: {df2.duplicated().sum()}")

# Schema: every expected column must be present (extra columns are allowed).
expected_cols = {"date", "region", "sales", "orders"}
print("All expected cols present:", expected_cols <= set(df2.columns))

date      0
region    0
sales     0
orders    0
dtype: int64
Duplicates: 0
All expected cols present: True


In [15]:
# Derive average order value (sales divided by orders), then order the rows
# by date and region and renumber the index from zero.
df2 = (
    df2.assign(aov=df2["sales"].div(df2["orders"]))
       .sort_values(by=["date", "region"])
       .reset_index(drop=True)
)

df2

Unnamed: 0,index,date,region,sales,orders,aov
0,0,2026-01-01,East,1200,45,26.666667
1,1,2026-01-01,West,900,35,25.714286
2,2,2026-01-02,East,1400,52,26.923077
3,3,2026-01-02,West,800,30,26.666667
4,4,2026-01-03,East,1100,40,27.5
5,5,2026-01-03,West,950,38,25.0


In [16]:
# Per-region KPIs: summed sales, summed orders, and the mean of the row-level
# AOV values. as_index=False keeps "region" as an ordinary column.
summary = df2.groupby("region", as_index=False).agg(
    total_sales=pd.NamedAgg(column="sales", aggfunc="sum"),
    total_orders=pd.NamedAgg(column="orders", aggfunc="sum"),
    avg_aov=pd.NamedAgg(column="aov", aggfunc="mean"),
)

summary

Unnamed: 0,region,total_sales,total_orders,avg_aov
0,East,3700,137,27.029915
1,West,2650,103,25.793651


In [17]:
# Save the per-region summary next to the other processed outputs (no index column).
processed_csv_path = processed_dir.joinpath("sales_summary_by_region.csv")
summary.to_csv(path_or_buf=processed_csv_path, index=False)
processed_csv_path

PosixPath('/Users/arpitshukla/1DownLabs/data-science/python-foundations/data/processed/sales_summary_by_region.csv')

In [18]:
# Render the per-region summary as a plain-text KPI report, write it to disk,
# and echo the file back to confirm what was written.
report_path = processed_dir / "kpi_report.txt"

# Header: title line followed by a rule of 22 "=" characters.
lines = [
    "1DownLabs KPI Report\n",
    "=" * 22 + "\n",
]

# itertuples keeps each column's own dtype (iterrows would upcast every row
# to a single object-dtype Series) and is the cheaper way to walk rows.
for row in summary.itertuples(index=False):
    lines.extend(
        [
            f"Region: {row.region}\n",
            f"  Total Sales : {row.total_sales:.0f}\n",
            f"  Total Orders: {row.total_orders:.0f}\n",
            f"  Avg AOV     : {row.avg_aov:.2f}\n",
            "\n",  # blank line between regions
        ]
    )

# Explicit UTF-8 so the file reads back identically regardless of the
# platform's default text encoding.
report_path.write_text("".join(lines), encoding="utf-8")
print(report_path.read_text(encoding="utf-8"))


1DownLabs KPI Report
Region: East
  Total Sales : 3700
  Total Orders: 137
  Avg AOV     : 27.03

Region: West
  Total Sales : 2650
  Total Orders: 103
  Avg AOV     : 25.79




Downloading to /Users/arpitshukla/.cache/kagglehub/datasets/gregorut/videogamesales/2.archive...


100%|██████████| 381k/381k [00:00<00:00, 8.21MB/s]

Extracting files...
Path to dataset files: /Users/arpitshukla/.cache/kagglehub/datasets/gregorut/videogamesales/versions/2





In [24]:
# Read the video game sales CSV into a DataFrame and preview the first 10 rows.
# The path is built from raw_dir rather than a machine-specific absolute path,
# so the notebook runs from any checkout of the project.
# NOTE(review): the download log above suggests the file comes from the Kaggle
# dataset "gregorut/videogamesales" and was copied into data/raw — confirm and
# document that step, since no cell in this notebook performs it.
# No date parsing is requested here: "Year" is read as a plain numeric column.

df = pd.read_csv(raw_dir / "vgsales.csv")
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [25]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(16598, 11)

In [35]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  str    
 2   Platform      16598 non-null  str    
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  str    
 5   Publisher     16540 non-null  str    
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), str(4)
memory usage: 1.4 MB


In [65]:
# Q2. Missing data checks
#
# For each column:
#   - print number of missing values
#   - print percentage of missing values
#
# This cell produces the counts: isnull() flags each NaN cell and sum() totals
# the flags column by column.
missing_value_per_column = df.isnull().sum()
print("Missing value per column", missing_value_per_column)

Missing value per column Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64


In [66]:
# Grand total of missing cells: per-column NaN counts, summed across all columns.
total_missing_value_in_dataset = df.isnull().sum().sum()
print("total missing values in the dataset:", total_missing_value_in_dataset)

total missing values in the dataset: 329


In [71]:
# The mean of a boolean NaN mask is the fraction of missing cells per column;
# multiplying by 100 turns it into a percentage.
missing_value_percentage_per_column = df.isna().mean().mul(100.0)
print("Missing value percentage per column:", missing_value_percentage_per_column)

Missing value percentage per column: Rank            0.000000
Name            0.000000
Platform        0.000000
Year            1.632727
Genre           0.000000
Publisher       0.349440
NA_Sales        0.000000
EU_Sales        0.000000
JP_Sales        0.000000
Other_Sales     0.000000
Global_Sales    0.000000
dtype: float64


In [72]:
# Count rows that are exact repeats of an earlier row.
duplicate_row_mask = df.duplicated()
print("Duplicated: ", duplicate_row_mask.sum())

Duplicated:  0


In [74]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding the name instead of using inplace=True keeps the step chainable
# and avoids mutating a frame that earlier cells have already displayed.
df = df.drop_duplicates()

Original Column Name: Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='str')
Columns converted to lower case Index(['rank', 'name', 'platform', 'year', 'genre', 'publisher', 'na_sales',
       'eu_sales', 'jp_sales', 'other_sales', 'global_sales'],
      dtype='str')


In [78]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [79]:
# Q4. Standardize column names
#
#   - convert all column names to lowercase
#   - replace spaces with underscores
#
# Both steps are applied; previously only the lowercase step was implemented.
# regex=False makes the space replacement a literal substring substitution.
standardized_columns = df.columns.str.lower().str.replace(" ", "_", regex=False)

print("Original Column Name:", df.columns)
print("Columns converted to lower case", standardized_columns)
df.columns = standardized_columns

Original Column Name: Index(['rank', 'name', 'platform', 'year', 'genre', 'publisher', 'na_sales',
       'eu_sales', 'jp_sales', 'other_sales', 'global_sales'],
      dtype='str')
Columns converted to lower case Index(['rank', 'name', 'platform', 'year', 'genre', 'publisher', 'na_sales',
       'eu_sales', 'jp_sales', 'other_sales', 'global_sales'],
      dtype='str')


In [96]:
# The dataset has a "year" column — some entries may be non-numeric or missing.
#
#   - retain only rows where year is a valid integer
#   - convert year column to int dtype
#
# This cell performs the row filter. Calling dropna(inplace=True) on
# df["year"] only modified a temporary Series and left df unchanged, so the
# filter is applied to the frame itself instead.

# Coerce anything non-numeric to NaN so it is rejected by the mask below.
year_numeric = pd.to_numeric(df["year"], errors="coerce")

# Keep rows whose year is present and has no fractional part.
has_whole_year = year_numeric.notna() & (year_numeric % 1 == 0)

# .copy() yields an independent frame so later column assignments do not
# write into a slice of the original.
df = df.loc[has_whole_year].copy()

In [111]:
# Keep only the rows that have a value in "year" (how="any" is the dropna default).
df = df.dropna(subset=["year"])

In [113]:
# Cast "year" from float to integer; the column keeps its position in the frame.
df = df.assign(year=df["year"].astype("int"))

In [134]:
# Q6. Top 5 publishers by total global sales
#
#   - group by publisher
#   - calculate sum of global_sales
#   - sort descending
#   - show top 5 publishers with total global sales
#
# The aggregation is named by string ("sum") rather than passed as the np.sum
# callable: pandas deprecated numpy callables in agg and recommends the string
# alias, which uses the built-in groupby sum.
top_5_publisher = (
    df.groupby("publisher")
      .agg({"global_sales": "sum"})
      .sort_values("global_sales", ascending=False)
      .head(5)
)
top_5_publisher

Unnamed: 0_level_0,global_sales
publisher,Unnamed: 1_level_1
Nintendo,1784.43
Electronic Arts,1093.39
Activision,721.41
Sony Computer Entertainment,607.28
Ubisoft,473.54


In [137]:
# Q7. Region sales total per year
#
# For each region (na_sales, eu_sales, jp_sales, other_sales) compute the
# total sales per year. The result is a DataFrame with year as the index and
# one total column per region.
#
# Selecting the four columns and calling .sum() replaces four np.sum callables
# in agg, which pandas has deprecated in favour of the built-in groupby sum.
# NOTE(review): the variable name is a typo for "yearly"; it is left unchanged
# in case other cells refer to it.
regional_sales_columns = ["na_sales", "eu_sales", "jp_sales", "other_sales"]
yealy_total_regional = df.groupby("year")[regional_sales_columns].sum()

yealy_total_regional

Unnamed: 0_level_0,na_sales,eu_sales,jp_sales,other_sales
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980,10.59,0.67,0.0,0.12
1981,33.4,1.96,0.0,0.32
1982,26.92,1.65,0.0,0.31
1983,7.76,0.8,8.1,0.14
1984,33.28,2.1,14.27,0.7
1985,33.73,4.74,14.56,0.92
1986,12.5,2.84,19.81,1.93
1987,8.46,1.41,11.63,0.2
1988,23.87,6.59,15.76,0.99
1989,45.15,8.44,18.36,1.5


In [149]:
# For each year, find the genre with the highest total global_sales.
#
# Requested output shape:
#   | year | top_genre | global_sales |
#
# NOTE(review): the frame built here is indexed by (year, genre) and ordered
# from the latest year backwards, not the flat three-column table described
# above — confirm which shape is wanted.
result = (
    df.groupby(["year", "genre"])[["global_sales"]]
      .sum()
      # Latest year first; within a year, best-selling genre first.
      .sort_values(by=["year", "global_sales"], ascending=False)
      # The first row of each year is therefore that year's top genre.
      .groupby(level="year")
      .head(1)
)

result

Unnamed: 0_level_0,Unnamed: 1_level_0,global_sales
year,genre,Unnamed: 2_level_1
2020,Simulation,0.29
2017,Role-Playing,0.04
2016,Action,19.91
2015,Action,70.7
2014,Action,99.02
2013,Action,125.22
2012,Action,122.04
2011,Action,118.96
2010,Action,117.64
2009,Action,139.36


In [165]:
# Top 10 games (titles) with global_sales > 2 million units.
# Sales are first summed per title, so a game that appears in several rows
# is counted once with its combined total.

result_2 = (
    df.groupby("name")[["global_sales"]]
      .sum()
      .sort_values(by="global_sales", ascending=False)
)
result2 = result_2.loc[result_2["global_sales"].gt(2)].head(10)
result2

Unnamed: 0_level_0,global_sales
name,Unnamed: 1_level_1
Wii Sports,82.74
Grand Theft Auto V,55.92
Super Mario Bros.,45.31
Tetris,35.84
Mario Kart Wii,35.82
Wii Sports Resort,33.0
Pokemon Red/Pokemon Blue,31.37
Call of Duty: Modern Warfare 3,30.83
New Super Mario Bros.,30.01
Call of Duty: Black Ops II,29.72


In [169]:
# Q13. Sales share by publisher
#
#   - compute % share of each publisher's global sales relative to total global sales
#   - show top 10 publishers by percentage

# Denominator: global sales summed over every row of the frame.
total_global_sales = df["global_sales"].sum()

# Ten largest publishers by summed global sales, with each one's share of the
# overall total expressed as a percentage.
publisher_sales = (
    df.groupby("publisher")[["global_sales"]]
      .sum()
      .sort_values(by="global_sales", ascending=False)
      .head(10)
      .assign(Percent_share=lambda top: (top["global_sales"] / total_global_sales) * 100.0)
)
publisher_sales

Unnamed: 0_level_0,global_sales,Percent_share
publisher,Unnamed: 1_level_1,Unnamed: 2_level_1
Nintendo,1784.43,20.230807
Electronic Arts,1093.39,12.396206
Activision,721.41,8.178918
Sony Computer Entertainment,607.28,6.88498
Ubisoft,473.54,5.368715
Take-Two Interactive,399.3,4.527026
THQ,340.44,3.859706
Konami Digital Entertainment,278.56,3.158148
Sega,270.7,3.069036
Namco Bandai Games,253.65,2.875733
