# Chapter 4 — File Handling (All Code Examples)



## 0) Environment checks and helper utilities

In [1]:
import os
from pathlib import Path
import pandas as pd
import pprint
from datetime import datetime

pp = pprint.PrettyPrinter(width=80)

BASE = Path(".").resolve()
print("Working directory:", BASE)

def ensure_parent(path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)

def write_text(path: Path, content: str, encoding="utf-8"):
    ensure_parent(path)
    path.write_text(content, encoding=encoding)
    return path

def file_exists(path: Path) -> bool:
    return path.exists() and path.is_file()

def info(path: Path):
    if path.exists():
        return {"exists": True, "size_bytes": path.stat().st_size}
    return {"exists": False, "size_bytes": None}


Working directory: /content


## 1) Reading text files (read full file)

In [2]:
# If the chapter file is not present, create a small demo file first.
demo_path = Path("employee_list.txt")

# Open a text file and read it fully
with open(demo_path, 'r', encoding='utf-8') as file:
    content = file.read()

print(content)


قائمة الموظفين الحاليين في الشركة:

1. أحمد محمد
2. فاطمة علي
3. خالد سعيد
4. مريم عبد الله
5. عمر يوسف
6. سارة محمود

تم تحديث القائمة في: 15-06-2025



## 2) Reading text files (read line-by-line with `readlines`) + cleaning with `strip`

In [3]:
# Create a demo names file if absent
names_path = Path("employee_names.txt")


# Read lines into a list
with open(names_path, 'r', encoding='utf-8') as file:
    employee_names = file.readlines()

# Remove newline characters and extra spaces
employee_names = [name.strip() for name in employee_names]

print("قائمة الموظفين:")
for name in employee_names:
    print(f"- {name}")


قائمة الموظفين:
- أحمد محمد
- فاطمة علي
- خالد سعيد
- مريم عبد الله
- عمر يوسف
- سارة محمود


## 3) Writing text files (simple report via `write`)

In [4]:
# Create a multi-line text report
report_content = """تقرير الموظفين اليومي
=====================
تاريخ التقرير: 2024-01-15
إجمالي عدد الموظفين: 365
الأقسام الرئيسية: المبيعات، الإنتاج، اللوجستيات
"""

# Save report to a text file
out_report = Path("daily_employee_report.txt")
with open(out_report, 'w', encoding='utf-8') as file:
    file.write(report_content)

print("تم حفظ التقرير بنجاح:", out_report.resolve())


تم حفظ التقرير بنجاح: /content/daily_employee_report.txt


## 4) Reading CSV manually (using `open`, `readline`, `split`) + pretty printing

In [5]:
# If the chapter CSV is not present, create a demo CSV file.
csv_path = Path("PracticeFile_EmployeeData_v2_Employee Data.csv")

import pprint
pp = pprint.PrettyPrinter(width=80)

with open(csv_path, 'r', encoding='utf-8') as file:
    headers = file.readline().strip().split(',')
    print("أعمدة البيانات:", headers)
    pp.pprint(headers)

    print("\nأول 5 موظفين:")
    for i in range(5):
        line = file.readline().strip()
        if line:
            employee_data = line.split(',')
            pp.pprint(f"الموظف {i+1}: {employee_data}")


أعمدة البيانات: ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13']
['Unnamed: 0',
 'Unnamed: 1',
 'Unnamed: 2',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Unnamed: 6',
 'Unnamed: 7',
 'Unnamed: 8',
 'Unnamed: 9',
 'Unnamed: 10',
 'Unnamed: 11',
 'Unnamed: 12',
 'Unnamed: 13']

أول 5 موظفين:
("الموظف 1: ['', 'Employee ID', 'Gender', 'HireDate', 'Date of Birth', "
 '"Today\'s Date", \'Age\', \'Age Description\', \'Length of Service\', \'LOS '
 "Range', 'Department', 'Job Grade', 'Recruitment Sources', 'Performance "
 "Rating']")
("الموظف 2: ['', '0001', 'M', '2020-07-31 00:00:00', '1999-04-11 00:00:00', "
 "'2021-12-27 00:00:00', '22', 'Generation Z', '1', '0-3 years', 'Sales', "
 "'Officer', 'Corporate Job Fair', 'Fully Meets']")
("الموظف 3: ['', '0002', 'M', '2010-02-26 00:00:00', '1978-04-05 00:00:00', "
 "'2021-12-27 00:00:00', '43'

## 5) Reading CSV with pandas (`pd.read_csv`)

In [6]:
df = pd.read_csv("PracticeFile_EmployeeData_v2_Employee Data.csv", encoding="utf-8")

print("البيانات أعمدة:", df.columns.tolist())
print("\nأول 5 موظفين:")
for i, row in df.head(5).iterrows():
    print(f"الموظف {i+1}: {row.tolist()}")


البيانات أعمدة: ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13']

أول 5 موظفين:
الموظف 1: [nan, 'Employee ID', 'Gender', 'HireDate', 'Date of Birth', "Today's Date", 'Age', 'Age Description', 'Length of Service', 'LOS Range', 'Department', 'Job Grade', 'Recruitment Sources', 'Performance Rating']
الموظف 2: [nan, '0001', 'M', '2020-07-31 00:00:00', '1999-04-11 00:00:00', '2021-12-27 00:00:00', '22', 'Generation Z', '1', '0-3 years', 'Sales', 'Officer', 'Corporate Job Fair', 'Fully Meets']
الموظف 3: [nan, '0002', 'M', '2010-02-26 00:00:00', '1978-04-05 00:00:00', '2021-12-27 00:00:00', '43', 'Generation X', '11', '8-14 years', 'Sales', 'Officer', 'Social Media Platforms', 'Fully Meets']
الموظف 4: [nan, '0003', 'M', '2007-12-12 00:00:00', '1963-06-10 00:00:00', '2021-12-27 00:00:00', '58', 'Baby Boomers', '14', '8-14 years', 'Production', 

## 6) Checking file existence, size, and read permission (os module)

In [7]:
import os

file_path = "PracticeFile_EmployeeData_v2_Employee Data.csv"

if os.path.exists(file_path):
    print(f"الملف موجود: {file_path}")

    file_size = os.path.getsize(file_path)
    print(f"حجم الملف: {file_size} بايت")

    if os.access(file_path, os.R_OK):
        print("الملف قابل للقراءة")
    else:
        print("الملف غير قابل للقراءة")
else:
    print(f"الملف غير موجود: {file_path}")


الملف موجود: PracticeFile_EmployeeData_v2_Employee Data.csv
حجم الملف: 52421 بايت
الملف قابل للقراءة


## 7) Creating folders and saving a monthly text summary

In [8]:
import os

reports_folder = "HR_Reports"
if not os.path.exists(reports_folder):
    os.makedirs(reports_folder)
    print(f"تم إنشاء مجلد: {reports_folder}")
else:
    print(f"المجلد موجود بالفعل: {reports_folder}")

# A minimal employee_records placeholder (chapter refers to it conceptually)
employee_records = ["E001", "E002", "E003", "E004", "E005"]

report_path = os.path.join(reports_folder, "monthly_summary.txt")

with open(report_path, 'w', encoding='utf-8') as file:
    file.write("تقرير الموارد البشرية الشهري\n")
    file.write("="*30 + "\n")
    file.write(f"إجمالي الموظفين: {len(employee_records)}\n")

print(f"تم حفظ التقرير في: {report_path}")


تم إنشاء مجلد: HR_Reports
تم حفظ التقرير في: HR_Reports/monthly_summary.txt


## 8) Exception handling when opening files (robust reader)

In [9]:
def read_employee_file_safely(file_path):
    """قراءة ملف الموظفين مع التعامل مع الأخطاء المحتملة"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        print(f"تم قراءة الملف بنجاح: {file_path}")
        return content

    except FileNotFoundError:
        print(f"خطأ: الملف غير موجود - {file_path}")
        return None

    except PermissionError:
        print(f"خطأ: لا توجد صلاحية لقراءة الملف - {file_path}")
        return None

    except UnicodeDecodeError:
        print(f"خطأ: مشكلة في ترميز الملف - {file_path}")
        print("محاولة قراءة الملف بترميز مختلف (latin-1)...")
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                content = file.read()
            print("تم قراءة الملف بترميز بديل")
            return content
        except Exception:
            print("فشل في قراءة الملف بأي ترميز")
            return None

    except Exception as e:
        print(f"خطأ غير متوقع: {str(e)}")
        return None

file_content = read_employee_file_safely("PracticeFile_EmployeeData_v2_Employee Data.csv")
if file_content:
    print("تم تحميل البيانات بنجاح!")


تم قراءة الملف بنجاح: PracticeFile_EmployeeData_v2_Employee Data.csv
تم تحميل البيانات بنجاح!


## 9) Working with Excel (load all sheets, inspect a sheet)

If the chapter Excel file is not available, this notebook will generate a small demo Excel file with one sheet.


In [10]:
excel_path = Path("Charts_Case_Study_Final_Raw_data_to_clean.xlsx")

# Read all sheets
all_sheets = pd.read_excel(excel_path, sheet_name=None)
print("أوراق البيانات المتاحة:", list(all_sheets.keys()))

# Read a specific sheet
hr_data = pd.read_excel(excel_path, sheet_name="Charts_Case_Study_Final_Raw dat")
hr_data.head()


أوراق البيانات المتاحة: ['Charts_Case_Study_Final_Raw dat']


Unnamed: 0,S.no,Employee Name,Employee Number,State,Zip,Age,Sex,MaritalDesc,CitizenDesc,Date of Hire,Department,Position,Pay Rate,Manager Name,Employee Source,Performance Score,Email
0,1.0,"Brown, Mia",1103024456,MA,1450,32,Female,Married,US Citizen,2020-07-02,Admin Offices,Accountant I,28.5,Brandon R. LeBlanc,Internal,Fully Meets,email1@example.com
1,2.0,"LaRotonda, William",1106026572,MA,1460,33,Male,Divorced,US Citizen,2020-07-08,Admin Offices,Accountant I,23.0,Brandon R. LeBlanc,Website Banner Ads,Fully Meets,email2@example.net
2,3.0,"Steans, Tyrone",1302053333,MA,2703,31,Male,Single,US Citizen,2020-07-14,Admin Offices,Accountant I,29.0,Brandon R. LeBlanc,Internet Search,Fully Meets,email3@example.org
3,4.0,"Howard, Estelle",1211050782,MA,2170,32,Female,Married,US Citizen,2020-07-15,Admin Offices,Administrative Assistant,21.5,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review,email4@example.info
4,5.0,"Singh, Nan",1307059817,MA,2330,29,Female,Single,US Citizen,2020-07-12,Admin Offices,Administrative Assistant,16.56,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review,email5@example.biz


## 10) Inspect structure, columns, info, and descriptive statistics

In [12]:
print(hr_data.shape)

print("\nأسماء الأعمدة:")
pp.pprint(hr_data.columns.tolist())

print("\nمعلومات البيانات:")
hr_data.info()

print("\nالإحصائيات الوصفية:")
# Filter for numeric columns before calling describe for compatibility with older pandas versions
numeric_hr_data = hr_data.select_dtypes(include=['number'])
print(numeric_hr_data.describe())

(44, 17)

أسماء الأعمدة:
['S.no',
 'Employee Name',
 'Employee Number',
 'State',
 'Zip',
 'Age',
 'Sex',
 'MaritalDesc',
 'CitizenDesc',
 'Date of Hire',
 'Department',
 'Position',
 'Pay Rate',
 'Manager Name',
 'Employee Source',
 'Performance Score',
 'Email']

معلومات البيانات:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   S.no               43 non-null     float64       
 1   Employee Name      44 non-null     object        
 2   Employee Number    44 non-null     int64         
 3   State              44 non-null     object        
 4   Zip                44 non-null     int64         
 5   Age                44 non-null     int64         
 6   Sex                44 non-null     object        
 7   MaritalDesc        44 non-null     object        
 8   CitizenDesc        44 non-null     object        
 9   Dat

## 11) Missing values: count, drop rows with missing `S.no`, fill `Position` with a default

In [13]:
missing_values = hr_data.isnull().sum()
print("القيم المفقودة في كل عمود:")
print(missing_values[missing_values > 0])

hr_data = hr_data.dropna(subset=["S.no"])
hr_data = hr_data.fillna({"Position": "غير محدد"})

print("\nبعد المعالجة:")
print(hr_data.shape)

distinct_values = hr_data["Position"].unique()
print("\nالقيم الفريدة في Position:")
print(distinct_values)


القيم المفقودة في كل عمود:
S.no        1
Position    1
dtype: int64

بعد المعالجة:
(43, 17)

القيم الفريدة في Position:
['Accountant I' 'Administrative Assistant' 'Shared Services Manager'
 'Sr. Accountant' 'President & CEO' 'CIO' 'Database Administrator'
 'IT Director' 'IT Manager - DB' 'IT Manager - Infra'
 'IT Manager - Support' 'IT Support' 'غير محدد' 'Network Engineer'
 'Sr. DBA']


## 12) Department distribution and average pay rate (loop + dictionaries)

In [14]:
department_counts = {}
for dept in hr_data["Department"]:
    if dept in department_counts:
        department_counts[dept] += 1
    else:
        department_counts[dept] = 1

print("توزيع الموظفين حسب القسم:")
for dept, count in department_counts.items():
    print(f"{dept}: {count}")

# Average salary by department (if column exists)
if "Pay Rate" in hr_data.columns:
    salary_sums = {}
    salary_counts = {}

    for dept, salary in zip(hr_data["Department"], hr_data["Pay Rate"]):
        if dept in salary_sums:
            salary_sums[dept] += salary
            salary_counts[dept] += 1
        else:
            salary_sums[dept] = salary
            salary_counts[dept] = 1

    print("\nمتوسط الراتب حسب القسم:")
    for dept in salary_sums:
        average_salary = salary_sums[dept] / salary_counts[dept]
        print(f"{dept}: {average_salary:.2f}")


توزيع الموظفين حسب القسم:
Admin Offices: 10
Executive Office: 2
IT/IS: 31

متوسط الراتب حسب القسم:
Admin Offices: 31.90
Executive Office: 55.70
IT/IS: 42.19


## 13) Basic preprocessing: dates, years of service, text cleaning, email validation

In [15]:
# Convert date column to datetime
hr_data["Date of Hire"] = pd.to_datetime(hr_data["Date of Hire"], errors="coerce")

# Years of service
current_date = datetime.now()
hr_data["Years_of_Service"] = (current_date - hr_data["Date of Hire"]).dt.days / 365.25

# Clean employee names and validate emails (presence of '@')
if "Employee Name" in hr_data.columns:
    hr_data["Employee Name"] = hr_data["Employee Name"].astype(str).str.strip()
    hr_data["Employee Name"] = hr_data["Employee Name"].str.title()

if "Email" in hr_data.columns:
    invalid_emails = hr_data[~hr_data["Email"].astype(str).str.contains("@", na=False)]
    print(f"عدد الإيميلات غير الصحيحة: {len(invalid_emails)}")

valid_emails = hr_data["Email"].astype(str).str.contains("@", na=False)
hr_data_cleaned = hr_data[valid_emails].copy()

hr_data_cleaned.head()


عدد الإيميلات غير الصحيحة: 1


Unnamed: 0,S.no,Employee Name,Employee Number,State,Zip,Age,Sex,MaritalDesc,CitizenDesc,Date of Hire,Department,Position,Pay Rate,Manager Name,Employee Source,Performance Score,Email,Years_of_Service
0,1.0,"Brown, Mia",1103024456,MA,1450,32,Female,Married,US Citizen,2020-07-02,Admin Offices,Accountant I,28.5,Brandon R. LeBlanc,Internal,Fully Meets,email1@example.com,5.54141
1,2.0,"Larotonda, William",1106026572,MA,1460,33,Male,Divorced,US Citizen,2020-07-08,Admin Offices,Accountant I,23.0,Brandon R. LeBlanc,Website Banner Ads,Fully Meets,email2@example.net,5.524983
2,3.0,"Steans, Tyrone",1302053333,MA,2703,31,Male,Single,US Citizen,2020-07-14,Admin Offices,Accountant I,29.0,Brandon R. LeBlanc,Internet Search,Fully Meets,email3@example.org,5.508556
3,4.0,"Howard, Estelle",1211050782,MA,2170,32,Female,Married,US Citizen,2020-07-15,Admin Offices,Administrative Assistant,21.5,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review,email4@example.info,5.505818
4,5.0,"Singh, Nan",1307059817,MA,2330,29,Female,Single,US Citizen,2020-07-12,Admin Offices,Administrative Assistant,16.56,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review,email5@example.biz,5.514031


## 14) Export cleaned data to a new Excel file

In [16]:
out_xlsx = Path("HR_Summary_Report.xlsx")
with pd.ExcelWriter(out_xlsx) as writer:
    hr_data_cleaned.to_excel(writer, sheet_name="Clean_Data", index=False)

print("تم حفظ التقرير في ملف:", out_xlsx.resolve())


تم حفظ التقرير في ملف: /content/HR_Summary_Report.xlsx
