In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Dependencies: pandas, plus openpyxl for reading .xlsx files. Install with: pip install openpyxl pandas

In [3]:
# Source workbooks, given as paths relative to the notebook's working directory.
# Trade workbook: the "Imports" and "Exports" sheets are loaded from it below.
imports_exports_path = "data/Exports_Imports_by_ISIC_2023_2024.xlsx"
# Second workbook path; it is not read anywhere in the cells shown here.
sut_io_path = "data/SUT and IO By Divisions -En.xlsx"

In [4]:
# Clean the imports and exports data.

# Open the trade workbook and list its sheet names to see which sheets to load.
# NOTE(review): this ExcelFile handle is not closed in the cells shown here;
# consider calling xls.close() once it is no longer needed.
xls = pd.ExcelFile(imports_exports_path)
xls.sheet_names

['Exports', 'Imports', 'Sheet1', 'NOTE']

In [5]:
# Load the "Imports" and "Exports" sheets from the same workbook. header=5
# takes the column labels from the sheet's sixth row (the index is 0-based).
imports, exports = (
    pd.read_excel(imports_exports_path, sheet_name=sheet, header=5)
    for sheet in ("Imports", "Exports")
)

In [6]:
# === Drop fully empty rows, then fully empty columns ===
# Chained, non-mutating calls replace the inplace=True variants: each name is
# rebound to the trimmed result, so the objects returned by read_excel are not
# modified behind the reader's back.
imports = imports.dropna(how="all").dropna(axis=1, how="all")
exports = exports.dropna(how="all").dropna(axis=1, how="all")

In [7]:
def clean_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Every label is converted to ``str``, stripped of leading/trailing
    whitespace, has newlines replaced by spaces, and has each run of
    whitespace collapsed to a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the cleaned labels.
    """
    labels = df.columns.astype(str)
    labels = labels.str.strip()
    labels = labels.str.replace("\n", " ")
    labels = labels.str.replace(r"\s+", " ", regex=True)
    df.columns = labels
    return df

# Normalise the header labels, then renumber the rows 0..n-1 so the index no
# longer carries gaps left by the empty rows dropped earlier. Note that
# reset_index only renumbers rows; it does not remove any.
imports = clean_columns(imports).reset_index(drop=True)
exports = clean_columns(exports).reset_index(drop=True)

In [8]:
# Keep only the rows that have a value in the industry column CC_DESC_EN.
# A frame that lacks this column is left unchanged.
if "CC_DESC_EN" in imports.columns:
    imports = imports.dropna(subset=["CC_DESC_EN"])
if "CC_DESC_EN" in exports.columns:
    exports = exports.dropna(subset=["CC_DESC_EN"])

In [9]:
# Preview the first two rows of the cleaned imports table.
imports.head(2)

Unnamed: 0,CT_TYPE,CC_CODE,CC_DESC_AR,CC_DESC_EN,COMMODTIY_TYPE,COMMODTIY_CODE,COMM_NAME_AR,COMM_NAME_EN,2023,2024,Growth,Share
0,ISI4,24.0,صُنع الفلّزات القاعدية,Manufacture of basic metals,H,71081210.0,سبائك ذهب,GOLD INGOTS,27411960000.0,30524790000.0,0.113558,0.034964
1,ISI4,19.0,صُنع فحم الكوك والمنتجات النفطية المكررة,Manufacture of coke and refined petroleum prod...,H,27101249.0,غيرها من زيوت وقود,OTHER FUEL OILS,24823160000.0,25730140000.0,0.036538,0.029472


In [10]:
# Preview the first two rows of the cleaned exports table.
exports.head(2)

Unnamed: 0,CT_TYPE,CC_CODE,CC_DESC_AR,CC_DESC_EN,COMMODTIY_TYPE,COMMODTIY_CODE,COMM_NAME_AR,COMM_NAME_EN,2023,2024
0,ISI4,1,أنشطة زراعة المحاصيل والإنتاج الحيواني والصيد ...,"Crop and animal production, hunting and relate...",H,1012910,خيول للرياضة,Horses for sport,1817329.0,11227.0
1,ISI4,1,أنشطة زراعة المحاصيل والإنتاج الحيواني والصيد ...,"Crop and animal production, hunting and relate...",H,1012990,خيول غير مذكورة في مكان آخر,Horses not elsewhere specified,2582924.0,2003653.0


In [11]:
# Write the cleaned tables to CSV without the row index.
# Create the output folder first: to_csv raises OSError when the parent
# directory does not exist, which is the case on a fresh checkout.
Path("cleaned_data").mkdir(parents=True, exist_ok=True)
imports.to_csv("cleaned_data/cleaned_imports.csv", index=False)
exports.to_csv("cleaned_data/cleaned_exports.csv", index=False)