## **DATA INGESTION**

In [2]:
import pandas as pd
import numpy as np


def _to_raw_github_url(url: str) -> str:
    """Rewrite a github.com file-page URL to its raw.githubusercontent.com form.

    URLs on any other host are returned unchanged, as are github.com URLs
    that already use the ``/raw/`` view.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    if parts.netloc.lower() not in ("github.com", "www.github.com"):
        return url

    # Expected path layout: /<owner>/<repo>/<blob|raw>/<ref>/<path to file>.
    # Looking at the segment position (rather than searching the whole URL for
    # "raw") keeps repos or files whose names contain "raw" convertible.
    segments = parts.path.split("/")
    view = segments[3] if len(segments) > 3 else None
    if view == "raw":
        # Left untouched, exactly as the original substring check did.
        return url
    if view == "blob":
        del segments[3]

    return urlunsplit(
        parts._replace(netloc="raw.githubusercontent.com", path="/".join(segments))
    )


def ingestion(repo_url: str, file_type: str = 'csv', **kwargs):
    """Load a tabular file (typically hosted on GitHub) into a DataFrame.

    Parameters
    ----------
    repo_url : str
        Location of the file. A ``github.com/<owner>/<repo>/blob/...`` page URL
        is rewritten to the matching ``raw.githubusercontent.com`` URL; any
        other value is handed to pandas unchanged.
    file_type : str, default 'csv'
        One of 'csv', 'json', 'xlsx', 'xls' or 'excel'. Matching ignores case,
        surrounding whitespace and a leading dot.
    **kwargs
        Forwarded to the pandas reader (``read_csv`` / ``read_json`` /
        ``read_excel``).

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` if anything went wrong. Failures are
        reported with ``print`` rather than raised, so callers must check for
        ``None`` before using the result.
    """
    try:
        # Only strings can be URLs; other inputs go straight to pandas.
        if isinstance(repo_url, str):
            repo_url = _to_raw_github_url(repo_url)

        readers = {
            'csv': pd.read_csv,
            'json': pd.read_json,
            'xlsx': pd.read_excel,
            'xls': pd.read_excel,
            'excel': pd.read_excel,
        }
        kind = str(file_type).strip().lower().lstrip('.')
        reader = readers.get(kind)
        if reader is None:
            raise ValueError(f"Unsupported file type: {file_type}")

        df = reader(repo_url, **kwargs)
        print(f"Successfully loaded {kind.upper()} file from GitHub.")
        return df

    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Failed to ingest data: {e}")
        return None

# Load the EasyVisa practice dataset and preview its first ten rows.
# ingestion() returns None on failure, in which case the preview below raises.
df = ingestion(
    "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv",
    file_type="csv",
)
print(df.head(10))

Successfully loaded CSV file from GitHub.
  case_id      continent education_of_employee has_job_experience  \
0  EZYV01           Asia           High School                  N   
1  EZYV02           Asia              Master's                  Y   
2  EZYV03           Asia            Bachelor's                  N   
3  EZYV04           Asia            Bachelor's                  N   
4  EZYV05         Africa              Master's                  Y   
5  EZYV06           Asia              Master's                  Y   
6  EZYV07           Asia            Bachelor's                  N   
7  EZYV08  North America            Bachelor's                  Y   
8  EZYV09           Asia            Bachelor's                  N   
9  EZYV10         Europe             Doctorate                  Y   

  requires_job_training  no_of_employees  yr_of_estab region_of_employment  \
0                     N            14513         2007                 West   
1                     N             2412  

## **DATA INFO**

In [15]:

def inform(df: pd.DataFrame):
    """Build a profiling summary of ``df`` and print a short headline report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    dict
        Keys: 'shape', 'columns', 'data_types', 'missing_values',
        'missing_percentage', 'duplicate_rows', 'numeric_summary',
        'categorical_summary', 'unique_values' and, only when the frame has
        more than one numeric column, 'correlation_matrix'.
        'numeric_summary' / 'categorical_summary' are ``{}`` when the frame
        has no columns of that kind.
    """
    summary = {}
    null_counts = df.isnull().sum()
    numeric_part = df.select_dtypes(include=[np.number])
    categorical_part = df.select_dtypes(include=['object', 'category'])

    # Basic info
    summary['shape'] = df.shape
    summary['columns'] = list(df.columns)
    summary['data_types'] = df.dtypes.to_dict()

    # Missing values
    summary['missing_values'] = null_counts.to_dict()
    summary['missing_percentage'] = (df.isnull().mean() * 100).round(2).to_dict()

    # Duplicates (plain int so the summary stays easy to serialise)
    summary['duplicate_rows'] = int(df.duplicated().sum())

    # Descriptive statistics.
    # describe(include=...) raises ValueError when no column matches, so each
    # call is made only if the frame actually has columns of that kind.
    if numeric_part.shape[1] > 0:
        summary['numeric_summary'] = df.describe(include=[np.number]).to_dict()
    else:
        summary['numeric_summary'] = {}

    if categorical_part.shape[1] > 0:
        summary['categorical_summary'] = df.describe(include=['object', 'category']).to_dict()
    else:
        summary['categorical_summary'] = {}

    # Unique values per column
    summary['unique_values'] = {col: df[col].nunique() for col in df.columns}

    # Correlations need at least two numeric columns to be meaningful
    if numeric_part.shape[1] > 1:
        summary['correlation_matrix'] = df.corr(numeric_only=True).round(2).to_dict()

    print("Dataset Info Complete!")
    print(f"Shape: {summary['shape']}")
    print(f"Columns: {len(summary['columns'])}")
    print(f"Missing Values: {int(null_counts.sum())}")
    print(f"Duplicate Rows: {summary['duplicate_rows']}")

    return summary

# Profile the raw frame; as the cell's last expression, the returned summary dict is displayed.
inform(df)


Dataset Info Complete!
Shape: (25480, 12)
Columns: 12
Missing Values: 0
Duplicate Rows: 0


{'shape': (25480, 12),
 'columns': ['case_id',
  'continent',
  'education_of_employee',
  'has_job_experience',
  'requires_job_training',
  'no_of_employees',
  'yr_of_estab',
  'region_of_employment',
  'prevailing_wage',
  'unit_of_wage',
  'full_time_position',
  'case_status'],
 'data_types': {'case_id': dtype('O'),
  'continent': dtype('O'),
  'education_of_employee': dtype('O'),
  'has_job_experience': dtype('O'),
  'requires_job_training': dtype('O'),
  'no_of_employees': dtype('int64'),
  'yr_of_estab': dtype('int64'),
  'region_of_employment': dtype('O'),
  'prevailing_wage': dtype('float64'),
  'unit_of_wage': dtype('O'),
  'full_time_position': dtype('O'),
  'case_status': dtype('O')},
 'missing_values': {'case_id': 0,
  'continent': 0,
  'education_of_employee': 0,
  'has_job_experience': 0,
  'requires_job_training': 0,
  'no_of_employees': 0,
  'yr_of_estab': 0,
  'region_of_employment': 0,
  'prevailing_wage': 0,
  'unit_of_wage': 0,
  'full_time_position': 0,
  'case_

## **DATA CLEANING**

In [21]:
# Work on a copy so the raw `df` stays untouched by the cleaning applied to df_eda later on.
df_eda = df.copy()

In [22]:
# Preview the first five rows of the working copy.
df_eda.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [23]:
# Profile the working copy; at this point df_eda is still an unmodified copy of df.
inform(df_eda)

Dataset Info Complete!
Shape: (25480, 12)
Columns: 12
Missing Values: 0
Duplicate Rows: 0


{'shape': (25480, 12),
 'columns': ['case_id',
  'continent',
  'education_of_employee',
  'has_job_experience',
  'requires_job_training',
  'no_of_employees',
  'yr_of_estab',
  'region_of_employment',
  'prevailing_wage',
  'unit_of_wage',
  'full_time_position',
  'case_status'],
 'data_types': {'case_id': dtype('O'),
  'continent': dtype('O'),
  'education_of_employee': dtype('O'),
  'has_job_experience': dtype('O'),
  'requires_job_training': dtype('O'),
  'no_of_employees': dtype('int64'),
  'yr_of_estab': dtype('int64'),
  'region_of_employment': dtype('O'),
  'prevailing_wage': dtype('float64'),
  'unit_of_wage': dtype('O'),
  'full_time_position': dtype('O'),
  'case_status': dtype('O')},
 'missing_values': {'case_id': 0,
  'continent': 0,
  'education_of_employee': 0,
  'has_job_experience': 0,
  'requires_job_training': 0,
  'no_of_employees': 0,
  'yr_of_estab': 0,
  'region_of_employment': 0,
  'prevailing_wage': 0,
  'unit_of_wage': 0,
  'full_time_position': 0,
  'case_

In [24]:

def _fill_missing(column: pd.Series) -> pd.Series:
    """Return ``column`` with NaNs filled: median for numbers, mode for text/categories, else "Unknown"."""
    is_category = isinstance(column.dtype, pd.CategoricalDtype)

    if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
        return column.fillna(column.median())

    if is_category or pd.api.types.is_object_dtype(column):
        modes = column.mode()
        if len(modes) > 0:
            return column.fillna(modes.iloc[0])
        # A categorical can only be filled with a value it already lists.
        if is_category and "Unknown" not in column.cat.categories:
            column = column.cat.add_categories(["Unknown"])
        return column.fillna("Unknown")

    return column.fillna("Unknown")


def clean(df: pd.DataFrame,inplace: bool = False):
    """Standardise column names, fill missing values and normalise text columns.

    Steps, in order:
      1. Column names are stripped, lower-cased, and spaces/hyphens become "_".
      2. Blank or whitespace-only string cells become NaN.
      3. Columns containing NaN are filled: numeric columns with their median,
         object/category columns with their most frequent value (or "Unknown"
         when there is none), every other dtype with "Unknown".
      4. Object columns are cast to str, stripped and lower-cased; those with
         at most 20 distinct values are converted to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified directly and the same object is returned.
    inplace : bool, default False
        NOTE(review): this flag is not consulted. The function has always
        mutated ``df``, and the notebook's later cells rely on that, so the
        behaviour is kept. Honouring ``inplace=False`` would require callers
        to re-bind the returned frame.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after cleaning.
    """
    # Standardize column names
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("-", "_")
    )

    # Replace blank strings or whitespace-only cells with NaN
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    # Handle missing values.
    # Columns without NaN are skipped, so nothing is passed to fillna needlessly
    # (this includes category columns produced by an earlier run of this
    # function). The result is assigned back to the frame: calling
    # fillna(inplace=True) on the temporary `df[col]` is not guaranteed to
    # reach `df`.
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        df[col] = _fill_missing(column)

    # Handle invalid numeric values. These columns are already numeric, so the
    # coercion is a safety net rather than a conversion step.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Normalise text columns, then store low-cardinality ones as category
    for col in df.select_dtypes(include='object').columns:
        normalised = df[col].astype(str).str.strip().str.lower()
        if normalised.nunique() <= 20:
            normalised = normalised.astype('category')
        df[col] = normalised

    print("Data cleaning complete!")
    print(f"Final shape: {df.shape}")
    return df

# clean() works on the frame it is given and returns that same object, so df_eda itself is cleaned here.
clean(df_eda)

Data cleaning complete!
Final shape: (25480, 12)


Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,ezyv01,asia,high school,n,n,14513,2007,west,592.2029,hour,y,denied
1,ezyv02,asia,master's,y,n,2412,2002,northeast,83425.6500,year,y,certified
2,ezyv03,asia,bachelor's,n,y,44444,2008,west,122996.8600,year,y,denied
3,ezyv04,asia,bachelor's,n,n,98,1897,west,83434.0300,year,y,denied
4,ezyv05,africa,master's,y,n,1082,2005,south,149907.3900,year,y,certified
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,ezyv25476,asia,bachelor's,y,y,2601,2008,south,77092.5700,year,y,certified
25476,ezyv25477,asia,high school,y,n,3274,2006,northeast,279174.7900,year,y,certified
25477,ezyv25478,asia,master's,y,n,1121,1910,south,146298.8500,year,n,certified
25478,ezyv25479,asia,master's,y,y,1918,1887,west,86154.7700,year,y,certified


## **SAVING A COPY OF THE DATA**

In [27]:
import os

def save_data(df: pd.DataFrame, file_name: str = "cleaned_data.csv", folder: str = "data_output"):
    """Write ``df`` to ``folder/file_name`` in the format implied by the extension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save; the index is not written.
    file_name : str, default "cleaned_data.csv"
        Target file name. The extension selects the writer: ``.csv``,
        ``.xlsx`` / ``.xls`` or ``.parquet`` (case-insensitive). It may contain
        sub-directories, which are created as needed.
    folder : str, default "data_output"
        Directory to save into. Created if missing; an empty string means the
        current working directory.

    Returns
    -------
    str or None
        The path that was written, or ``None`` if saving failed. Failures are
        reported with ``print`` rather than raised.
    """
    try:
        # Construct full path
        file_path = os.path.join(folder, file_name)

        # Pick the writer from the file extension
        ext = os.path.splitext(file_name)[1].lower()
        writers = {
            ".csv": df.to_csv,
            ".xlsx": df.to_excel,
            ".xls": df.to_excel,
            ".parquet": df.to_parquet,
        }
        writer = writers.get(ext)
        if writer is None:
            raise ValueError("Unsupported file format. Use .csv, .xlsx, or .parquet")

        # Create the directory only once the format is known to be valid, and
        # create the file's actual parent so nested file names work too.
        # dirname is empty when saving into the current directory, and
        # os.makedirs("") would fail.
        target_dir = os.path.dirname(file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        writer(file_path, index=False)

        print(f"Data saved successfully at: {file_path}")
        return file_path

    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Failed to save data: {e}")
        return None

# Save the working frame as eda_data.csv in save_data's default folder ("data_output").
save_data(df_eda, "eda_data.csv")

Data saved successfully at: data_output\eda_data.csv


'data_output\\eda_data.csv'