# **EASY VISA**

### **DATA INGESTION AND PREPARATION**

In [2]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os


# Read the EasyVisa CSV from its raw GitHub URL into a DataFrame
DATA_URL = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv"
visa = pd.read_csv(DATA_URL)


In [3]:
# Preview the top of the dataset (head() returns five rows by default)
visa.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


#### **Check the size/shape of the dataset**

In [4]:
# Pull the two dimensions out of the dataset's shape tuple
rows = visa.shape[0]
cols = visa.shape[1]

# Report both dimensions, one per line
print(f'Number of rows: {rows}\nNumber of columns: {cols}')

Number of rows: 25480
Number of columns: 12


#### **Check the data types of the columns**

In [5]:
# Summarise each column's name, non-null count and dtype, plus the
# overall memory footprint of the dataset
visa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


#### **Check for duplicates and missing values**

In [6]:
# Count rows that are exact repeats of an earlier row (all columns compared);
# the first occurrence of each row is not counted as a duplicate
visa.duplicated().sum()


np.int64(0)

In [7]:
# Create a copy of the dataset
def create_copy(data=None):
    """Return an independent copy of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to copy. When omitted, the notebook-level ``visa`` frame is
        copied, which is what the original zero-argument call did.

    Returns
    -------
    pandas.DataFrame
        A copy (``DataFrame.copy()``, deep by default) so that edits to the
        result do not affect the source frame.
    """
    # Resolve the global lazily so the helper still works when called with
    # an explicit frame in a context where `visa` is not defined.
    source = visa if data is None else data
    return source.copy()
# Call the helper once to inspect its result; the returned copy is shown as
# the cell output and is not bound to any name here
create_copy()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,Certified
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,Certified
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,Certified
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.8500,Year,N,Certified
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,Certified


In [8]:
# Keep a separate working copy of `visa` under the name df_visa
df_visa = create_copy()

In [None]:
# Check for missing values
def missing_col(df=None):
    """Report missing values per column and handle them in place.

    For every column the missing count is printed, then one action is taken
    based on the share of missing values in that column:

    * exactly 0 %        -> nothing to do
    * above 1 % up to 5 % -> rows missing this column are dropped
    * above 5 %          -> numeric columns are filled with the median,
                            all other columns with the mode
    * above 0 % up to 1 % -> only reported, left untouched

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect and modify in place. Defaults to the notebook-level
        ``df_visa`` frame, matching the original zero-argument call.

    Returns
    -------
    None
        The frame is modified in place; progress is printed.
    """
    if df is None:
        df = df_visa

    for col in df.columns:
        missing_cols = df[col].isna().sum()
        # Guard against an empty frame, where the ratio would be 0/0.
        n_rows = len(df)
        percentage_missing = (missing_cols / n_rows) * 100 if n_rows else 0.0
        print(f"{col} : {missing_cols}")

        if percentage_missing == 0:
            print(f"No missing values")
        elif 1 < percentage_missing <= 5:
            # In place on purpose: the caller's frame (by default the global
            # df_visa) must actually lose the rows; a bare dropna() returns a
            # new frame and would leave the data unchanged.
            df.dropna(subset=[col], inplace=True)
            print(f"Dropped missing values")
        elif percentage_missing > 5:
            # Decide per column from its own dtype; booleans are treated as
            # categorical because a median is not meaningful for them.
            is_numeric = (
                pd.api.types.is_numeric_dtype(df[col])
                and not pd.api.types.is_bool_dtype(df[col])
            )
            if is_numeric:
                df[col] = df[col].fillna(df[col].median())
                print("Filled with median")
            else:
                modes = df[col].mode()
                if modes.empty:
                    # Every value is missing, so there is no mode to fill with.
                    print("No mode available; left unfilled")
                else:
                    df[col] = df[col].fillna(modes.iloc[0])
                    print("Filled with mode")
        else:
            print(f"{col} : {missing_cols}")
# Run the missing-value check defined above
missing_col()

case_id : 0
No missing values
continent : 0
No missing values
education_of_employee : 0
No missing values
has_job_experience : 0
No missing values
requires_job_training : 0
No missing values
no_of_employees : 0
No missing values
yr_of_estab : 0
No missing values
region_of_employment : 0
No missing values
prevailing_wage : 0
No missing values
unit_of_wage : 0
No missing values
full_time_position : 0
No missing values
case_status : 0
No missing values


In [10]:
# Set the case ID as the index.
# Guarded so this cell can be re-run safely: once 'case_id' has moved into the
# index it is no longer a column, and a second set_index would raise KeyError.
if 'case_id' in visa.columns:
    visa = visa.set_index('case_id')

In [11]:
# Index the existing working copy by case ID rather than re-copying `visa`:
# a fresh copy would discard any rows dropped or values filled by the
# missing-value step that already ran on df_visa.
# The guard keeps the cell re-runnable once 'case_id' is already the index.
if 'case_id' in df_visa.columns:
    df_visa = df_visa.set_index('case_id')
df_visa.head(5)

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [12]:
# Save the cleaned dataset
def save_data(path='df_visa.csv', df=None):
    """Write the cleaned dataset to a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Destination file. Defaults to ``'df_visa.csv'``, the file name the
        original zero-argument call wrote to.
    df : pandas.DataFrame, optional
        Frame to save. Defaults to the notebook-level ``df_visa`` frame.

    Returns
    -------
    None
        ``DataFrame.to_csv`` returns nothing when given a path, so there is
        no value to hand back; the file on disk is the result.
    """
    if df is None:
        df = df_visa

    # A named index (e.g. 'case_id' after set_index) carries real data and
    # must be written, otherwise the identifiers are lost from the file.
    # An unnamed default index is still omitted, as before.
    keep_index = any(name is not None for name in df.index.names)
    df.to_csv(path, index=keep_index)

In [13]:
# Write the dataset to disk using the helper defined in the previous cell
save_data()