# Data Pre-Processing

### Import Packages and CSV

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all Python warnings so library notices do not clutter the cell outputs.
# NOTE(review): this is a blanket filter and will also hide warnings that matter.
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point instead of the incidental
# `pd.pandas` attribute, which is not part of the documented API.
pd.set_option("display.max_columns", None)

In [4]:
# Load the CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer=r"Visadataset.csv")

In [5]:
# Report the dataset dimensions as a (rows, columns) tuple.
print((len(df.index), len(df.columns)))

(25480, 12)


## Data Cleaning

- Handling Missing values
- Handling Duplicates
- Check data type
- Understand the dataset

### Check Null Values

In [7]:
# Identify the columns that hold at least one missing (NaN) value.
features_with_na = [col for col in df.columns if df[col].isnull().any()]
# For each affected column, report the share of missing entries as a percentage.
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean() * 100, 5), '% missing values')

In [8]:
# Display the list of columns with missing values; an empty list means none were found.
features_with_na

[]

There are no null values in the dataset

### Other Data Cleaning Steps

Handling Duplicates

In [9]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

0

No Duplicates in the dataset

Remove `case_id` from the dataset, as it cannot be used in model training.

In [10]:
# Drop the case_id column, which is not used for model training.
# Rebinding `df` (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
df = df.drop(columns="case_id", errors="ignore")

# Feature Engineering

## Feature Extraction

In [11]:
# Preview the first five rows of the current DataFrame.
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [13]:
# `date` from the standard library gives access to the system calendar date.
from datetime import date

# Read today's date from the system clock and keep only its calendar year.
# NOTE(review): the value depends on when the notebook is run, so anything
# derived from it will differ between runs in different years.
todays_date = date.today()
current_year = todays_date.year

In [14]:
# Display the year taken from today's date.
current_year

2025

Subtract the year of establishment (`yr_of_estab`) from the current year to get the company's age.

In [15]:
# Company age in years: the current year minus the year the company was established.
df['company_age'] = current_year - df['yr_of_estab']

In [17]:
# Preview the first five rows to confirm the new company_age column.
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,18
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,23
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,17
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,128
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,20


In [18]:
# yr_of_estab has been converted into company_age, so the raw year is dropped.
# Rebinding `df` (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
df = df.drop(columns="yr_of_estab", errors="ignore")

## Type of Features

### Numeric Features

In [19]:
# Select numeric columns by dtype family instead of "anything that is not object":
# the `dtype != 'O'` check would also count bool, datetime, categorical and
# dedicated string dtypes as numeric. Column order is preserved.
num_features = df.select_dtypes(include="number").columns.tolist()
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 3


### Categorical Features

In [20]:
# Treat every non-numeric column as categorical. The `dtype == 'O'` check would
# silently skip columns stored with a categorical or dedicated string dtype.
# Column order is preserved.
cat_features = df.select_dtypes(exclude="number").columns.tolist()
print('Num of Categorical Features :', len(cat_features))

Num of Categorical Features : 8


### Discrete features

In [21]:
# A numeric column counts as discrete when it has at most 25 distinct values
# (missing values, if present, count as one distinct value).
discrete_features = [col for col in num_features if df[col].nunique(dropna=False) <= 25]
print('Num of Discrete Features :', len(discrete_features))

Num of Discrete Features : 0


### Continuous Features

In [22]:
# Numeric columns that were not flagged as discrete are treated as continuous.
continuous_features = [col for col in num_features if col not in discrete_features]
print('Num of Continuous Features :', len(continuous_features))

Num of Continuous Features : 3


### Split X and Y

- Split Dataframe to X and y
- Here `X` holds the independent (feature) columns and `y` holds the dependent (target) column, `case_status`.

In [23]:
# Features: every column except the target label.
X = df.drop(columns=['case_status'])
# Target: the case_status label column.
y = df['case_status']

In [24]:
# Preview the first five target labels.
y.head()

0       Denied
1    Certified
2       Denied
3       Denied
4    Certified
Name: case_status, dtype: object