In [1]:
# ================================
# STEP 1: Import Required Libraries
# ================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# ================================
# STEP 2: Load the Dataset
# ================================
# Change the path if needed
df = pd.read_csv(r"C:\Users\MEGHAMALA\Downloads\EasyVisa.csv\EasyVisa.csv")

print("Initial Dataset Shape:", df.shape)
print(df.head())

# ================================
# STEP 3: Check Missing Values
# ================================
print("\nMissing Values:\n", df.isnull().sum())

# (Dataset has no missing values, so no imputation needed)


def _map_or_raise(series, mapping):
    """Encode ``series`` through ``mapping`` and fail loudly on unknown values.

    ``Series.map`` with a dict turns every value that is not a key of the
    dict into NaN without any warning. This helper performs the same mapping
    but raises if a value that was present (non-null) in the input ended up
    as NaN, so typos or unexpected categories cannot slip silently into the
    saved dataset.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.
    mapping : dict
        Raw value -> encoded value.

    Returns
    -------
    pandas.Series
        The encoded column (same index as ``series``).

    Raises
    ------
    ValueError
        If ``series`` contains a non-null value that is not a key of
        ``mapping``.
    """
    encoded = series.map(mapping)
    # Values that were present before the mapping but are missing after it.
    unmapped = series[encoded.isna() & series.notna()]
    if not unmapped.empty:
        raise ValueError(
            f"Column {series.name!r} contains values not covered by the "
            f"encoding {mapping}: {unmapped.unique().tolist()}"
        )
    return encoded


# ================================
# STEP 4: Encode Binary Categorical Columns
# ================================
binary_columns = [
    "has_job_experience",
    "requires_job_training",
    "full_time_position"
]

for col in binary_columns:
    df[col] = _map_or_raise(df[col], {"Y": 1, "N": 0})

# ================================
# STEP 5: Encode Target Variable
# ================================
# Certified -> 1 (Approved)
# Denied -> 0 (Rejected)
df["case_status"] = _map_or_raise(df["case_status"], {
    "Certified": 1,
    "Denied": 0
})

# ================================
# STEP 6: One-Hot Encode Categorical Features
# ================================
categorical_columns = [
    "continent",
    "education_of_employee",
    "region_of_employment",
    "unit_of_wage"
]

# drop_first=True drops one dummy per feature, leaving k-1 columns for k levels.
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# ================================
# STEP 7: Feature Scaling (Numerical Columns)
# ================================
numerical_columns = [
    "no_of_employees",
    "yr_of_estab",
    "prevailing_wage"
]

# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# is made from the saved file later, the test rows have already influenced the
# mean/std used here -- confirm this is acceptable for the downstream model.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# ================================
# STEP 8: Drop Irrelevant Columns
# ================================
# case_id is an identifier, not useful for ML
df = df.drop(columns="case_id")

# ================================
# STEP 9: Final Dataset Check
# ================================
print("\nFinal Dataset Shape:", df.shape)
print(df.head())

# ================================
# STEP 10: Save Preprocessed Dataset
# ================================
df.to_csv("EasyVisa_Preprocessed.csv", index=False)

print("\n✅ Data preprocessing completed successfully!")


Initial Dataset Shape: (25480, 12)
  case_id continent education_of_employee has_job_experience  \
0  EZYV01      Asia           High School                  N   
1  EZYV02      Asia              Master's                  Y   
2  EZYV03      Asia            Bachelor's                  N   
3  EZYV04      Asia            Bachelor's                  N   
4  EZYV05    Africa              Master's                  Y   

  requires_job_training  no_of_employees  yr_of_estab region_of_employment  \
0                     N            14513         2007                 West   
1                     N             2412         2002            Northeast   
2                     Y            44444         2008                 West   
3                     N               98         1897                 West   
4                     N             1082         2005                South   

   prevailing_wage unit_of_wage full_time_position case_status  
0         592.2029         Hour               

In [2]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\MEGHAMALA\Downloads\EasyVisa.csv\EasyVisa.csv")

# ================================
# STEP 1: Check Missing Values
# ================================
print("Missing values before handling:\n")
print(df.isnull().sum())

# ================================
# STEP 2: Separate Column Types
# ================================
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns

# ================================
# STEP 3: Handle Missing Numerical Values
# ================================
# Fill with the column median. The result is assigned back to the frame:
# calling fillna(..., inplace=True) on df[col] is chained assignment, which
# warns on recent pandas and does not modify df under Copy-on-Write.
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# ================================
# STEP 4: Handle Missing Categorical Values
# ================================
# Fill with the most frequent value of the column.
for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # mode() is empty when every entry is null; there is nothing to fill
        # with in that case, so the column is left as-is instead of crashing.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# ================================
# STEP 5: Final Check
# ================================
print("\nMissing values after handling:\n")
print(df.isnull().sum())

print("\n✅ Missing value handling completed successfully!")


Missing values before handling:

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

Missing values after handling:

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

✅ Missing value handling completed successfully!


In [3]:
import pandas as pd
import numpy as np

# Read the raw EasyVisa CSV into a fresh DataFrame
df = pd.read_csv(r"C:\Users\MEGHAMALA\Downloads\EasyVisa.csv\EasyVisa.csv")

# -------------------------------
# STEP 1: Encode Target (if not done)
# -------------------------------
# "Certified" becomes 1 and "Denied" becomes 0; any other value maps to NaN.
status_to_label = {"Certified": 1, "Denied": 0}
df["case_status"] = df["case_status"].map(status_to_label)

# -------------------------------
# STEP 2: Generate Processing Time
# -------------------------------
def generate_processing_time(row):
    """Draw a synthetic processing time (in days) for one application row.

    The value is sampled with ``np.random.randint``, whose upper bound is
    exclusive:

    * ``case_status`` is not 1                      -> 30..59
    * ``case_status`` is 1 and the profile is strong -> 7..19
    * ``case_status`` is 1 otherwise                 -> 15..34

    A profile is "strong" when ``has_job_experience`` and
    ``full_time_position`` are both ``"Y"`` and ``prevailing_wage`` is
    strictly greater than 60000.

    Parameters
    ----------
    row : mapping
        Anything indexable by the column names above (e.g. a DataFrame row).

    Returns
    -------
    int
        The sampled number of days.
    """
    # Anything that is not an approved case falls in the slowest band.
    if row["case_status"] != 1:
        return np.random.randint(30, 60)

    strong_profile = (
        row["has_job_experience"] == "Y"
        and row["full_time_position"] == "Y"
        and row["prevailing_wage"] > 60000
    )
    lower, upper = (7, 20) if strong_profile else (15, 35)
    return np.random.randint(lower, upper)

# Apply function
# generate_processing_time samples from NumPy's global random generator, so
# the generator is seeded first: every run of this cell then produces the same
# processing_time_days column and the same saved CSV.
np.random.seed(42)
df["processing_time_days"] = df.apply(generate_processing_time, axis=1)

# -------------------------------
# STEP 3: Verify Output
# -------------------------------
print(df[["case_status", "processing_time_days"]].head())
print("\nProcessing Time Statistics:")
print(df["processing_time_days"].describe())

# -------------------------------
# STEP 4: Save Dataset
# -------------------------------
df.to_csv("EasyVisa_with_Processing_Time.csv", index=False)

print("\n✅ Processing time target label generated successfully!")


   case_status  processing_time_days
0            0                    39
1            1                    13
2            0                    50
3            0                    31
4            1                    14

Processing Time Statistics:
count    25480.000000
mean        28.640149
std         13.703208
min          7.000000
25%         17.000000
50%         27.000000
75%         37.000000
max         59.000000
Name: processing_time_days, dtype: float64

✅ Processing time target label generated successfully!
