In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the raw visa application records from disk and print the frame for a first look.
csv_path = "visa_data.csv"
df = pd.read_csv(csv_path)
print(df)

     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
1              1002            Canada   Tourist               NaN   
2              1003            France   Tourist           Toronto   
3              1004            Canada   Student         New Delhi   
4              1005           Germany      Work         Hyderabad   
..              ...               ...       ...               ...   
495            1496               NaN   Student               NaN   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500               NaN      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  
0         2022-03-19    2022-06-26             62              22.0  
1         2022-06-06    2022-08

In [3]:
# Summarise column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Application_ID     500 non-null    int64  
 1   Applicant_Country  432 non-null    object 
 2   Visa_Type          500 non-null    object 
 3   Processing_Office  433 non-null    object 
 4   Application_Date   500 non-null    object 
 5   Decision_Date      493 non-null    object 
 6   Applicant_Age      500 non-null    int64  
 7   Experience_Years   247 non-null    float64
dtypes: float64(1), int64(2), object(5)
memory usage: 31.4+ KB


In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Application_ID         0
Applicant_Country     68
Visa_Type              0
Processing_Office     67
Application_Date       0
Decision_Date          7
Applicant_Age          0
Experience_Years     253
dtype: int64

In [5]:
# Categorical gaps: label missing country / office values with an explicit "Unknown" category.
for categorical_column in ('Applicant_Country', 'Processing_Office'):
    df[categorical_column] = df[categorical_column].fillna("Unknown")
print(df)

     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
1              1002            Canada   Tourist           Unknown   
2              1003            France   Tourist           Toronto   
3              1004            Canada   Student         New Delhi   
4              1005           Germany      Work         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  
0         2022-03-19    2022-06-26             62              22.0  
1         2022-06-06    2022-08

In [6]:
# Numerical gaps: impute missing experience with the median of the observed values.
experience_median = df['Experience_Years'].median()
df['Experience_Years'] = df['Experience_Years'].fillna(experience_median)
print(df)

     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
1              1002            Canada   Tourist           Unknown   
2              1003            France   Tourist           Toronto   
3              1004            Canada   Student         New Delhi   
4              1005           Germany      Work         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  
0         2022-03-19    2022-06-26             62              22.0  
1         2022-06-06    2022-08

In [7]:
# The target is derived from Decision_Date, so rows without one cannot be used.
required_for_target = ['Decision_Date']
df.dropna(subset=required_for_target, inplace=True)
print(df)

     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
1              1002            Canada   Tourist           Unknown   
2              1003            France   Tourist           Toronto   
3              1004            Canada   Student         New Delhi   
4              1005           Germany      Work         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  
0         2022-03-19    2022-06-26             62              22.0  
1         2022-06-06    2022-08

In [8]:
# Parse both date columns; errors='coerce' turns unparseable strings into NaT.
date_columns = ['Application_Date', 'Decision_Date']
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Discard rows where either date is missing after parsing.
df.dropna(subset=date_columns, inplace=True)
print(df)


     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
1              1002            Canada   Tourist           Unknown   
2              1003            France   Tourist           Toronto   
3              1004            Canada   Student         New Delhi   
4              1005           Germany      Work         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  
0         2022-03-19    2022-06-26             62              22.0  
1         2022-06-06    2022-08

In [9]:
# Remove invalid age values: keep only rows with an age strictly between 0 and 100.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so columns can be added to it afterwards without a SettingWithCopyWarning.
df = df[(df['Applicant_Age'] > 0) & (df['Applicant_Age'] < 100)].copy()
print(df)

     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
4              1005           Germany      Work         Hyderabad   
5              1006           Germany   Student           Unknown   
6              1007               USA   Tourist            London   
7              1008           Germany   Student         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  
0         2022-03-19    2022-06-26             62              22.0  
4         2022-09-03    2022-09

In [10]:
# Create the target variable: whole days elapsed between application and decision.
# assign() builds a new frame rather than writing a column into a filtered
# slice, which avoids SettingWithCopyWarning.
df = df.assign(
    Processing_Time_Days=(df['Decision_Date'] - df['Application_Date']).dt.days
)
print("\n After calculating processing days:\n", df)


 After calculating processing days:
      Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
4              1005           Germany      Work         Hyderabad   
5              1006           Germany   Student           Unknown   
6              1007               USA   Tourist            London   
7              1008           Germany   Student         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  \
0         2022-03-19    2022-06-26             62             

In [11]:
# Remove negative processing times (a decision dated before its application).
# .copy() detaches the result from the frame it was sliced from, so later
# in-place edits act on an independent frame instead of a slice.
df = df[df['Processing_Time_Days'] >= 0].copy()
print(df)


     Application_ID Applicant_Country Visa_Type Processing_Office  \
0              1001               USA      Work            London   
4              1005           Germany      Work         Hyderabad   
5              1006           Germany   Student           Unknown   
6              1007               USA   Tourist            London   
7              1008           Germany   Student         Hyderabad   
..              ...               ...       ...               ...   
495            1496           Unknown   Student           Unknown   
496            1497            Canada   Tourist         Hyderabad   
497            1498           Germany   Tourist            Sydney   
498            1499         Australia   Student        California   
499            1500           Unknown      Work         Hyderabad   

    Application_Date Decision_Date  Applicant_Age  Experience_Years  \
0         2022-03-19    2022-06-26             62              22.0   
4         2022-09-03    2022-

In [12]:
# Drop the Application_ID column from the working frame.
# Reassigning (rather than inplace=True) avoids mutating a filtered slice, and
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['Application_ID'], errors='ignore')
print(df)

    Applicant_Country Visa_Type Processing_Office Application_Date  \
0                 USA      Work            London       2022-03-19   
4             Germany      Work         Hyderabad       2022-09-03   
5             Germany   Student           Unknown       2022-11-28   
6                 USA   Tourist            London       2022-03-25   
7             Germany   Student         Hyderabad       2022-04-04   
..                ...       ...               ...              ...   
495           Unknown   Student           Unknown       2022-06-07   
496            Canada   Tourist         Hyderabad       2022-02-04   
497           Germany   Tourist            Sydney       2022-10-12   
498         Australia   Student        California       2022-08-05   
499           Unknown      Work         Hyderabad       2022-03-03   

    Decision_Date  Applicant_Age  Experience_Years  Processing_Time_Days  
0      2022-06-26             62              22.0                    99  
4      20

In [13]:
# One-hot encode the categorical columns; every other column passes through unchanged.
categorical_columns = ['Applicant_Country', 'Visa_Type', 'Processing_Office']
df_encoded = pd.get_dummies(df, columns=categorical_columns)
print("\n Encoded DataFrame:\n", df_encoded)


 Encoded DataFrame:
     Application_Date Decision_Date  Applicant_Age  Experience_Years  \
0         2022-03-19    2022-06-26             62              22.0   
4         2022-09-03    2022-09-26             60              24.0   
5         2022-11-28    2022-12-08             75              19.0   
6         2022-03-25    2022-06-30             63               4.0   
7         2022-04-04    2022-04-04             71              12.0   
..               ...           ...            ...               ...   
495       2022-06-07    2022-09-23             69              12.0   
496       2022-02-04    2022-04-03             75              12.0   
497       2022-10-12    2023-01-10             92              21.0   
498       2022-08-05    2022-12-02             39              21.0   
499       2022-03-03    2022-04-30             96              12.0   

     Processing_Time_Days  Applicant_Country_Australia  \
0                      99                        False   
4        