In [84]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import os

In [85]:
# Absolute path of this notebook, resolved against the current working directory
current_file_path = os.path.abspath(
    'C2-Essential-Pandas-Techniques-for-DataFrames/C2-01-Selection-and-Organization/C2_1_Selecting_and_Organizing_Columns.ipynb'
)

# Project root: five parent-directory hops above the notebook path
parent_hops = [os.pardir] * 5
root_dir = os.path.abspath(os.path.join(current_file_path, *parent_hops))

# Folder that holds the visa-applications dataset
dataset_dir = os.path.join(root_dir, 'datasets', 'visa-col-application-datagov-df')

# Print the full path of every file found under the dataset folder
for dirname, _, filenames in os.walk(dataset_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

c:\Users\study_2025\Documents\Github\Doc-UP-AlejandroJaimes\Pandas-for-Education-Learning-through-Hands-On-Examples\datasets\visa-col-application-datagov-df\Visa_Applications_Colombia_2017_20250217.csv


In [86]:
# Load the visa-applications CSV from the dataset folder into a DataFrame
csv_path = os.path.join(dataset_dir, 'Visa_Applications_Colombia_2017_20250217.csv')
visa_applications = pd.read_csv(csv_path)

In [87]:
# Map the original Spanish headers to snake_case English column names
new_columns = {
    'Año Solicitud': 'year_application',
    'Nacionalidad': 'nationality',
    'Sexo': 'gender',
    'Fecha de Nacimiento': 'birth_date',
    'Vocación de permanencia': 'permanet_stay_intent',
    'Número': 'number_of_application',
}
visa_applications.rename(columns=new_columns, inplace=True)
list(visa_applications.columns)

['year_application',
 'nationality',
 'gender',
 'birth_date',
 'permanet_stay_intent',
 'number_of_application']

#### **Exercise 1: Selecting Specific Columns**  
**Objective:** Learn how to select individual and multiple columns from a DataFrame.  

##### **Task:**  
1. Select and print only the `nationality` column.  
2. Select the `year_application` and `number_of_application` columns.  
3. Extract the first 10 rows of the `gender` column.  
4. Retrieve all visa applications where the `permanent_stay_intent` is `"Permanente"`.

In [88]:
# 1. Select and print only the nationality column.
# A single label with .loc returns the column as a Series
visa_applications.loc[:, 'nationality'].head()

0            ECUATORIANA
1    FEDERACION DE RUSIA
2               FRANCESA
3                 CUBANA
4         ESTADOUNIDENSE
Name: nationality, dtype: object

In [89]:
# 2. Select the year_application and number_of_application columns.
# Indexing with a list of labels returns a DataFrame holding just those columns
visa_applications[['year_application', 'number_of_application']].head()

Unnamed: 0,year_application,number_of_application
0,2017,2
1,2017,2
2,2017,1
3,2017,2
4,2017,1


In [90]:
# 3. Extract the first 10 rows of the gender column.
# head(10) is positional and returns exactly 10 rows. A label slice such as
# .loc[:10] includes its end label and would return 11 rows (labels 0 through 10).
visa_applications[['gender']].head(10)

Unnamed: 0,gender
0,FEMENINO
1,FEMENINO
2,FEMENINO
3,FEMENINO
4,FEMENINO
5,FEMENINO
6,FEMENINO
7,FEMENINO
8,FEMENINO
9,FEMENINO


In [91]:
# 4. Retrieve all visa applications where the permanent_stay_intent is "Permanente".
def normalize_permanent_stay_intent(value: str) -> str:
    """Collapse a stay-intent label into 'Permanente' or 'Temporal'.

    Args:
        value: Raw label ('Con vocación de permanencia', ...) or a label this
            function has already normalized.

    Returns:
        'Permanente' for the raw permanent-stay label or for an already
        normalized 'Permanente'; 'Temporal' for anything else, including
        missing values.
    """
    # Accept the normalized label as well, so applying the function a second
    # time (e.g. re-running the cell) does not flip 'Permanente' to 'Temporal'.
    if value in ('Con vocación de permanencia', 'Permanente'):
        return 'Permanente'
    return 'Temporal'

# Replace the raw intent labels with their normalized form, then keep the rows labelled 'Permanente'
normalized_intent = visa_applications['permanet_stay_intent'].apply(normalize_permanent_stay_intent)
visa_applications['permanet_stay_intent'] = normalized_intent
mask = visa_applications['permanet_stay_intent'].eq('Permanente')
permanent_stay_applications = visa_applications.loc[mask]

In [92]:
# Number of rows in the filtered frame. len() counts every row, whereas
# DataFrame.value_counts() drops rows containing NaN by default and would
# leave out rows with missing values from the total.
total_psa = len(permanent_stay_applications)
print('Total Permanent Stay Applications:', total_psa)
permanent_stay_applications.head()

Total Permanent Stay Applications: 174909


Unnamed: 0,year_application,nationality,gender,birth_date,permanet_stay_intent,number_of_application
0,2017,ECUATORIANA,FEMENINO,24/07/1897,Permanente,2
13,2017,VENEZOLANA,FEMENINO,27/10/1927,Permanente,1
16,2017,VENEZOLANA,FEMENINO,20/01/1928,Permanente,1
39,2017,VENEZOLANA,FEMENINO,21/10/1930,Permanente,1
51,2017,VENEZOLANA,FEMENINO,10/03/1932,Permanente,1


#### **Exercise 2: Selecting Columns with Methods**  
**Objective:** Use DataFrame methods to retrieve column information.  

##### **Task:**  
1. Print the list of all column names in the dataset.  
2. Retrieve only the numerical columns from the dataset.  
3. Identify the total number of columns.  

In [93]:
#1. Print the list of all column names in the dataset.
# Materialize the column Index as a plain Python list
list(visa_applications.columns)

['year_application',
 'nationality',
 'gender',
 'birth_date',
 'permanet_stay_intent',
 'number_of_application']

In [94]:
# 2. Retrieve only the numerical columns from the dataset.
# 'number' selects every numeric dtype
numerical_columns = visa_applications.select_dtypes(include='number')
numerical_columns.head()

Unnamed: 0,year_application,number_of_application
0,2017,2
1,2017,2
2,2017,1
3,2017,2
4,2017,1


In [95]:
# 3. Identify the total number of columns.
# The length of the column Index is the number of columns
total_columns = len(visa_applications.columns)
print('Total Columns:', total_columns)

Total Columns: 6


#### **Exercise 3: Reordering Columns**  
**Objective:** Change the order of DataFrame columns for better readability.  

##### **Task:**  
1. Reorder the dataset so that `number_of_application` appears first, followed by the rest of the columns.  
2. Move the `birth_date` column to be the last column.  
3. Swap the positions of `gender` and `nationality`.  


In [96]:
# 1. Reorder the dataset so that `number_of_application` appears first, followed by the rest of the columns.
# Build the order by column name instead of by position: the result does not
# depend on where `number_of_application` currently sits, and re-running the
# cell cannot drop or duplicate a column.
first_column = 'number_of_application'
columns = [first_column] + [col for col in visa_applications.columns if col != first_column]
visa_applications = visa_applications[columns]
visa_applications.head()

Unnamed: 0,number_of_application,year_application,nationality,gender,birth_date,permanet_stay_intent
0,2,2017,ECUATORIANA,FEMENINO,24/07/1897,Permanente
1,2,2017,FEDERACION DE RUSIA,FEMENINO,03/05/1919,Temporal
2,1,2017,FRANCESA,FEMENINO,20/08/1919,Temporal
3,2,2017,CUBANA,FEMENINO,03/02/1922,Temporal
4,1,2017,ESTADOUNIDENSE,FEMENINO,17/11/1922,Temporal


In [97]:
# 2. Move the `birth_date` column to be the last column.
columns = list(visa_applications.columns)
# Take birth_date out of its current slot and put it back at the end
columns.append(columns.pop(columns.index('birth_date')))
visa_applications = visa_applications[columns]
visa_applications.head()

Unnamed: 0,number_of_application,year_application,nationality,gender,permanet_stay_intent,birth_date
0,2,2017,ECUATORIANA,FEMENINO,Permanente,24/07/1897
1,2,2017,FEDERACION DE RUSIA,FEMENINO,Temporal,03/05/1919
2,1,2017,FRANCESA,FEMENINO,Temporal,20/08/1919
3,2,2017,CUBANA,FEMENINO,Temporal,03/02/1922
4,1,2017,ESTADOUNIDENSE,FEMENINO,Temporal,17/11/1922


In [98]:
# 3. Swap the positions of `gender` and `nationality`.
columns = visa_applications.columns.tolist()

# Look up both positions before modifying the list, then exchange the two
# entries in one step so the swap works whichever column comes first.
# NOTE: this is a true swap, so running the cell a second time swaps them back.
gender_pos = columns.index('gender')
nationality_pos = columns.index('nationality')
columns[gender_pos], columns[nationality_pos] = columns[nationality_pos], columns[gender_pos]

visa_applications = visa_applications[columns]
visa_applications.head()


Unnamed: 0,number_of_application,year_application,gender,nationality,permanet_stay_intent,birth_date
0,2,2017,FEMENINO,ECUATORIANA,Permanente,24/07/1897
1,2,2017,FEMENINO,FEDERACION DE RUSIA,Temporal,03/05/1919
2,1,2017,FEMENINO,FRANCESA,Temporal,20/08/1919
3,2,2017,FEMENINO,CUBANA,Temporal,03/02/1922
4,1,2017,FEMENINO,ESTADOUNIDENSE,Temporal,17/11/1922


#### **Exercise 4: Renaming Columns for Clarity**  
**Objective:** Modify column names to improve readability.  

##### **Task:**  
1. Convert all column names to lowercase (if not already).  
2. Rename:  
   - `permanent_stay_intent` → `visa_type`  
   - `number_of_application` → `total_applications`  
3. Replace underscores with spaces in column names.


In [99]:
# 1. Convert all column names to lowercase (if not already).
lowercase_names = visa_applications.columns.str.lower()
visa_applications.columns = lowercase_names

In [100]:
# 2. Rename:
#   - `permanent_stay_intent` → `visa_type`
#   - `number_of_application` → `total_applications`
rename_columns = {
    'permanet_stay_intent': 'visa_type',  # key uses the column's current spelling
    'number_of_application': 'total_applications',
}
visa_applications.rename(columns=rename_columns, inplace=True)

In [101]:
# 3. Replace underscores with spaces in column names.
# Plain-text (non-regex) replacement of every underscore
visa_applications.columns = visa_applications.columns.str.replace('_', ' ', regex=False)
visa_applications.head()

Unnamed: 0,total applications,year application,gender,nationality,visa type,birth date
0,2,2017,FEMENINO,ECUATORIANA,Permanente,24/07/1897
1,2,2017,FEMENINO,FEDERACION DE RUSIA,Temporal,03/05/1919
2,1,2017,FEMENINO,FRANCESA,Temporal,20/08/1919
3,2,2017,FEMENINO,CUBANA,Temporal,03/02/1922
4,1,2017,FEMENINO,ESTADOUNIDENSE,Temporal,17/11/1922


#### **Exercise 5: Filtering Based on Column Values**  
**Objective:** Extract specific rows based on column criteria.  

##### **Task:**  
1. Retrieve all applications where the `nationality` is `"Venezolana"`.  
2. Select all visa applications submitted after 2020.  
3. Extract records where `total_applications` (renamed from `number_of_application` in Exercise 4) is greater than 1000. 

In [102]:
# Put the underscores back in the column names (spaces → underscores) before filtering
visa_applications.columns = visa_applications.columns.str.replace(' ', '_', regex=False)

In [103]:
# 1. Retrieve all applications where the `nationality` is `"Venezolana"`.
# Nationality values appear in upper case in this dataset, so upper-case the target.
mask = visa_applications['nationality'] == 'Venezolana'.upper()
venezuelan_applications = visa_applications[mask]
# len() counts every matching row; DataFrame.value_counts() drops rows
# containing NaN by default and would leave them out of the total.
total_va = len(venezuelan_applications)
print('Total Venezuelan Applications:', total_va)
venezuelan_applications.head()

Total Venezuelan Applications: 69093


Unnamed: 0,total_applications,year_application,gender,nationality,visa_type,birth_date
8,1,2017,FEMENINO,VENEZOLANA,Temporal,30/09/1925
13,1,2017,FEMENINO,VENEZOLANA,Permanente,27/10/1927
16,1,2017,FEMENINO,VENEZOLANA,Permanente,20/01/1928
17,1,2017,FEMENINO,VENEZOLANA,Temporal,04/02/1928
19,1,2017,FEMENINO,VENEZOLANA,Temporal,28/04/1928


In [104]:
# Parse birth_date into datetime values, reading the day before the month
parsed_birth_dates = pd.to_datetime(visa_applications['birth_date'], dayfirst=True)
visa_applications['birth_date'] = parsed_birth_dates
visa_applications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349583 entries, 0 to 349582
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   total_applications  349583 non-null  int64         
 1   year_application    349583 non-null  int64         
 2   gender              349576 non-null  object        
 3   nationality         349583 non-null  object        
 4   visa_type           349583 non-null  object        
 5   birth_date          349583 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 16.0+ MB


In [105]:
# 2. Select all visa applications submitted after 2020.
mask = visa_applications['year_application'] > 2020
applications_after_2020 = visa_applications[mask]
# len() counts every matching row; DataFrame.value_counts() drops rows
# containing NaN by default and would leave them out of the total.
total_a20 = len(applications_after_2020)
print('Total Applications After 2020:', total_a20)
applications_after_2020.head()

Total Applications After 2020: 101129


Unnamed: 0,total_applications,year_application,gender,nationality,visa_type,birth_date
181811,1,2021,FEMENINO,ALEMANA,Temporal,1974-11-25
183470,2,2021,FEMENINO,UCRANIANA,Permanente,1977-02-20
184124,1,2021,FEMENINO,MEXICANA,Temporal,1982-09-05
185134,1,2021,FEMENINO,CUBANA,Temporal,2007-09-18
191131,2,2021,FEMENINO,ESPAÑOLA,Temporal,2007-09-19


In [106]:
# 3. Extract records where `total_applications` (the renamed `number_of_application`) is greater than 1000.
mask = visa_applications['total_applications'] > 1000
applications_grather_than_1000 = visa_applications[mask]
# len() counts every matching row; DataFrame.value_counts() drops rows
# containing NaN by default and would leave them out of the total.
total_gt1000 = len(applications_grather_than_1000)
print('Total Applications Greater Than 1000:', total_gt1000)
applications_grather_than_1000.head()

Total Applications Greater Than 1000: 0


Unnamed: 0,total_applications,year_application,gender,nationality,visa_type,birth_date


#### **Exercise 6: Dropping Unnecessary Columns**  
**Objective:** Remove columns that may not be relevant for analysis.  

##### **Task:**  
1. Drop the `birth_date` column.  
2. Remove columns that contain only categorical data.  
3. Delete any column that contains more than 50% missing values.  

In [107]:
# Independent (deep) copy of the DataFrame to apply the column drops to
visa_app_copy = visa_applications.copy(deep=True)

In [108]:
# 1. Drop the `birth_date` column.
# axis=1 targets columns; the copy is modified in place
visa_app_copy.drop('birth_date', axis=1, inplace=True)
visa_app_copy.head()

Unnamed: 0,total_applications,year_application,gender,nationality,visa_type
0,2,2017,FEMENINO,ECUATORIANA,Permanente
1,2,2017,FEMENINO,FEDERACION DE RUSIA,Temporal
2,1,2017,FEMENINO,FRANCESA,Temporal
3,2,2017,FEMENINO,CUBANA,Temporal
4,1,2017,FEMENINO,ESTADOUNIDENSE,Temporal


In [109]:
# 2. Remove columns that contain only categorical data.
# Categorical columns, grouped by what they describe
cat_demographics = ['nationality', 'gender']
cat_app_intent = ['visa_type']

# Merge both groups and drop them from the copy in place
cat_cols = [*cat_demographics, *cat_app_intent]
visa_app_copy.drop(cat_cols, axis=1, inplace=True)
visa_app_copy.head()

Unnamed: 0,total_applications,year_application
0,2,2017
1,2,2017
2,1,2017
3,2,2017
4,1,2017


In [115]:
# 3. Delete any column that contains more than 50% missing values.
# Fraction of missing values in each column (mean of the boolean NaN flags)
missing_share = visa_applications.isna().mean()
missing_share

total_applications    0.00000
year_application      0.00000
gender                0.00002
nationality           0.00000
visa_type             0.00000
birth_date            0.00000
dtype: float64

In [116]:
# Flag the columns whose share of missing values is above 0.5, then drop them in place
mask = visa_applications.isna().mean().gt(0.5)
columns_to_drop = visa_applications.columns[mask]
visa_applications.drop(columns_to_drop, axis=1, inplace=True)
visa_applications.head()

Unnamed: 0,total_applications,year_application,gender,nationality,visa_type,birth_date
0,2,2017,FEMENINO,ECUATORIANA,Permanente,1897-07-24
1,2,2017,FEMENINO,FEDERACION DE RUSIA,Temporal,1919-05-03
2,1,2017,FEMENINO,FRANCESA,Temporal,1919-08-20
3,2,2017,FEMENINO,CUBANA,Temporal,1922-02-03
4,1,2017,FEMENINO,ESTADOUNIDENSE,Temporal,1922-11-17
