## Some Exploratory Data Analysis


In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier # GradientBoostingClassifier Or use XGBoost / LightGBM
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


# Directory holding the raw survey extracts (relative to the notebook).
DATA_DIR = 'data'

# File names of the individual module extracts.
edu_train_csv = 'module_Education_train_set.csv'
edu_test_csv = 'module_Education_test_set.csv'
house_train_csv = 'module_HouseholdInfo_train_set.csv'
house_test_csv = 'module_HouseholdInfo_test_set.csv'
pov_train_csv = 'module_SubjectivePoverty_train_set.csv'


def _read_module(file_name):
    """Read one CSV file from DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Training modules (features + subjective-poverty labels)
edu_train = _read_module(edu_train_csv)
house_train = _read_module(house_train_csv)
pov_train = _read_module(pov_train_csv)

# Test modules (features only)
edu_test = _read_module(edu_test_csv)
house_test = _read_module(house_test_csv)

# Summarise every raw module: schema / non-null counts first, then the first
# rows with all columns visible.
for banner, frame in (
    ("==============================edu train================================", edu_train),
    ("==============================house train================================", house_train),
    ("==============================edu test================================", edu_test),
    ("==============================house test================================", house_test),
):
    print(banner)
    # info() writes its report itself and returns None, so it is not wrapped in print().
    frame.info()
    with pd.option_context('display.max_columns', None):
        # head must be *called*; printing the bare attribute shows the bound method.
        print(frame.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22406 entries, 0 to 22405
Data columns (total 69 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   psu     22406 non-null  int64  
 1   hh      22406 non-null  int64  
 2   idcode  22406 non-null  int64  
 3   q01     22406 non-null  int64  
 4   q02     22406 non-null  int64  
 5   q03     22406 non-null  int64  
 6   q04     21513 non-null  float64
 7   q05     21513 non-null  float64
 8   q06     21513 non-null  float64
 9   q07     21513 non-null  float64
 10  Q08     21513 non-null  float64
 11  Q09     5565 non-null   float64
 12  Q10     39 non-null     float64
 13  Q11     15948 non-null  float64
 14  Q12     5526 non-null   float64
 15  Q13     5526 non-null   float64
 16  Q14     21513 non-null  float64
 17  Q15     5625 non-null   float64
 18  Q16     35 non-null     float64
 19  Q17     15888 non-null  float64
 20  Q18     15888 non-null  float64
 21  Q19     15923 non-null  float64
 22

In [2]:
# Sentinel written into cells that the questionnaire's skip logic marks as "not asked".
# It is read as a module-level global by apply_conditions().
filler_value = -999

# Skip rules for the education module, consumed by apply_conditions().
# Format: { column_name: { trigger: target } }
#   trigger - an answer value (numeric code or text label) or a range of codes
#   target  - the column to jump to (every column strictly between column_name
#             and target is filled), or 'skip_row' (every column to the right
#             of column_name is filled)
education_conditions = {
    'q03': {2: 'skip_row', 'NO': 'skip_row'},
    'Q08': {2: 'Q11', 'NO': 'Q11'},
    'Q09': {1: 'Q12', 'YES': 'Q12'},
    'Q10': {range(1, 14): 'Q14'},
    'Q11': {range(1, 15): 'Q14'},
    'Q14': {2: 'Q17', 'NO': 'Q17'},
    'Q15': {1: 'Q21', 'YES': 'Q21'},
    'Q16': {range(1, 14): 'Q20'},
    'Q19': {2: 'skip_row', 'NO': 'skip_row'},
    'Q20': {2: 'skip_row', 'NO': 'skip_row'},
    'Q24': {range(1, 6): 'Q26', 'ABROAD': 'Q33', 999: 'Q33'},
    'Q28': {range(1, 4): 'Q32', 'WALK': 'Q32', 'BICYCLE': 'Q32', 'ANIMAL': 'Q32'},
    'Q30': {2: 'Q32', 'NO': 'Q32'},
    'Q43': {1: 'Q45', 'YES': 'Q45'},
    'Q46': {2: 'Q50', 'NO': 'Q50'},
    'Q48': {4: 'Q50', 'STILL HAVE NOT RECEIVED THE SUBSIDY': 'Q50'},
    'Q50': {2: 'Q57', 'NO': 'Q57'},
    'Q54': {2: 'Q57', 'NO': 'Q57'},
    'Q57': {2: 'Q59', 'NO': 'Q59'},
    'Q59': {2: 'Q61', 'NO': 'Q61'},
    'Q61': {2: 'Q64', 'NO': 'Q64'},
    'Q64': {2: 'Q66', 'NO': 'Q66'},
}

# Skip rules for the household-info module, same format as above.
# The special trigger 'not_null' fires for any non-missing answer.
house_conditions = {
    'q05y': {range(12): 'q09'},
    'q06': {range(4,6): 'q09', 'WIDOW/ER': 'q09', 'SINGLE': 'q09'},
    'q07': {2: 'q09', 'NO': 'q09'},
    'q11': {2: 'q13', 'NO': 'q13'},
    'q12': {'not_null': 'q17'},
    'q14': {1: 'q16', 'YES': 'q16'},
    'q15': {'not_null': 'q17'},
    'q17': {2: 'q19', 'NO': 'q19'},
    'q18': {'not_null': 'skip_row'},
    'q20': {1: 'q22', 'YES': 'q22'},
    'q21': {'not_null': 'skip_row'},
}


# Apply the questionnaire skip rules from a conditions dictionary (used for both
# the education and the household module). Cells that were logically skipped
# get their own category: filler_value.
def apply_conditions(df, conditions, fill=None):
    """Mark logically skipped questionnaire cells with a filler value.

    Every rule is evaluated against the answers as they are when the function
    is called; only afterwards are the cells overwritten. A cell filled by one
    rule therefore never triggers another rule (for example a 'not_null' rule
    on a column that has just been filled), whatever the dtype layout of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey module. It is modified IN PLACE and also returned, so the
        returned object is the same object as the argument.
    conditions : dict
        ``{column_name: {trigger: target}}``.
        ``trigger`` is an answer value, a ``range`` of answer codes, or the
        string ``'not_null'`` (any non-missing answer).
        ``target`` is ``'skip_row'`` (fill every column to the right of
        ``column_name``) or the name of a column to jump to (fill the columns
        strictly between ``column_name`` and ``target``).
        Rules for columns missing from `df`, and rules whose target is not a
        column of `df`, are ignored.
    fill : scalar, optional
        Value written into skipped cells. Defaults to the module-level
        ``filler_value``.

    Returns
    -------
    pandas.DataFrame
        `df` itself, after filling.
    """
    if fill is None:
        fill = filler_value  # notebook-wide default sentinel

    n_columns = len(df.columns)

    # Pass 1: collect (row mask, first column, stop column) from the untouched data.
    pending = []
    for col_name, rules in conditions.items():
        if col_name not in df.columns:
            continue
        answers = df[col_name]
        first = df.columns.get_loc(col_name) + 1  # filling starts right of the question
        for trigger, target in rules.items():
            if target == 'skip_row':
                stop = n_columns
            elif isinstance(target, str) and target in df.columns:
                stop = df.columns.get_loc(target)  # the target column itself is kept
            else:
                continue  # unknown target: nothing to do
            if first >= stop:
                continue  # target is not to the right of the question: empty span

            if isinstance(trigger, range):
                hit = answers.isin(list(trigger))
            elif trigger == 'not_null':
                hit = answers.notna()
            else:
                hit = answers == trigger
            hit = hit.fillna(False).to_numpy(dtype=bool)
            if hit.any():
                pending.append((hit, first, stop))

    # Pass 2: write the filler. Every write uses the same value, so order is irrelevant.
    for hit, first, stop in pending:
        df.iloc[hit, first:stop] = fill
    return df

# Apply the conditions to the DataFrame
df_edu = apply_conditions(edu_train, education_conditions)
df_house = apply_conditions(house_train, house_conditions)
df_edu_test = apply_conditions(edu_test, education_conditions)
df_house_test = apply_conditions(house_test, house_conditions)

print("==============================edu train================================")
print(df_edu.info())
# Print showing all columns
with pd.option_context('display.max_columns', None):
    print(df_edu.head)

print("==============================house train================================")

print(df_house.info())
with pd.option_context('display.max_columns', None):
    print(df_house.head)

print("==============================edu test================================")
print(df_edu_test.info())
with pd.option_context('display.max_columns', None):
    print(df_edu_test.head)

print("==============================house test================================")

print(df_house_test.info())
with pd.option_context('display.max_columns', None):
    print(df_house_test.head)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22406 entries, 0 to 22405
Data columns (total 69 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   psu     22406 non-null  int64  
 1   hh      22406 non-null  int64  
 2   idcode  22406 non-null  int64  
 3   q01     22406 non-null  int64  
 4   q02     22406 non-null  int64  
 5   q03     22406 non-null  int64  
 6   q04     22406 non-null  float64
 7   q05     22406 non-null  float64
 8   q06     22406 non-null  float64
 9   q07     22406 non-null  float64
 10  Q08     22406 non-null  float64
 11  Q09     22406 non-null  float64
 12  Q10     22406 non-null  float64
 13  Q11     22406 non-null  float64
 14  Q12     22406 non-null  float64
 15  Q13     22406 non-null  float64
 16  Q14     22406 non-null  float64
 17  Q15     22406 non-null  float64
 18  Q16     22406 non-null  float64
 19  Q17     22406 non-null  float64
 20  Q18     22406 non-null  float64
 21  Q19     22406 non-null  float64
 22

In [3]:
# Person-level key parts shared by the education and household modules.
id_columns = ['psu', 'hh', 'idcode']


def _merge_modules(edu, house):
    """Outer-join education and household rows per person and add the 'psu_hh_idcode' key."""
    merged = pd.merge(edu, house, on=id_columns, how='outer', suffixes=('_edu', '_hh'))
    merged['psu_hh_idcode'] = (
        merged['psu'].astype(str) + '_' + merged['hh'].astype(str) + '_' + merged['idcode'].astype(str)
    )
    return merged


def _key_first(frame):
    """Drop the separate id parts and move 'psu_hh_idcode' to the first column."""
    frame = frame.drop(id_columns, axis=1)
    ordered = ['psu_hh_idcode'] + [col for col in frame.columns if col != 'psu_hh_idcode']
    return frame[ordered]


# Training set: skip-filled features joined to the subjective-poverty labels.
# how='right' keeps every labelled person, even one without feature rows
# (check inner later).
train_data = _merge_modules(df_edu, df_house)
train_data = pd.merge(train_data, pov_train, on='psu_hh_idcode', how='right')
train_data = _key_first(train_data)

# Test set: same feature construction, no labels.
test_data = _key_first(_merge_modules(df_edu_test, df_house_test))

# info() prints its report itself and returns None, so it is not wrapped in print().
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5337 entries, 0 to 5336
Data columns (total 100 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   psu_hh_idcode          5337 non-null   object 
 1   q01                    5334 non-null   float64
 2   q02_edu                5334 non-null   float64
 3   q03_edu                5334 non-null   float64
 4   q04_edu                5334 non-null   float64
 5   q05                    5334 non-null   float64
 6   q06_edu                5334 non-null   float64
 7   q07_edu                5334 non-null   float64
 8   Q08                    5334 non-null   float64
 9   Q09                    5334 non-null   float64
 10  Q10                    5334 non-null   float64
 11  Q11                    5334 non-null   float64
 12  Q12                    5334 non-null   float64
 13  Q13                    5334 non-null   float64
 14  Q14                    5334 non-null   float64
 15  Q15

In [4]:
# Convert the one-hot encoded label columns into a single target column:
# the level is the number in the name of the column holding the row maximum.
# NOTE: this cell replaces train_data (label columns dropped), so re-running it
# requires re-running the merge cell first.
target_columns = [f'subjective_poverty_{i}' for i in range(1, 11)]
train_data['target'] = (
    train_data[target_columns]
    .idxmax(axis=1)
    .str.extract(r'(\d+)', expand=False)  # expand=False -> Series rather than 1-column frame
    .astype(int)
)

# Drop the one-hot encoded columns as they are now redundant
train_data = train_data.drop(target_columns, axis=1)

# Feature-target split
X = train_data.drop(['psu_hh_idcode', 'target'], axis=1)
y = train_data['target'] - 1  # Adjust to 0-based index for most classifiers

# Print the transformed data for verification
print(X.head())
print(y.head())

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# HistGradientBoostingClassifier handles NaNs natively; seeded for reproducibility.
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Validation log loss. `labels` pins the column order of predict_proba to the
# classes the model was actually trained on.
y_val_pred = model.predict_proba(X_val)
val_log_loss = log_loss(y_val, y_val_pred, labels=model.classes_)
print(f'Validation log loss: {val_log_loss:.4f}')

# Predict on test data, with the features selected in the training column order.
test_preds = model.predict_proba(test_data[X.columns])

# Prepare submission. predict_proba has one column per entry of model.classes_
# (0-based levels); name the columns from that, then expand to all ten levels so
# a level that never occurred in training gets probability 0 instead of
# shifting the other columns.
class_columns = [f'subjective_poverty_{label + 1}' for label in model.classes_]
submission = (
    pd.DataFrame(test_preds, columns=class_columns)
    .reindex(columns=target_columns, fill_value=0.0)
)
submission.insert(0, 'psu_hh_idcode', test_data['psu_hh_idcode'].to_numpy())
submission.to_csv('submission.csv', index=False)

   q01  q02_edu  q03_edu  q04_edu  q05  q06_edu  q07_edu  Q08    Q09    Q10  \
0  1.0      1.0      1.0      1.0  8.0      2.0      1.0  2.0 -999.0 -999.0   
1  1.0      1.0      1.0      1.0  8.0      2.0      0.0  2.0 -999.0 -999.0   
2  1.0      1.0      1.0      1.0  8.0      2.0      0.0  2.0 -999.0 -999.0   
3  1.0      1.0      1.0      1.0  8.0      2.0      0.0  2.0 -999.0 -999.0   
4  1.0      1.0      1.0      1.0  4.0      1.0      0.0  2.0 -999.0 -999.0   

   ...  q13  q14    q15    q16  q17    q18  q19  q20    q21    q22  
0  ...  2.0  1.0 -999.0   66.0  2.0 -999.0  3.0  1.0 -999.0   63.0  
1  ...  1.0  1.0 -999.0   67.0  2.0 -999.0  1.0  1.0 -999.0   75.0  
2  ...  1.0  2.0   86.0 -999.0  2.0 -999.0  1.0  2.0   74.0 -999.0  
3  ...  2.0  2.0   81.0 -999.0  2.0 -999.0  2.0  2.0   77.0 -999.0  
4  ...  1.0  2.0   80.0 -999.0  2.0 -999.0  1.0  2.0   72.0 -999.0  

[5 rows x 89 columns]
0    3
1    0
2    2
3    4
4    3
Name: target, dtype: int64


In [None]:
# Per-column count of missing values in the merged training frame
missing_train = train_data.isna().sum()
print("Missing values in training data:\n", missing_train)

# ... and in the merged test frame
missing_test = test_data.isna().sum()
print("Missing values in testing data:\n", missing_test)

In [None]:
# Code -> label dictionaries for the survey answers.
# Many questions share the same answer scale; those tables are built from one
# template, and every mapping is its own dict object so that editing one never
# changes another.


def _yes_no():
    """Return a fresh {1: 'Yes', 2: 'No'} mapping."""
    return {1: 'Yes', 2: 'No'}


# ---------------------------------------------------------------------------
# Household info module
# ---------------------------------------------------------------------------

# Sex
sex_mapping = {
    1: 'Male',
    2: 'Female'
}

# Relationship to Head
relationship_to_head_mapping = {
    1: 'Head',
    2: 'Spouse',
    3: 'Partner',
    4: 'Child/Adopted Child',
    5: 'Grandchild',
    6: 'Niece/Nephew',
    7: 'Father/Mother',
    8: 'Sister/Brother',
    9: 'Son/Daughter-in-Law',
    10: 'Brother/Sister-in-Law',
    11: 'Grandfather/Mother',
    12: 'Father/Mother-in-Law',
    13: 'Other Relative',
    14: 'Not Related'
}

# Present Marital Status
marital_status_mapping = {
    1: 'Married',
    2: 'Divorced/Separated',
    3: 'Living Together',
    4: 'Widow/er',
    5: 'Single'
}

# Spouse/Partner in Household
spouse_in_household_mapping = _yes_no()

# Household Member Present
household_member_present_mapping = _yes_no()

# Natural Mother in Household
natural_mother_in_household_mapping = _yes_no()

# Highest Educational Level of Mother
mother_education_mapping = {
    1: 'None, or Some Primary',
    2: 'Completed Primary 4/5 Years',
    3: 'Completed Primary 7/8/9 Years',
    4: 'Some Secondary General',
    5: 'Completed Secondary',
    6: 'Some Vocational School',
    7: 'Completed Vocational School',
    8: 'Some University',
    9: 'Completed University Degree',
    10: 'Post-University',
    'DK': "Don't Know"
}

# Mother Still Living
mother_still_living_mapping = _yes_no()

# Natural Father in Household
natural_father_in_household_mapping = _yes_no()

# Highest Educational Level of Father (same scale as for the mother)
father_education_mapping = dict(mother_education_mapping)

# Father Still Living
father_still_living_mapping = _yes_no()

# ---------------------------------------------------------------------------
# Education module
# ---------------------------------------------------------------------------

# Can [Name] read the newspaper?
reading_ability_mapping = {
    1: 'Yes, Easily',
    2: 'Yes, With Difficulty',
    3: 'No'
}

# Can [Name] write a one-page personal letter? (same scale as reading)
writing_ability_mapping = dict(reading_ability_mapping)

# Has [Name] ever attended school?
attended_school_mapping = _yes_no()

# What is the highest grade [Name] has completed in school? In which level?
# Mapping for the education level
education_level_mapping = {
    0: 'None',
    1: '"8 OR 9 YEARS" School',
    2: 'Gymnazium (Secondary General)',
    3: 'Technicum < 2 Years',
    4: 'Vocational 2-3 Years',
    5: 'Vocational 4/5 Years',
    6: 'University - Albania',
    7: 'University - Abroad',
    8: 'Master - Albania',
    9: 'Master - Abroad',
    10: 'Doctorate/PhD - Albania',
    11: 'Doctorate/PhD - Abroad'
}

# Mapping for the grade range within each level
education_grade_mapping = {
    1: '1-9',
    2: '1-4',
    3: '1-2',
    4: '1-3',
    5: '1-5',
    6: '1-6'
}

# What is the highest diploma [Name] has attained?
highest_diploma_mapping = {
    0: 'None',
    1: 'Primary 4 Years',
    2: 'Primary 8/9 Years',
    3: 'Gymnazium (Secondary General)',
    4: 'Technicum < 2 Years',
    5: 'Vocational 2-3 Years',
    6: 'Vocational 4/5 Years',
    7: 'Tertiary (BA)',
    8: 'Tertiary (BA/MA)',
    9: 'Tertiary (Old System before Bologna)',
    10: 'Post-Graduate/Master',
    11: 'Doctorate/PhD'
}

# Did [Name] enroll or plan to enroll in school this academic year?
enrollment_plan_mapping = _yes_no()

# Is [Name] going to attend school this academic year?
school_attendance_mapping = _yes_no()

# Why is [Name] not attending school?
non_attendance_reason_mapping = {
    1: 'Too Expensive',
    2: 'No Interest',
    3: 'Agricultural Work',
    4: 'Other Work',
    5: 'School Too Far',
    6: 'Poor Teaching',
    7: 'Poor Facilities',
    8: 'Own Illness',
    9: 'Family Illness/Death',
    10: 'Moved',
    11: 'Safety',
    12: 'Got Married',
    13: 'Other (Specify)'
}

# Why didn't [Name] enroll in school this year?
non_enrollment_reason_mapping = {
    1: 'Too Expensive',
    2: 'No Interest',
    3: 'Agricultural Work',
    4: 'Other Work',
    5: 'School Too Far',
    6: 'Poor Teaching',
    7: 'Poor Facilities',
    8: 'Own Illness',
    9: 'Family Illness/Death',
    10: 'Moved',
    11: 'Safety',
    12: 'Got Married',
    13: 'Completed Studies',
    14: 'Other (Specify)'
}

# In what grade is [Name] currently enrolled? In which level?
# Mapping for the education level
current_education_level_mapping = {
    1: '"8 OR 9 YEARS" School',
    2: 'Technicum < 2 Years',
    3: 'Gymnazium (Secondary General)',
    4: 'Vocational 2-3 Years',
    5: 'Vocational 4/5 Years',
    6: 'Tertiary (BA)',
    7: 'Tertiary (BA/MA)',
    8: 'Tertiary (Old System before Bologna)',
    9: 'Post-Graduate/Master',
    10: 'Doctorate/PhD'
}
# Mapping for the grade range within each level
current_education_grade_mapping = {
    1: '1-9',
    2: '1-2',
    3: '1-4',
    4: '1-3',
    5: '1-5',
    6: '1-3',
    7: '1-2',
    8: '1-6',
    9: '1-5'
}

# Did [NAME] enroll in the past academic year?
past_year_school_enrollment_mapping = _yes_no()

# Did [Name] attend school in the past academic year?
past_year_school_attendance_mapping = _yes_no()

# Why didn't [Name] attend school in the past academic year?
# (same answer list as for the current year)
past_year_non_attendance_reason_mapping = dict(non_attendance_reason_mapping)

# Why didn't [Name] enroll in school in the past academic year?
# (same answer list as for the current year)
past_year_non_enrollment_reason_mapping = dict(non_enrollment_reason_mapping)

# IS [NAME] 19 YEARS OLD OR LESS?
age_19_mapping = _yes_no()

# Does [Name] intend to return to school?
intends_to_return_to_school_mapping = _yes_no()

# Is the school that [Name] attends public or private?
school_type_mapping = {
    1: 'Public',
    2: 'Private - Religious',
    3: 'Private - Non Religious'
}

# Does [Name] usually stay in another location closer to your school during the school term?
stay_another_location_during_term_mapping = _yes_no()

# How does [Name] generally go to school?
transportation_mode_mapping = {
    1: 'Walk',
    2: 'Bicycle',
    3: 'Animal',
    4: 'Car',
    5: 'Bus/Minibus',
    6: 'Train',
    7: 'Other (Specify)'
}

# Has [Name] received a transportation subsidy in this academic year?
transportation_subsidy_mapping = _yes_no()

# Did [Name]'s household buy supplementary textbooks for the previous academic year?
supplementary_textbooks_mapping = _yes_no()

# Where does [Name] mainly purchase the required textbooks?
textbook_purchase_location_mapping = {
    1: 'Bookstore',
    2: 'School',
    3: 'Other (Specify)'
}

# Did the source have the full list of required books by the first week of the previous academic year?
full_textbook_list_availability_mapping = _yes_no()

# How long did it take until all required books were available?
textbook_availability_time_mapping = {
    1: 'Less Than Two Weeks',
    2: 'Two Weeks to 1 Month',
    3: '1 Month to 2 Months',
    4: 'More Than Two Months'
}

# Has [Name] heard of the textbook subsidies program in Albania?
textbook_subsidies_program_awareness_mapping = _yes_no()

# Did [Name] claim the subsidy rebate for school textbooks?
claimed_textbook_subsidy_mapping = _yes_no()

# How long did it take for [Name] to receive the subsidy rebate?
subsidy_rebate_time_mapping = {
    1: 'Less Than 1 Month',
    2: '1 to 3 Months',
    3: 'More Than 3 Months',
    4: 'Still Have Not Received the Subsidy'
}

# Has [Name] received any private tutoring during this academic year?
private_tutoring_mapping = _yes_no()

# Who is tutoring [Name]?
tutor_type_mapping = {
    1: 'Own Teacher',
    2: 'Other Teacher in School',
    3: 'Other Tutor',
    4: 'Friend/Relative'
}

# How often has [Name] been receiving tutoring per month?
tutoring_frequency_mapping = {
    1: 'Daily',
    2: 'Several Times a Week',
    3: 'At Least Once a Week',
    4: 'Once Every 2 Weeks',
    5: 'Once a Month',
    6: 'Have Not Received in the Past Month'
}

# Is [Name] paying for the tutoring?
paying_for_tutoring_mapping = _yes_no()

# Has [Name]'s household provided money or gifts in kind to the school or teachers in the previous academic year?
money_or_gifts_to_school_mapping = _yes_no()

# Has [Name] been absent from school in the past 4 weeks?
absent_from_school_mapping = _yes_no()

# Why did [Name] miss school?
absence_reason_mapping = {
    1: 'Bad Weather',
    2: 'No Interest',
    3: 'Agricultural Work',
    4: 'Other Work',
    5: 'School Too Far',
    6: 'Poor Teaching',
    7: 'Poor Facilities',
    8: 'Own Illness',
    9: 'Family Illness/Death',
    10: 'Moved',
    11: 'Safety',
    12: 'Other (Specify)'
}

# Is [Name] currently receiving a scholarship or subsidy to support education?
receiving_scholarship_mapping = _yes_no()

# How much time per day has [Name] spent on homework over the past 4 weeks?
homework_time_mapping = {
    1: 'None',
    2: 'Less Than 30 Minutes',
    3: '30 Minutes to 1 Hour',
    4: '1-2 Hours',
    5: '2-3 Hours',
    6: '3-4 Hours',
    7: 'More Than 4 Hours'
}

### Let's start with Subjective Poverty first


In [None]:
print(pov_train.shape)

# Count respondents per level from the ten one-hot label columns, selected by
# name so that extra columns added to pov_train later are never summed in.
poverty_levels = list(range(1, 11))
level_counts = pov_train[[f'subjective_poverty_{level}' for level in poverty_levels]].sum(axis=0)

# a very normal like distribution!
# x carries the real level numbers (1-10), so the ticks are labelled correctly.
ax = sns.barplot(x=poverty_levels, y=level_counts.to_numpy())
ax.set_xlabel('Subjective poverty level')
ax.set_ylabel('Number of people')
plt.title('Number of people in each subjective poverty level')
plt.show()

In [None]:
# Split the composite key back into its three integer parts.
# NOTE: this adds columns to pov_train in place.
pov_train[['psu', 'hh', 'idcode']] = pov_train['psu_hh_idcode'].str.split('_', expand=True).astype(int)

# plot the distribution of psu, hh, and idcode
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(pov_train['psu'], ax=ax[0])
ax[0].set_title('Distribution of psu')
# Keep the values as integers so they match the integer `order` lists, and pass
# them as x= explicitly rather than positionally.
sns.countplot(x=pov_train['hh'], ax=ax[1], order=list(range(1, 17)))
ax[1].set_title('Distribution of hh')
sns.countplot(x=pov_train['idcode'], ax=ax[2], order=list(range(1, 9)))
ax[2].set_title('Distribution of idcode')
plt.show()

In [None]:
# Collapse the ten one-hot label columns (positions 1-10) into one integer
# level in the range 1-10: take the name of the column holding the row maximum
# and keep the number after its last underscore.
# (Using the classes themselves gave much the same correlation matrix.)
one_hot_levels = pov_train.iloc[:, 1:11]
winning_column = one_hot_levels.idxmax(axis=1)
pov_train['num_pov'] = winning_column.str.rsplit('_', n=1).str[-1].astype(int)

# Pairwise correlation between the key parts and the poverty level
corr = pov_train[['psu', 'hh', 'idcode', 'num_pov']].corr()

# Annotated heatmap of that matrix
plt.figure(figsize=(4, 3))
sns.heatmap(data=corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix for pov_train')
plt.show()

### Then Education data


In [None]:
# there are a lot of nulls in the data
# we might as well just take the columns that have less than 50% nulls
# since we also have too many features to work with
#
# NOTE: edu_train was filled in place with filler_value when the skip rules
# were applied, so its null counts no longer describe the raw survey. The raw
# file is read again here so the analysis does not depend on that hidden state.
edu_train_raw = pd.read_csv(os.path.join(DATA_DIR, edu_train_csv))
null_fraction = edu_train_raw.isnull().mean(axis=0)  # share of missing rows per column

sns.histplot(null_fraction)
plt.xlabel('Fraction of null values per column')
plt.title('Percentage of nulls in edu_train')
plt.show()

In [None]:
# they look pretty random to me, except for q4 and q6
# Keep only columns whose share of missing values is below `threshold`.
threshold = 0.2

# NOTE: edu_train was filled in place with filler_value when the skip rules
# were applied, so every column would pass the threshold and the sentinel
# would distort every panel. Use the raw file for both the selection and the plot.
edu_train_raw = pd.read_csv(os.path.join(DATA_DIR, edu_train_csv))
null_fraction = edu_train_raw.isnull().mean(axis=0)
valid_columns = null_fraction[null_fraction < threshold].index.tolist()
sns.pairplot(edu_train_raw[valid_columns])