In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the training file and the hidden test file, tagging each row with a
# `to_train` flag (1 = training row, 0 = test row) so the two sets can be told
# apart after they are concatenated.
df_train = pd.read_csv(
    "/kaggle/input/iitg-ai-recruitment-2025-beyond-the-box/atlantis_citizens_final.csv"
).assign(to_train=1)
df_test = pd.read_csv(
    "/kaggle/input/iitg-ai-recruitment-2025-beyond-the-box/test_atlantis_hidden.csv"
).assign(to_train=0)

In [3]:
# Count the missing values in each column of the training frame.
df_train.isna().sum()

Citizen_ID             0
Diet_Type              0
District_Name          0
Occupation             0
Wealth_Index        1055
House_Size_sq_ft    1197
Life_Expectancy      614
Vehicle_Owned          0
Work_District          0
Bio_Hash               0
to_train               0
dtype: int64

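To fill missing values, I grouped the combined train and test data by district name and vehicle owned, since these had the higher correlations with wealth index during EDA, and filled each missing value with the median of its group. If a group has no median (all of its values are missing), the district median is used instead, and finally the overall column median.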

In [4]:
def impute_strat(train_df, test_df, cols=None):
    """Fill missing numeric values with group medians, from specific to general.

    The two frames are stacked (train rows first) and the medians are computed
    over the stacked data, so test rows contribute to the values used for
    training rows and vice versa. For every column in ``cols`` the gaps are
    filled in three passes:

    1. median of the (District_Name, Vehicle_Owned) group,
    2. median of the District_Name group,
    3. median of the whole column.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing ``District_Name``, ``Vehicle_Owned`` and every column
        listed in ``cols``. Neither input is modified.
    cols : list of str, optional
        Columns to impute. Defaults to
        ``["Wealth_Index", "Life_Expectancy", "House_Size_sq_ft"]``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed copies of the train and test rows, each keeping its original
        index labels. Both carry the union of the two inputs' columns (a column
        present in only one input is NaN in the other output).
    """
    if cols is None:
        cols = ["Wealth_Index", "Life_Expectancy", "House_Size_sq_ft"]

    # Split by position afterwards instead of relying on a caller-provided
    # marker column such as "to_train".
    n_train = len(train_df)
    combined = pd.concat([train_df, test_df], axis=0)

    # Grouping keys ordered from the most specific to the least specific.
    fallback_keys = (["District_Name", "Vehicle_Owned"], "District_Name")

    for col in cols:
        for keys in fallback_keys:
            group_median = combined.groupby(keys)[col].transform("median")
            combined[col] = combined[col].fillna(group_median)

        # Last resort: anything still missing gets the column-wide median.
        combined[col] = combined[col].fillna(combined[col].median())

    # Return independent copies so later column assignments on the results
    # never act on a slice of `combined`.
    return combined.iloc[:n_train].copy(), combined.iloc[n_train:].copy()

# Impute both splits, keep the test IDs for the submission file, then rebuild
# a single combined frame without the Citizen_ID and Bio_Hash columns.
df_train, df_test = impute_strat(df_train, df_test)
citizen_id = df_test["Citizen_ID"]
df = pd.concat([df_train, df_test]).drop(columns=["Citizen_ID", "Bio_Hash"])

I dropped Bio Hash since it didn't have any patterns that could help in predicting occupation.

Label encoding all the categorical columns. This worked better than using target encoding or one hot encoding.

In [5]:
# Turn every object (string) column into a pandas categorical, add a parallel
# "<column>_code" column holding the integer category codes, and record the
# code -> label lookup for each encoded column.
object_columns = [name for name in df.columns if df[name].dtype == "object"]

mapping = {}
for name in object_columns:
    as_category = df[name].astype("category")
    df[name] = as_category
    df[name + "_code"] = as_category.cat.codes
    mapping[name] = {code: label for code, label in enumerate(as_category.cat.categories)}
mapping

{'Diet_Type': {0: 'Exotic Imports', 1: 'Seafood', 2: 'Seaweed'},
 'District_Name': {0: 'Coral Slums',
  1: 'Deep Trench',
  2: 'Mariana Plaza',
  3: 'The Golden Reef'},
 'Occupation': {0: 'Fisher',
  1: 'Merchant',
  2: 'Miner',
  3: 'Scribe',
  4: 'Warrior'},
 'Vehicle_Owned': {0: 'Fin Bicycle',
  1: 'No Vehicle',
  2: 'Royal Submarine',
  3: 'Sea Scooter',
  4: 'Submarine'},
 'Work_District': {0: 'Coral Slums',
  1: 'Deep Trench',
  2: 'Mariana Plaza',
  3: 'The Golden Reef'}}

In [6]:
# Separate the combined frame back into its two splits using the `to_train`
# flag. The flag is dropped from both; the test split also loses "Occupation".
train_rows = df["to_train"].eq(1)
test_rows = df["to_train"].eq(0)
df_train = df.loc[train_rows].drop(columns=["to_train"])
df_test = df.loc[test_rows].drop(columns=["to_train", "Occupation"])

In [None]:
# Explicit occupation -> integer label lookup used to build the training target.
# Note: this rebinds `mapping`, and the numbering differs from the alphabetical
# category codes stored in "Occupation_code".
mapping = dict(zip(["Warrior", "Merchant", "Fisher", "Miner", "Scribe"], range(5)))

target_columns = ["Occupation", "Occupation_code"]
x_train = df_train.drop(columns=target_columns)
y_train = df_train["Occupation"].map(mapping)

# Only the code column is dropped from the test features here.
x_test = df_test.drop(columns=["Occupation_code"])

Adding the feature `Wealth_Per_Age`: the wealth index divided by (life expectancy + 1).

In [None]:
# Ratio feature, added identically to both splits:
# wealth index divided by (life expectancy + 1).
for features in (x_train, x_test):
    features["Wealth_Per_Age"] = features["Wealth_Index"] / (features["Life_Expectancy"] + 1)

In [8]:
# List the feature columns currently present in the training matrix.
x_train.columns

Index(['Diet_Type', 'District_Name', 'Wealth_Index', 'House_Size_sq_ft',
       'Life_Expectancy', 'Vehicle_Owned', 'Work_District', 'Diet_Type_code',
       'District_Name_code', 'Vehicle_Owned_code', 'Work_District_code',
       'Wealth_Per_Age'],
      dtype='object')

Here I am using a logistic regression model to make 5 new columns corresponding to each occupation. Each column would have the probability of that row being that particular occupation which is predicted by this model. These would then be used as features to train the final model.

In [9]:
# First-stage model: a logistic regression on scaled numeric columns and
# one-hot encoded categorical columns. Its per-class probabilities are added
# to the feature frames as Linear_Prob_<i> columns.
code_columns = ['Diet_Type_code', 'District_Name_code', 'Vehicle_Owned_code', 'Work_District_code']
x_train_temp = x_train.drop(columns=code_columns)
# Give the test split the same column layout as the training split instead of
# passing the raw x_test frame to the fitted pipeline.
x_test_temp = x_test.drop(columns=code_columns)

linear_model = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', StandardScaler(), ['Wealth_Index', 'House_Size_sq_ft', 'Life_Expectancy']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ["Diet_Type", 'District_Name', 'Vehicle_Owned', "Work_District"])
    ])),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Training rows get out-of-fold probabilities, so no row is scored by a model
# that was fitted on that same row.
train_probs = cross_val_predict(linear_model, x_train_temp, y_train, cv=5, method='predict_proba')

# Test rows are scored by the pipeline refitted on all training rows.
linear_model.fit(x_train_temp, y_train)
test_probs = linear_model.predict_proba(x_test_temp)

# One probability column per class; take the count from the fitted model
# rather than hard-coding it.
n_classes = len(linear_model.classes_)
assert train_probs.shape[1] == n_classes == test_probs.shape[1], "class count mismatch"
for i in range(n_classes):
    x_train[f'Linear_Prob_{i}'] = train_probs[:, i]
    x_test[f'Linear_Prob_{i}'] = test_probs[:, i]

In [10]:
# Summarise the test feature frame (column dtypes and non-null counts).
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3938 entries, 0 to 3937
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Diet_Type           3938 non-null   category
 1   District_Name       3938 non-null   category
 2   Wealth_Index        3938 non-null   float64 
 3   House_Size_sq_ft    3938 non-null   float64 
 4   Life_Expectancy     3938 non-null   float64 
 5   Vehicle_Owned       3938 non-null   category
 6   Work_District       3938 non-null   category
 7   Diet_Type_code      3938 non-null   int8    
 8   District_Name_code  3938 non-null   int8    
 9   Vehicle_Owned_code  3938 non-null   int8    
 10  Work_District_code  3938 non-null   int8    
 11  Wealth_Per_Age      3938 non-null   float64 
 12  Linear_Prob_0       3938 non-null   float64 
 13  Linear_Prob_1       3938 non-null   float64 
 14  Linear_Prob_2       3938 non-null   float64 
 15  Linear_Prob_3       3938 non-null   float64

Scaling the numerical columns and preparing the dataframe. Missing values were handled earlier.

In [11]:
# Columns that get standardised. Every other numeric column in x_train is
# carried over unchanged.
numeric_features = ["Wealth_Index", "House_Size_sq_ft", "Life_Expectancy", "Wealth_Per_Age"]
numeric_x = x_train.select_dtypes(include="number")
others = [name for name in numeric_x.columns if name not in numeric_features]

# Fit the scaler on the training split only, then apply the same statistics
# to both splits.
scaler = StandardScaler()
scaler.fit(x_train[numeric_features])
train_scaled = scaler.transform(x_train[numeric_features])
test_scaled = scaler.transform(x_test[numeric_features])

# Wrap the scaled arrays back into frames that keep the original row index.
train_df = pd.DataFrame(train_scaled, index=x_train.index, columns=numeric_features)
test_df = pd.DataFrame(test_scaled, index=x_test.index, columns=numeric_features)

# Final model inputs: scaled columns first, then the untouched numeric columns.
numeric_x_train = pd.concat([train_df, x_train[others]], axis=1)
numeric_x_test = pd.concat([test_df, x_test[others]], axis=1)

I am also using a Logistic Regression model as the final model, since it scored better than random forest or XGBoost.

In [None]:
# Hyper-parameters of the final classifier, gathered in one place.
final_model_params = {
    "max_iter": 2000,
    "solver": "lbfgs",
    "C": 1.0,
    "random_state": 42,
    "n_jobs": -1,
}
model = LogisticRegression(**final_model_params)

# Optional 5-fold macro-F1 check (left disabled):
# scores = cross_val_score(model, numeric_x_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
# print("Cross-validation scores:", scores)
# print("Mean macro-F1:", np.mean(scores))


In [13]:
# Train the final classifier on all training rows and predict the test split.
model.fit(numeric_x_train, y_train)

# Give the predictions the test rows' own index. The submission frame is built
# from `citizen_id` and `y_pred` by index alignment, so a fresh 0..N-1 index
# would only line up when the test index happens to be 0..N-1 as well.
y_pred = pd.Series(model.predict(numeric_x_test), index=numeric_x_test.index)

In [14]:
# Assemble the submission frame (citizen ID plus predicted occupation label)
# and write it to disk without the index column.
submission_columns = {"Citizen_ID": citizen_id, "Occupation": y_pred}
df_pred = pd.DataFrame(submission_columns)
df_pred.to_csv("/kaggle/working/submission.csv", index=False)

In [15]:
# Display the submission frame.
df_pred

Unnamed: 0,Citizen_ID,Occupation
0,CIT_15383,3
1,CIT_14830,4
2,CIT_17388,0
3,CIT_17438,0
4,CIT_16735,2
...,...,...
3933,CIT_15659,4
3934,CIT_16061,2
3935,CIT_17913,1
3936,CIT_17666,3
