# Nationality Detection Model

**Import the Libraries**

In [15]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings("ignore")

import cv2
from PIL import Image

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
print("DONE")


DONE


**Locate and read the CSV files**

In [16]:
# Walk the Kaggle input tree and print the full path of every CSV file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for csv_name in (n for n in names if n.endswith('.csv')):
        print(os.path.join(root, csv_name))


/kaggle/input/fairface/fairface/fairface_label_val.csv
/kaggle/input/fairface/fairface/fairface_label_train.csv


In [18]:
# Locations of the train / validation label tables (found by the CSV listing above)
train_csv = '/kaggle/input/fairface/fairface/fairface_label_train.csv'
val_csv = '/kaggle/input/fairface/fairface/fairface_label_val.csv'

# Read both tables, train first and validation second
df_train, df_val = (pd.read_csv(csv_path) for csv_path in (train_csv, val_csv))

# Peek at the first rows of the training labels
print(df_train.head())


          file    age  gender        race  service_test
0  train/1.jpg  50-59    Male  East Asian          True
1  train/2.jpg  30-39  Female      Indian         False
2  train/3.jpg    3-9  Female       Black         False
3  train/4.jpg  20-29  Female      Indian          True
4  train/5.jpg  20-29  Female      Indian          True


**Check Dataset Health and Summary**

In [20]:
# How many samples exist for each race label (the prediction target)
print("Race Label Distribution in Training Data:", df_train['race'].value_counts(), sep="\n")

# Number of missing values in each column
print("\n Missing Values in Training Data:", df_train.isna().sum(), sep="\n")

# Total number of rows in the training table
print("\n Total Training Samples:", df_train.shape[0])


Race Label Distribution in Training Data:
race
White              16527
Latino_Hispanic    13367
Indian             12319
East Asian         12287
Black              12233
Southeast Asian    10795
Middle Eastern      9216
Name: count, dtype: int64

 Missing Values in Training Data:
file            0
age             0
gender          0
race            0
service_test    0
dtype: int64

 Total Training Samples: 86744


In [22]:
# Keep only the rows whose service_test flag is False, then renumber the index from 0
df_train = df_train.loc[df_train['service_test'].eq(False)].reset_index(drop=True)
df_val = df_val.loc[df_val['service_test'].eq(False)].reset_index(drop=True)



**Convert the Race column to numeric labels for the model**

In [23]:
# Learn the class list from the training races only; validation reuses the same mapping
label_encoder = LabelEncoder()
label_encoder.fit(df_train['race'])
df_train['race_label'] = label_encoder.transform(df_train['race'])
df_val['race_label'] = label_encoder.transform(df_val['race'])

# Build {class name: integer code} so the mapping can be inspected
race_classes = {
    name: code
    for name, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}
print("Encoded Race Classes:", race_classes, sep="\n")


Encoded Race Classes:
{'Black': 0, 'East Asian': 1, 'Indian': 2, 'Latino_Hispanic': 3, 'Middle Eastern': 4, 'Southeast Asian': 5, 'White': 6}


Load and Pair Images with Labels

In [28]:
# Folders that hold the FairFace train / validation images
train_img_dir = "/kaggle/input/fairface/fairface/train"
val_img_dir = "/kaggle/input/fairface/fairface/val"

# The CSV 'file' values look like 'train/2.jpg'; keep only the file name and
# join it onto the matching image folder to get a full path
df_train['img_path'] = [
    os.path.join(train_img_dir, os.path.basename(rel_path)) for rel_path in df_train['file']
]
df_val['img_path'] = [
    os.path.join(val_img_dir, os.path.basename(rel_path)) for rel_path in df_val['file']
]

# Spot-check the first few generated paths
df_train[['file', 'img_path']].head()


Unnamed: 0,file,img_path
0,train/2.jpg,/kaggle/input/fairface/fairface/train/2.jpg
1,train/3.jpg,/kaggle/input/fairface/fairface/train/3.jpg
2,train/7.jpg,/kaggle/input/fairface/fairface/train/7.jpg
3,train/10.jpg,/kaggle/input/fairface/fairface/train/10.jpg
4,train/12.jpg,/kaggle/input/fairface/fairface/train/12.jpg


Convert categorical values into numeric values

In [29]:
# Encode the race strings as integer class ids (stored in 'race_encoded')

label_encoder = LabelEncoder()

# The encoder is fitted on the training races; validation is transformed with that same fit
df_train['race_encoded'] = label_encoder.fit_transform(df_train['race'])
df_val['race_encoded'] = label_encoder.transform(df_val['race'])

# Print the id -> class table, one line per class
print("Race Label Mapping:")
for idx in range(len(label_encoder.classes_)):
    print(f"{idx}: {label_encoder.classes_[idx]}")

# Show a few rows with the original and the encoded label side by side
print("\nSample Encoded Labels:", df_train[['race', 'race_encoded']].head(), sep="\n")


Race Label Mapping:
0: Black
1: East Asian
2: Indian
3: Latino_Hispanic
4: Middle Eastern
5: Southeast Asian
6: White

Sample Encoded Labels:
             race  race_encoded
0          Indian             2
1           Black             0
2  Middle Eastern             4
3  Middle Eastern             4
4      East Asian             1


In [27]:
import os

# Print every directory under the dataset root to reveal its folder layout
for folder, _subdirs, _files in os.walk("/kaggle/input/fairface"):
    print(folder)


/kaggle/input/fairface
/kaggle/input/fairface/fairface
/kaggle/input/fairface/fairface/val
/kaggle/input/fairface/fairface/train


In [31]:
import os

# Preview at most ten entries of the train image folder
for entry in os.listdir("/kaggle/input/fairface/fairface/train")[0:10]:
    print(entry)

64601.jpg
31973.jpg
30778.jpg
19812.jpg
22735.jpg
38246.jpg
44504.jpg
16916.jpg
52876.jpg
74055.jpg


In [33]:
# Image folders for the two splits
train_img_dir = "/kaggle/input/fairface/fairface/train"
val_img_dir = "/kaggle/input/fairface/fairface/val"

# Strip any folder prefix from 'file', then prepend the split's image folder
df_train['img_path'] = (
    df_train['file']
    .map(os.path.basename)
    .map(lambda name: os.path.join(train_img_dir, name))
)
df_val['img_path'] = (
    df_val['file']
    .map(os.path.basename)
    .map(lambda name: os.path.join(val_img_dir, name))
)

# Display the first rows to confirm the paths look right
df_train[['file', 'img_path']].head()


Unnamed: 0,file,img_path
0,train/2.jpg,/kaggle/input/fairface/fairface/train/2.jpg
1,train/3.jpg,/kaggle/input/fairface/fairface/train/3.jpg
2,train/7.jpg,/kaggle/input/fairface/fairface/train/7.jpg
3,train/10.jpg,/kaggle/input/fairface/fairface/train/10.jpg
4,train/12.jpg,/kaggle/input/fairface/fairface/train/12.jpg


In [34]:
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

# 📐 Resize target: edge length in pixels. It is passed to load_data_clean as
# img_size, where every image is resized to (IMG_SIZE, IMG_SIZE).
IMG_SIZE = 64

# 🧹 Load and preprocess images
def load_data_clean(df, img_size, num_classes=7):
    """Load, resize and normalize the images listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``img_path`` column (file path of each image) and a
        ``race_encoded`` column (integer class id of each image).
    img_size : int
        Every image is resized to ``(img_size, img_size)``.
    num_classes : int, default 7
        Width of the one-hot label matrix passed to ``to_categorical``.

    Returns
    -------
    X : numpy.ndarray
        The successfully loaded images stacked along a new first axis, with
        pixel values divided by 255.0.
    y : numpy.ndarray
        One-hot labels for exactly the images kept in ``X``.

    Notes
    -----
    An image is skipped (and counted) when ``cv2.imread`` returns ``None`` or
    when reading/resizing raises an ordinary ``Exception``. Unlike a bare
    ``except:``, ``KeyboardInterrupt`` and ``SystemExit`` still propagate, so
    a long load can be interrupted.
    Images are used exactly as ``cv2.imread`` returns them; no channel
    reordering is done here.
    """
    images = []
    labels = []
    skipped = 0

    # Iterate the two columns directly; df.iloc[i][col] would build a whole
    # row Series twice for every image.
    for img_path, label in zip(df['img_path'], df['race_encoded']):
        try:
            img = cv2.imread(img_path)
            if img is None:
                # File missing or not decodable as an image
                skipped += 1
                continue
            img = cv2.resize(img, (img_size, img_size))
            img = img / 255.0  # normalize to [0,1]
        except Exception:
            # Best-effort load: count the failure and move on
            skipped += 1
            continue

        # Append image and label together so X and y stay aligned
        images.append(img)
        labels.append(label)

    X = np.array(images)
    y = to_categorical(np.array(labels), num_classes=num_classes)

    print(f"✅ Loaded: {len(images)} images | ❌ Skipped: {skipped} images")
    return X, y

# Build the train / validation arrays with the shared loader
X_train, y_train = load_data_clean(df_train, IMG_SIZE)
X_val, y_val = load_data_clean(df_val, IMG_SIZE)

# Report the shape of each resulting array
for caption, arr in (
    ("✅ X_train shape:", X_train),
    ("✅ y_train shape:", y_train),
    ("✅ X_val shape:", X_val),
    ("✅ y_val shape:", y_val),
):
    print(caption, arr.shape)


✅ Loaded: 46492 images | ❌ Skipped: 0 images
✅ Loaded: 5792 images | ❌ Skipped: 0 images
✅ X_train shape: (46492, 64, 64, 3)
✅ y_train shape: (46492, 7)
✅ X_val shape: (5792, 64, 64, 3)
✅ y_val shape: (5792, 7)
