In [1]:
import pandas as pd
import seaborn as sns
import glob
import matplotlib.pyplot as plt
import cv2 
import re

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
imagepath = glob.glob('./AP_crop_data/face_age/*/*.png')

In [4]:
imagepath[0]

'./AP_crop_data\\10_14.jpg'

In [5]:
print('The number of images in AP_crop_data folder = ',len(imagepath))

The number of images in AP_crop_data folder =  6025


In [6]:

df = pd.DataFrame(imagepath, columns=['filepath'])

# Function to extract age from file path
def extract_age(filepath):
    match = re.search(r'./AP_crop_data\\face_age\\(-?\d+)/.*\.png', filepath)
    if match:
        return int(match.group(1))
    return None

# Apply the function to the DataFrame
df['age'] = df['filepath'].apply(extract_age)

print(df)


                         filepath  age
0        ./AP_crop_data\10_14.jpg   10
1        ./AP_crop_data\10_18.jpg   10
2        ./AP_crop_data\10_20.jpg   10
3         ./AP_crop_data\10_6.jpg   10
4       ./AP_crop_data\110_30.jpg  110
...                           ...  ...
6020  ./AP_crop_data\99_16224.jpg   99
6021   ./AP_crop_data\9_16232.jpg    9
6022   ./AP_crop_data\9_16233.jpg    9
6023   ./AP_crop_data\9_16235.jpg    9
6024   ./AP_crop_data\9_16239.jpg    9

[6025 rows x 2 columns]


In [7]:
def get_size(path):
    img = cv2.imread(path)
    return img.shape[0]

In [8]:
df['dimension'] = df['filepath'].apply(get_size) # store dimension of image in this columns

In [9]:
df.head()

Unnamed: 0,filepath,age,dimension
0,./AP_crop_data\10_14.jpg,10,182
1,./AP_crop_data\10_18.jpg,10,122
2,./AP_crop_data\10_20.jpg,10,122
3,./AP_crop_data\10_6.jpg,10,122
4,./AP_crop_data\110_30.jpg,110,273


In [10]:
##dist_expression = df['age'].value_counts()
##dist_expression

In [11]:
##plt.figure(figsize=(12,6))
##plt.subplot(2,1,1)
##sns.histplot(df['dimension'])
##plt.subplot(2,1,2)
##sns.boxplot(df['dimension'])
##plt.show()

In [12]:
df_filter = df.query('dimension > 60')
df_filter.shape

(5930, 3)

In [13]:
df_filter['age'].value_counts(normalize=True)

age
26     0.125126
28     0.057336
25     0.045025
45     0.041484
24     0.039629
         ...   
5      0.000337
110    0.000169
11     0.000169
93     0.000169
111    0.000169
Name: proportion, Length: 89, dtype: float64

In [14]:
df_filter.head()

Unnamed: 0,filepath,age,dimension
0,./AP_crop_data\10_14.jpg,10,182
1,./AP_crop_data\10_18.jpg,10,122
2,./AP_crop_data\10_20.jpg,10,122
3,./AP_crop_data\10_6.jpg,10,122
4,./AP_crop_data\110_30.jpg,110,273


In [15]:
def structuring(path):
    try:

        # step - 1: read image
        img = cv2.imread(path) # BGR
        # step - 2: convert into grayscale
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
        # step -3: resize into 100 x 100 array   

        size = gray.shape[0]
        if size >= 100:
            # cv2.INTER_AREA (SHINK)
            gray_resize = cv2.resize(gray,(100,100),cv2.INTER_AREA)
        else:
            # cv2.INTER_CUBIC (ENLARGE)
            gray_resize = cv2.resize(gray,(100,100),cv2.INTER_CUBIC)

        # step -4: Flatten Image (1x10,000)
        flatten_image = gray_resize.flatten()
        return flatten_image
    
    except:
        return None

In [16]:
df_filter['data'] = df_filter['filepath'].apply(structuring) # convert all images into 100 x 100
df_filter.head()

Unnamed: 0,filepath,age,dimension,data
0,./AP_crop_data\10_14.jpg,10,182,"[32, 38, 42, 34, 31, 25, 18, 24, 27, 32, 43, 5..."
1,./AP_crop_data\10_18.jpg,10,122,"[242, 216, 166, 103, 45, 17, 37, 48, 60, 81, 1..."
2,./AP_crop_data\10_20.jpg,10,122,"[71, 89, 106, 123, 113, 87, 79, 78, 79, 81, 81..."
3,./AP_crop_data\10_6.jpg,10,122,"[65, 69, 70, 71, 69, 69, 71, 69, 66, 62, 63, 6..."
4,./AP_crop_data\110_30.jpg,110,273,"[14, 16, 20, 21, 24, 20, 13, 12, 18, 20, 23, 2..."


In [17]:
df_filter

Unnamed: 0,filepath,age,dimension,data
0,./AP_crop_data\10_14.jpg,10,182,"[32, 38, 42, 34, 31, 25, 18, 24, 27, 32, 43, 5..."
1,./AP_crop_data\10_18.jpg,10,122,"[242, 216, 166, 103, 45, 17, 37, 48, 60, 81, 1..."
2,./AP_crop_data\10_20.jpg,10,122,"[71, 89, 106, 123, 113, 87, 79, 78, 79, 81, 81..."
3,./AP_crop_data\10_6.jpg,10,122,"[65, 69, 70, 71, 69, 69, 71, 69, 66, 62, 63, 6..."
4,./AP_crop_data\110_30.jpg,110,273,"[14, 16, 20, 21, 24, 20, 13, 12, 18, 20, 23, 2..."
...,...,...,...,...
6020,./AP_crop_data\99_16224.jpg,99,182,"[123, 127, 129, 129, 133, 144, 153, 166, 174, ..."
6021,./AP_crop_data\9_16232.jpg,9,273,"[43, 46, 51, 57, 63, 69, 76, 80, 80, 83, 86, 8..."
6022,./AP_crop_data\9_16233.jpg,9,273,"[74, 74, 76, 83, 80, 77, 77, 83, 93, 92, 91, 9..."
6023,./AP_crop_data\9_16235.jpg,9,273,"[185, 184, 183, 184, 185, 183, 175, 137, 101, ..."


In [18]:
data = df_filter['data'].apply(pd.Series)
data.columns = [f"pixel_{i}" for i in data.columns]
data.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_9990,pixel_9991,pixel_9992,pixel_9993,pixel_9994,pixel_9995,pixel_9996,pixel_9997,pixel_9998,pixel_9999
0,32,38,42,34,31,25,18,24,27,32,...,19,19,20,21,21,21,21,21,21,21
1,242,216,166,103,45,17,37,48,60,81,...,56,55,60,68,75,78,74,71,69,69
2,71,89,106,123,113,87,79,78,79,81,...,61,69,81,99,104,87,72,74,85,94
3,65,69,70,71,69,69,71,69,66,62,...,91,100,99,99,99,101,100,101,104,106
4,14,16,20,21,24,20,13,12,18,20,...,73,89,90,74,61,57,62,70,72,63


In [19]:
data = data/255.0 
data['age'] = df_filter['age']
data.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_9991,pixel_9992,pixel_9993,pixel_9994,pixel_9995,pixel_9996,pixel_9997,pixel_9998,pixel_9999,age
0,0.12549,0.14902,0.164706,0.133333,0.121569,0.098039,0.070588,0.094118,0.105882,0.12549,...,0.07451,0.078431,0.082353,0.082353,0.082353,0.082353,0.082353,0.082353,0.082353,10
1,0.94902,0.847059,0.65098,0.403922,0.176471,0.066667,0.145098,0.188235,0.235294,0.317647,...,0.215686,0.235294,0.266667,0.294118,0.305882,0.290196,0.278431,0.270588,0.270588,10
2,0.278431,0.34902,0.415686,0.482353,0.443137,0.341176,0.309804,0.305882,0.309804,0.317647,...,0.270588,0.317647,0.388235,0.407843,0.341176,0.282353,0.290196,0.333333,0.368627,10
3,0.254902,0.270588,0.27451,0.278431,0.270588,0.270588,0.278431,0.270588,0.258824,0.243137,...,0.392157,0.388235,0.388235,0.388235,0.396078,0.392157,0.396078,0.407843,0.415686,10
4,0.054902,0.062745,0.078431,0.082353,0.094118,0.078431,0.05098,0.047059,0.070588,0.078431,...,0.34902,0.352941,0.290196,0.239216,0.223529,0.243137,0.27451,0.282353,0.247059,110


In [20]:
data.isnull().sum().sum()

0

In [21]:
data.dropna(inplace=True)

In [22]:
data.shape

(5930, 10001)

In [23]:
import pickle
pickle.dump(data,open('./AP_data/age_data_images_100_100.pickle',mode='wb'))