In [1]:
import pandas as pd
import seaborn as sns
import glob
import matplotlib.pyplot as plt
import cv2 
import re

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
imagepath = glob.glob('./AP_crop_data/*.jpg')

In [4]:
imagepath[0]

'./AP_crop_data\\10_2.jpg'

In [5]:
print('The number of images in AP_crop_data folder = ',len(imagepath))

The number of images in AP_crop_data folder =  2405


In [7]:

df = pd.DataFrame(imagepath, columns=['filepath'])

# Function to extract age from file path
def extract_age(filepath):
    match = re.search(r'./AP_crop_data\\(-?\d+)_.*\.jpg', filepath)
    if match:
        return int(match.group(1))
    return None

# Apply the function to the DataFrame
df['age'] = df['filepath'].apply(extract_age)

print(df)


                        filepath  age
0        ./AP_crop_data\10_2.jpg   10
1        ./AP_crop_data\10_4.jpg   10
2       ./AP_crop_data\111_7.jpg  111
3      ./AP_crop_data\116_10.jpg  116
4       ./AP_crop_data\11_12.jpg   11
...                          ...  ...
2400  ./AP_crop_data\90_3237.jpg   90
2401  ./AP_crop_data\99_3244.jpg   99
2402  ./AP_crop_data\99_3246.jpg   99
2403   ./AP_crop_data\9_3248.jpg    9
2404   ./AP_crop_data\9_3249.jpg    9

[2405 rows x 2 columns]


In [8]:
def get_size(path):
    img = cv2.imread(path)
    return img.shape[0]

In [9]:
df['dimension'] = df['filepath'].apply(get_size) # store dimension of image in this columns

In [10]:
df.head()

Unnamed: 0,filepath,age,dimension
0,./AP_crop_data\10_2.jpg,10,182
1,./AP_crop_data\10_4.jpg,10,122
2,./AP_crop_data\111_7.jpg,111,81
3,./AP_crop_data\116_10.jpg,116,81
4,./AP_crop_data\11_12.jpg,11,182


In [12]:
##dist_expression = df['age'].value_counts()
##dist_expression

In [11]:
##plt.figure(figsize=(12,6))
##plt.subplot(2,1,1)
##sns.histplot(df['dimension'])
##plt.subplot(2,1,2)
##sns.boxplot(df['dimension'])
##plt.show()

In [13]:
df_filter = df.query('dimension > 60')
df_filter.shape

(2382, 3)

In [14]:
df_filter['age'].value_counts(normalize=True)

age
26     0.111251
28     0.055416
45     0.041562
25     0.041562
34     0.038203
         ...   
111    0.000420
1      0.000420
8      0.000420
11     0.000420
116    0.000420
Name: proportion, Length: 88, dtype: float64

In [15]:
df_filter.head()

Unnamed: 0,filepath,age,dimension
0,./AP_crop_data\10_2.jpg,10,182
1,./AP_crop_data\10_4.jpg,10,122
2,./AP_crop_data\111_7.jpg,111,81
3,./AP_crop_data\116_10.jpg,116,81
4,./AP_crop_data\11_12.jpg,11,182


In [17]:
def structuring(path):
    try:

        # step - 1: read image
        img = cv2.imread(path) # BGR
        # step - 2: convert into grayscale
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
        # step -3: resize into 100 x 100 array   

        size = gray.shape[0]
        if size >= 100:
            # cv2.INTER_AREA (SHINK)
            gray_resize = cv2.resize(gray,(100,100),cv2.INTER_AREA)
        else:
            # cv2.INTER_CUBIC (ENLARGE)
            gray_resize = cv2.resize(gray,(100,100),cv2.INTER_CUBIC)

        # step -4: Flatten Image (1x10,000)
        flatten_image = gray_resize.flatten()
        return flatten_image
    
    except:
        return None

In [18]:
df_filter['data'] = df_filter['filepath'].apply(structuring) # convert all images into 100 x 100
df_filter.head()

Unnamed: 0,filepath,age,dimension,data
0,./AP_crop_data\10_2.jpg,10,182,"[29, 31, 23, 25, 51, 71, 72, 60, 60, 68, 91, 9..."
1,./AP_crop_data\10_4.jpg,10,122,"[241, 240, 239, 240, 238, 234, 231, 227, 226, ..."
2,./AP_crop_data\111_7.jpg,111,81,"[80, 79, 78, 77, 77, 77, 78, 80, 82, 84, 85, 8..."
3,./AP_crop_data\116_10.jpg,116,81,"[113, 114, 115, 117, 120, 123, 124, 124, 124, ..."
4,./AP_crop_data\11_12.jpg,11,182,"[16, 13, 13, 16, 19, 17, 22, 28, 40, 71, 98, 1..."


In [19]:
df_filter

Unnamed: 0,filepath,age,dimension,data
0,./AP_crop_data\10_2.jpg,10,182,"[29, 31, 23, 25, 51, 71, 72, 60, 60, 68, 91, 9..."
1,./AP_crop_data\10_4.jpg,10,122,"[241, 240, 239, 240, 238, 234, 231, 227, 226, ..."
2,./AP_crop_data\111_7.jpg,111,81,"[80, 79, 78, 77, 77, 77, 78, 80, 82, 84, 85, 8..."
3,./AP_crop_data\116_10.jpg,116,81,"[113, 114, 115, 117, 120, 123, 124, 124, 124, ..."
4,./AP_crop_data\11_12.jpg,11,182,"[16, 13, 13, 16, 19, 17, 22, 28, 40, 71, 98, 1..."
...,...,...,...,...
2400,./AP_crop_data\90_3237.jpg,90,182,"[71, 65, 62, 57, 54, 54, 55, 60, 60, 59, 60, 5..."
2401,./AP_crop_data\99_3244.jpg,99,273,"[241, 234, 236, 238, 238, 237, 237, 235, 233, ..."
2402,./AP_crop_data\99_3246.jpg,99,182,"[104, 107, 108, 112, 118, 120, 120, 118, 111, ..."
2403,./AP_crop_data\9_3248.jpg,9,273,"[46, 46, 47, 52, 56, 63, 70, 76, 81, 84, 83, 8..."


In [20]:
data = df_filter['data'].apply(pd.Series)
data.columns = [f"pixel_{i}" for i in data.columns]
data.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_9990,pixel_9991,pixel_9992,pixel_9993,pixel_9994,pixel_9995,pixel_9996,pixel_9997,pixel_9998,pixel_9999
0,29,31,23,25,51,71,72,60,60,68,...,23,23,22,22,21,21,21,21,22,23
1,241,240,239,240,238,234,231,227,226,187,...,83,93,108,92,47,42,33,79,105,120
2,80,79,78,77,77,77,78,80,82,84,...,3,3,3,3,4,4,4,5,5,5
3,113,114,115,117,120,123,124,124,124,124,...,97,96,94,93,94,95,94,93,91,91
4,16,13,13,16,19,17,22,28,40,71,...,181,191,194,187,186,183,183,181,181,184


In [21]:
data = data/255.0 
data['age'] = df_filter['age']
data.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_9991,pixel_9992,pixel_9993,pixel_9994,pixel_9995,pixel_9996,pixel_9997,pixel_9998,pixel_9999,age
0,0.113725,0.121569,0.090196,0.098039,0.2,0.278431,0.282353,0.235294,0.235294,0.266667,...,0.090196,0.086275,0.086275,0.082353,0.082353,0.082353,0.082353,0.086275,0.090196,10
1,0.945098,0.941176,0.937255,0.941176,0.933333,0.917647,0.905882,0.890196,0.886275,0.733333,...,0.364706,0.423529,0.360784,0.184314,0.164706,0.129412,0.309804,0.411765,0.470588,10
2,0.313725,0.309804,0.305882,0.301961,0.301961,0.301961,0.305882,0.313725,0.321569,0.329412,...,0.011765,0.011765,0.011765,0.015686,0.015686,0.015686,0.019608,0.019608,0.019608,111
3,0.443137,0.447059,0.45098,0.458824,0.470588,0.482353,0.486275,0.486275,0.486275,0.486275,...,0.376471,0.368627,0.364706,0.368627,0.372549,0.368627,0.364706,0.356863,0.356863,116
4,0.062745,0.05098,0.05098,0.062745,0.07451,0.066667,0.086275,0.109804,0.156863,0.278431,...,0.74902,0.760784,0.733333,0.729412,0.717647,0.717647,0.709804,0.709804,0.721569,11


In [23]:
data.isnull().sum().sum()

0

In [24]:
data.dropna(inplace=True)

In [25]:
data.shape

(2382, 10001)

In [26]:
import pickle
pickle.dump(data,open('./AP_data/age_data_images_100_100.pickle',mode='wb'))