In [None]:
"""
Huge credit to Ashish Jangra for collecting and curating this dataset, available at:
https://www.kaggle.com/datasets/ashishjangra27/doodle-dataset
Shoutout to Maria Saif for providing the framework for the CNN model:
https://www.kaggle.com/code/mariasaif/building-and-training-a-cnn-for-doodle-classificat
"""

import kagglehub

# Fetch the most recent published version of the dataset; the call returns
# the local directory where the files were placed.
dataset_handle = "ashishjangra27/doodle-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /home/dori/.cache/kagglehub/datasets/ashishjangra27/doodle-dataset/versions/1


In [8]:
import os
# Show the top-level entries of the downloaded dataset directory.
dataset_entries = os.listdir(path)
print("Files in dataset directory:", dataset_entries)

Files in dataset directory: ['doodle', 'master_doodle_dataframe.csv']


In [9]:
import pandas as pd
# Build the CSV location with os.path.join rather than concatenating a
# hard-coded "/" so the separator is always correct for the host platform.
csv_path = os.path.join(path, "master_doodle_dataframe.csv")

# Load the master table: one row per doodle (stroke data, label, metadata).
df = pd.read_csv(csv_path)
print(df.head())

  countrycode                                            drawing  \
0          RO  [[[0, 5, 3, 4, 10, 72, 82, 91, 99, 98, 87, 71,...   
1          US  [[[0, 255], [3, 4]], [[86, 73, 73, 76, 85, 130...   
2          ZA  [[[39, 33, 34, 39, 52, 61, 66, 65, 67, 71], [2...   
3          VN  [[[12, 9], [128, 255]], [[0, 23, 36, 37, 42, 5...   
4          HU  [[[9, 7, 0, 19, 49, 90, 96, 99, 93, 94, 62, 16...   

             key_id  recognized           word  \
0  5613582005829632        True  traffic light   
1  5769631006457856        True  traffic light   
2  4999795544424448        True  traffic light   
3  4878417906368512        True  traffic light   
4  5572841187573760        True  traffic light   

                                image_path  
0  data/traffic light/5613582005829632.png  
1  data/traffic light/5769631006457856.png  
2  data/traffic light/4999795544424448.png  
3  data/traffic light/4878417906368512.png  
4  data/traffic light/5572841187573760.png  


In [None]:
# Overall dimensions as (rows, columns)
print("Shape of the dataframe:", df.shape)

# Summary statistics for the numeric columns
summary_stats = df.describe()
print(summary_stats)

# Number of missing entries in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Storage type of each column
print(df.dtypes)

# Distinct labels found in the 'word' column, in order of first appearance
unique_words = df['word'].unique()
print("Unique words (classes):", unique_words)
print("Number of unique words:", len(unique_words))




Shape of the dataframe: (1020000, 6)
             key_id
count  1.020000e+06
mean   5.630078e+15
std    6.500708e+14
min    4.503606e+15
25%    5.066571e+15
50%    5.630318e+15
75%    6.192766e+15
max    6.755396e+15
countrycode    46
drawing         0
key_id          0
recognized      0
word            0
image_path      0
dtype: int64
countrycode    object
drawing        object
key_id          int64
recognized       bool
word           object
image_path     object
dtype: object
Unique words (classes): ['traffic light' 'rhinoceros' 'bed' 'school bus' 'van' 'octopus'
 'firetruck' 'laptop' 'tractor' 'matches' 'windmill' 'square' 'pineapple'
 'candle' 'mosquito' 'pear' 'boomerang' 'lollipop' 'yoga' 'waterslide'
 'swan' 'triangle' 'diving board' 'crayon' 'hockey puck' 'moustache'
 'calendar' 'cow' 'fire hydrant' 'hot air balloon' 'helmet' 'parrot'
 'hot tub' 'baseball' 'saw' 'mouth' 'passport' 'campfire' 'car'
 'bulldozer' 'pencil' 'wine glass' 'marker' 'axe' 'mug' 'see saw' 'foot'
 'door'

In [16]:
# Draw a fixed-size random subset for initial training; the fixed seed makes
# the same rows come back on every run.
sample_size = 50000
sample_seed = 42
df_sample = df.sample(n=sample_size, random_state=sample_seed)


In [19]:
import numpy as np
import cv2
import ast
from tqdm import tqdm  # Import tqdm

# Function to convert stroke-based drawings to images
def strokes_to_image(strokes, size=(64, 64), source_size=256):
    """Rasterize a stroke-based drawing onto a blank grayscale canvas.

    The drawings in this dataset use coordinates that run up to 255 (see the
    sample rows printed from the CSV), while the default canvas is only
    64x64. Drawing the raw coordinates would clip everything beyond the
    top-left 64x64 corner, so every point is first rescaled from the source
    coordinate space to the requested canvas size.

    Parameters
    ----------
    strokes : sequence
        One entry per stroke; ``stroke[0]`` holds the x coordinates and
        ``stroke[1]`` the matching y coordinates.
    size : tuple of int, optional
        ``(height, width)`` of the output image.
    source_size : int, optional
        Side length of the square coordinate space the strokes were recorded
        in. Coordinates are expected in ``[0, source_size - 1]``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``size``; background pixels are 0 and stroke
        pixels are 255.
    """
    img = np.zeros(size, dtype=np.uint8)
    height, width = size

    # Map [0, source_size - 1] onto [0, dim - 1] so the extreme coordinates
    # land exactly on the canvas border. max(..., 1) guards against a
    # degenerate source_size of 1 (division by zero).
    span = max(source_size - 1, 1)
    x_scale = (width - 1) / span
    y_scale = (height - 1) / span

    for stroke in strokes:
        # cv2.line needs plain integer (x, y) pixel positions.
        points = [
            (int(round(x * x_scale)), int(round(y * y_scale)))
            for x, y in zip(stroke[0], stroke[1])
        ]
        # Connect each consecutive pair of points; a single-point stroke
        # produces no segment, matching the original behaviour.
        for start, end in zip(points, points[1:]):
            cv2.line(img, start, end, 255, 2)
    return img

# Batch processing to convert all drawings
def convert_drawings_to_images(dataframe):
    """Render every entry of ``dataframe['drawing']`` as a 64x64 image.

    Each entry is a string holding a Python literal of the stroke lists; it
    is parsed with ``ast.literal_eval`` and rasterized with
    ``strokes_to_image``. A tqdm progress bar reports progress.

    Returns a single numpy array built from the list of rendered images.
    """
    progress = tqdm(dataframe['drawing'], desc='Converting drawings to images')
    rendered = [
        strokes_to_image(ast.literal_eval(raw_drawing), size=(64, 64))
        for raw_drawing in progress
    ]
    return np.array(rendered)

# Convert drawings to images using the sampled dataframe.
# convert_drawings_to_images returns one stacked array with an image per row.
# A DataFrame column must be a 1-D sequence, so wrap the result in list() to
# store one 2-D image array per row; this is the layout the later
# np.stack(df_sample['drawing_image'].values) call expects.
df_sample['drawing_image'] = list(convert_drawings_to_images(df_sample))


Converting drawings to images: 100%|██████████| 50000/50000 [00:17<00:00, 2862.34it/s]


In [None]:
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder
from keras.utils import to_categorical  # Import to_categorical
# Stack the per-row images into one float32 array, scale pixel values from
# 0/255 down to 0.0/1.0, and append a trailing channel axis.
X = (
    np.stack(df_sample['drawing_image'].values).astype('float32') / 255.0
).reshape(-1, 64, 64, 1)

# Turn each word into an integer class id, then expand the ids into one-hot rows.
le = LabelEncoder()
y = to_categorical(le.fit_transform(df_sample['word']))

# Report the resulting shapes
print(f"Shape of X: {X.shape}")  # Should be (50000, 64, 64, 1)
print(f"Shape of y: {y.shape}")  # Should be (50000, number_of_classes)

ModuleNotFoundError: No module named 'tensorflow'