# Data loading

In [1]:
# Load the data used to train and test the model

import pandas as pd

def load_ndjson_to_df(file_names, print_progress=False):
    """
    Load one or more ndjson files into a single pandas DataFrame.

    Every file is parsed with ``pd.read_json(..., lines=True)`` and the
    resulting frames are stacked in the order given, with a fresh 0..n-1 index.

    Args:
        file_names (list, tuple or str): A list/tuple of ndjson file names, or a
            single file name.
        print_progress: if True, prints a "Reading <file>... Completed" message
            for each file as it is processed.

    Returns:
        pandas.DataFrame: the combined data from the ndjson file(s); an empty
        DataFrame when ``file_names`` is an empty list/tuple.
        None: if any file could not be read. The error is reported with
        ``print`` rather than raised, so callers should check for None.
    """
    # Accept a list or tuple of names; anything else is treated as one name.
    if isinstance(file_names, (list, tuple)):
        files = list(file_names)
    else:
        files = [file_names]

    # Tracks the file being read so the error message can name it even when
    # the raised exception carries no filename attribute.
    current_file = None
    frames = []
    try:
        for current_file in files:
            if print_progress:
                print(f"Reading {current_file}...", end=" ")
            frames.append(pd.read_json(current_file, lines=True, orient='records'))
            if print_progress:
                print("Completed")

        if not frames:
            # pd.concat raises on an empty list; keep returning an empty frame.
            return pd.DataFrame()

        # Concatenate once at the end: concatenating inside the loop re-copies
        # every previously loaded row for each additional file.
        return pd.concat(frames, ignore_index=True)

    except FileNotFoundError as e:
        missing = e.filename if e.filename is not None else current_file
        print(f"File '{missing}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Reached only after an error was reported above.
    return None


# Input files to combine: one ndjson file per fruit word.
data_files = [
    "./data/full_simplified_apple.ndjson",
    "./data/full_simplified_banana.ndjson",
    "./data/full_simplified_blueberry.ndjson",
    "./data/full_simplified_watermelon.ndjson",
]

print("Loading data...")
df = load_ndjson_to_df(data_files, print_progress=True)
print(df)

Loading data...
Reading ./data/full_simplified_apple.ndjson... Completed
Reading ./data/full_simplified_banana.ndjson... Completed
Reading ./data/full_simplified_blueberry.ndjson... Completed
Reading ./data/full_simplified_watermelon.ndjson... Completed
              word countrycode                        timestamp  recognized  \
0            apple          US 2017-03-10 22:17:57.574660+00:00       False   
1            apple          RU 2017-03-08 06:29:44.162820+00:00        True   
2            apple          GB 2017-03-10 12:41:33.390630+00:00        True   
3            apple          US 2017-03-16 18:01:54.559040+00:00        True   
4            apple          TH 2017-03-29 14:35:17.694720+00:00        True   
...            ...         ...                              ...         ...   
713470  watermelon          TH 2017-03-26 18:10:18.156470+00:00        True   
713471  watermelon          HU 2017-03-05 15:13:55.686980+00:00        True   
713472  watermelon          GB 2017

# EDA

In [3]:
# Progress message marking the start of the preprocessing output in this cell.
print("Preprocessing data...")

def clean_ndjson_data(df):
    """
    Keep only the recognized rows and strip the metadata columns.

    Rows whose "recognized" value is not True are discarded first. The
    "countrycode", "timestamp" and "key_id" columns are then removed, together
    with the "recognized" column itself (it is constant after the filter).

    Args:
        df (pandas.DataFrame): The DataFrame containing the ndjson data. It is
            not modified.

    Returns:
        pandas.DataFrame: A new DataFrame holding the recognized rows (original
        index labels preserved) without the four columns listed above.
    """
    # Boolean mask selecting the rows flagged as recognized
    recognized_mask = df['recognized'].eq(True)

    # Metadata columns plus the filter column, which is no longer informative
    columns_to_remove = ["countrycode", "timestamp", "key_id", "recognized"]

    return df.loc[recognized_mask].drop(columns=columns_to_remove)

# Filter/strip the raw frame, then one-hot encode "word" into fruit_* columns
cleaned_df = pd.get_dummies(
    clean_ndjson_data(df),
    columns=["word"],
    prefix=["fruit"],
)
print(cleaned_df)

Preprocessing data...
                                                  drawing  fruit_apple  \
1       [[[95, 79, 68, 31, 17, 9, 1, 0, 4, 54, 103, 13...         True   
2       [[[121, 107, 45, 17, 1, 0, 4, 21, 58, 118, 173...         True   
3       [[[104, 80, 54, 28, 11, 0, 1, 8, 20, 51, 90, 1...         True   
4       [[[85, 76, 61, 45, 34, 10, 4, 0, 4, 30, 58, 87...         True   
5       [[[184, 160, 135, 100, 50, 14, 4, 0, 27, 50, 7...         True   
...                                                   ...          ...   
713470  [[[10, 6, 1, 1, 43, 55, 64, 86, 113, 157, 173,...        False   
713471  [[[2, 5, 10, 24, 40, 55, 104, 129, 166, 211, 2...        False   
713472  [[[92, 44, 28, 13, 2, 0, 7, 16, 36, 62, 127, 1...        False   
713473  [[[60, 90, 123, 183], [5, 50, 109, 187]], [[48...        False   
713474  [[[241, 244, 235, 216, 157, 126, 77, 50, 14, 1...        False   

        fruit_banana  fruit_blueberry  fruit_watermelon  
1              False           

# Train model

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Load scikit-learn's bundled digits dataset as features and integer labels.
# NOTE(review): this trains on the digits data, not on the `cleaned_df` frame
# prepared in the cell above — confirm that this is intentional.
digits = load_digits()
X = digits.data
y = digits.target

# Standardize the features; the scaler is fitted on the full feature matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define create_model function for KerasClassifier
def create_model(num_layers=1, num_neurons=64, activation='relu', dropout_rate=0.0, momentum=0.9):
    """
    Build and compile a fully connected softmax classifier.

    The network is a stack of Dense + Dropout pairs followed by a 10-unit
    softmax output layer. The input width is read from the module-level
    ``X_scaled`` array, so that array must exist before this is called.

    Args:
        num_layers: number of hidden Dense/Dropout pairs. One pair is always
            created, so values below 1 still yield a single hidden pair.
        num_neurons: number of units in every hidden Dense layer.
        activation: activation used by every hidden Dense layer.
        dropout_rate: rate passed to every Dropout layer.
        momentum: momentum of the SGD optimizer (learning rate is fixed at 0.01).

    Returns:
        The compiled Sequential model.
    """
    n_features = X_scaled.shape[1]
    network = Sequential()

    # First hidden pair: the only layer that declares the input width
    network.add(Dense(num_neurons, input_dim=n_features, activation=activation))
    network.add(Dropout(dropout_rate))

    # Any additional hidden pairs beyond the first
    extra_pairs = num_layers - 1
    for _ in range(extra_pairs):
        network.add(Dense(num_neurons, activation=activation))
        network.add(Dropout(dropout_rate))

    # Output layer: one softmax unit per class (10 classes hard-coded)
    network.add(Dense(10, activation='softmax'))

    network.compile(
        loss='categorical_crossentropy',
        optimizer=SGD(learning_rate=0.01, momentum=momentum),
        metrics=['accuracy'],
    )
    return network

# Hyper-parameter combinations explored by the grid search; every key matches
# a keyword argument of create_model
param_grid = {
    'num_layers': [3],
    'num_neurons': [32, 64],
    'activation': ['relu', 'tanh'],
    'dropout_rate': [0.2, 0.5],
    'momentum': [0.5, 0.9]
}

# Wrap the Keras model builder so scikit-learn can use it as an estimator
model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32)

# Shuffled 5-fold cross-validation with a fixed split seed, scored by accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
)
grid_result = grid_search.fit(X_scaled, y)

# Report the best configuration, then the mean (std) score of every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, std, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, std, param))