# Data loading

In [3]:
# Load the ndjson data that will be used to train and test the model

import pandas as pd

def load_ndjson_to_df(file_names):
    """
    Load one or more ndjson files into a single pandas DataFrame.

    Each source is parsed with ``pd.read_json(..., lines=True)`` and the
    resulting frames are stacked row-wise, in the order given, under a fresh
    0..n-1 index.

    Args:
        file_names (list, tuple or str): A list/tuple of ndjson file names,
            or a single file name. Anything that is not a list or tuple is
            treated as one source and handed to ``pd.read_json`` unchanged.

    Returns:
        pandas.DataFrame or None: The combined data. An empty DataFrame is
        returned when an empty list/tuple is given. ``None`` is returned if
        any source could not be read; the error is printed rather than
        raised, so callers should check for ``None`` before using the result.
    """
    # Normalise the argument to a list of sources.
    if isinstance(file_names, (list, tuple)):
        files = list(file_names)
    else:
        files = [file_names]

    # Tracks the source being read so error messages can name it.
    current = None
    try:
        # Collect the per-file frames and concatenate once at the end;
        # concatenating inside the loop re-copies all earlier rows each time.
        frames = []
        for current in files:
            frames.append(pd.read_json(current, lines=True, orient='records'))

        # pd.concat raises on an empty list, so handle "no files" explicitly.
        if not frames:
            return pd.DataFrame()

        return pd.concat(frames, ignore_index=True)

    except FileNotFoundError as e:
        # The exception does not always carry a filename; fall back to the
        # source that was being read when it was raised.
        missing = e.filename if e.filename is not None else current
        print(f"File '{missing}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Reached only after an error was reported above.
    return None


# One ndjson file per fruit; all of them are combined into a single frame.
data_files = ["./data/full_simplified_apple.ndjson",
                "./data/full_simplified_banana.ndjson",
                "./data/full_simplified_blueberry.ndjson",
                "./data/full_simplified_watermelon.ndjson",]
print("Loading data...")
df = load_ndjson_to_df(data_files)
# load_ndjson_to_df prints the error and returns None when a file cannot be
# read. Stop here in that case so later cells do not fail with an unrelated
# AttributeError on None.
if df is None:
    raise RuntimeError("Loading failed; see the error message printed above.")
print(df)

Loading data...
              word countrycode                        timestamp  recognized  \
0            apple          US 2017-03-10 22:17:57.574660+00:00       False   
1            apple          RU 2017-03-08 06:29:44.162820+00:00        True   
2            apple          GB 2017-03-10 12:41:33.390630+00:00        True   
3            apple          US 2017-03-16 18:01:54.559040+00:00        True   
4            apple          TH 2017-03-29 14:35:17.694720+00:00        True   
...            ...         ...                              ...         ...   
713470  watermelon          TH 2017-03-26 18:10:18.156470+00:00        True   
713471  watermelon          HU 2017-03-05 15:13:55.686980+00:00        True   
713472  watermelon          GB 2017-01-28 19:15:54.067510+00:00        True   
713473  watermelon          CA 2017-03-05 18:01:18.449470+00:00        True   
713474  watermelon          US 2017-03-24 04:16:28.828370+00:00        True   

                  key_id           

# EDA

In [4]:
def clean_ndjson_data(df):
    """
    Return a copy of the ndjson data without its metadata columns.

    The "countrycode", "timestamp", and "key_id" columns are removed; every
    other column is kept as-is. The input frame is not modified.

    Args:
        df (pandas.DataFrame): The DataFrame containing the ndjson data. It
            must contain all three columns listed above.

    Returns:
        pandas.DataFrame: A new DataFrame with the specified columns removed.

    Raises:
        KeyError: If any of the three columns is missing from ``df``.
    """
    unwanted = ["countrycode", "timestamp", "key_id"]
    return df.drop(unwanted, axis="columns")

# Drop the metadata columns, then one-hot encode "word" into fruit_* columns.
# NOTE(review): this rebinds `df`, so re-running the cell without reloading
# the data will presumably fail because the dropped columns are gone; confirm.
df = clean_ndjson_data(df).pipe(
    pd.get_dummies,
    columns=["word"],
    prefix=["fruit"],
)
print(df)

        recognized                                            drawing  \
0            False  [[[255, 255], [0, 0]], [[255, 255], [0, 0]], [...   
1             True  [[[95, 79, 68, 31, 17, 9, 1, 0, 4, 54, 103, 13...   
2             True  [[[121, 107, 45, 17, 1, 0, 4, 21, 58, 118, 173...   
3             True  [[[104, 80, 54, 28, 11, 0, 1, 8, 20, 51, 90, 1...   
4             True  [[[85, 76, 61, 45, 34, 10, 4, 0, 4, 30, 58, 87...   
...            ...                                                ...   
713470        True  [[[10, 6, 1, 1, 43, 55, 64, 86, 113, 157, 173,...   
713471        True  [[[2, 5, 10, 24, 40, 55, 104, 129, 166, 211, 2...   
713472        True  [[[92, 44, 28, 13, 2, 0, 7, 16, 36, 62, 127, 1...   
713473        True  [[[60, 90, 123, 183], [5, 50, 109, 187]], [[48...   
713474        True  [[[241, 244, 235, 216, 157, 126, 77, 50, 14, 1...   

        fruit_apple  fruit_banana  fruit_blueberry  fruit_watermelon  
0              True         False            False  

In [11]:
import pprint as pp
# Pretty-print the drawing stored in the second row. The pprint module has no
# `print` function; `pprint` is the correct call. compact=True packs as many
# numbers per line as fit instead of printing one per line.
pp.pprint(df.iloc[1].drawing, compact=True)

[[[95, 79, 68, 31, 17, 9, 1, 0, 4, 54, 103, 130, 168, 190, 204, 219, 228, 222, 210, 200, 194, 197, 203, 192, 189, 190, 209, 207, 197, 179, 107, 100], [62, 50, 49, 74, 91, 113, 163, 220, 226, 249, 255, 255, 244, 231, 215, 187, 152, 151, 160, 160, 156, 140, 137, 134, 115, 109, 88, 82, 71, 63, 62, 65]], [[100, 100, 104, 110, 115, 115, 108, 107], [58, 17, 1, 0, 12, 22, 44, 64]]]
