In [None]:
import ast
import glob
import os
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm
# NAMESPACE / PROFILE are set before the agrobrain imports below; presumably the
# package reads them while it is imported/initialised, so keep this ordering.
os.environ["NAMESPACE"]="research"
os.environ["PROFILE"]="local"
from agrobrain_util.runtime.evironment import RuntimeEnv
from agrobrain_util.infra.app_config import application_config as cfg
import matplotlib.pyplot as plt


# Runtime handle used by later cells to reach the ETI API (env.eti_api).
env = RuntimeEnv()
# Category configuration; later cells read categories_dict['weed'].
categories_dict = cfg['tags']['categories']

# Root folder for every CSV this notebook reads and writes. The notebook is run
# from more than one machine (see the commented-out /mnt/... paths in later
# cells), so the location can be overridden with the AGROBRAIN_WIDE_IMAGES_DIR
# environment variable; without it the original local path is used.
DATA_DIR = os.environ.get(
    "AGROBRAIN_WIDE_IMAGES_DIR",
    "C:/Users/Anafa/data/agrobrain_wide_images_exploration",
)

# Create images dataframe

In [None]:
# READ ORDERS CSV FROM JIRA DATA
# Alternative location on the shared disk:
# orders_csv_2022_path = '/mnt/disks/datasets/wide_images/us_2022_emergence_analysis_jira.csv'
orders_csv_2022_path = os.path.join(DATA_DIR, "us_2022_emergence_analysis_jira.csv")

# Rows without an order id are dropped, then the id column is cast to int.
orders_df = (
    pd.read_csv(orders_csv_2022_path)
    .dropna(subset=['Order ID'])
    .astype({'Order ID': int})
)
orders_list = orders_df['Order ID'].tolist()

In [None]:
# Number of order ids collected from the orders CSV.
len(orders_list)

In [None]:
# CREATE SUB DF FOR EXPERIMENTS
# Only the first 20 orders are kept; orders_list is rebuilt to match.
orders_df = orders_df.iloc[:20]
orders_list = orders_df['Order ID'].tolist()

In [None]:
# ADD IMAGES DATA BY ORDER ID FROM ETI
#
# For every order id the image records are fetched from the ETI API and written
# to disk in chunk CSVs, so a long run does not keep everything in memory.

# folder_dir = '/mnt/disks/datasets/wide_images/images_df_folder_1'
folder_dir = os.path.join(DATA_DIR, "images_df_folder_1")
os.makedirs(folder_dir, exist_ok=True)

# A chunk is flushed to disk once it holds more than this many image rows.
IMAGES_PER_CHUNK = 500

pending_frames = []   # per-order frames collected since the last flush
pending_rows = 0      # total number of rows in pending_frames
images_df = pd.DataFrame()

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)
    for i, order in enumerate(tqdm(orders_list)):
        order_df = pd.DataFrame(env.eti_api.get_images_data_by_orderid(order)['images'])
        if order_df.empty:
            # Orders without images contribute nothing to the chunk.
            continue
        pending_frames.append(order_df)
        pending_rows += len(order_df)
        if pending_rows > IMAGES_PER_CHUNK:
            # Concatenate once per chunk instead of once per order.
            images_df = pd.concat(pending_frames, axis='rows', ignore_index=True)
            # images_df.to_csv(f"/mnt/disks/datasets/wide_images/images_df_folder_1/images_df_{i}.csv")
            images_df.to_csv(os.path.join(folder_dir, f"images_df_{i}.csv"))
            pending_frames, pending_rows = [], 0

    # Write the remainder only when there is one. Writing unconditionally would
    # reuse the last value of `i`; if the loop flushed a chunk on its final
    # iteration, that chunk file would be overwritten by an empty frame.
    if pending_frames:
        images_df = pd.concat(pending_frames, axis='rows', ignore_index=True)
        images_df.to_csv(os.path.join(folder_dir, f"images_df_{i}.csv"))

In [None]:
# CONCAT THE CHUNK CSVs INTO ONE IMAGES DATAFRAME AND SAVE TO CSV
# folders_path = '/mnt/disks/datasets/wide_images/images_df_folder_1'
folders_path = os.path.join(DATA_DIR, "images_df_folder_1")

# NOTE: a previous version of this cell also read "weeds_images_df.csv" from an
# undefined ROOT_DIR. That raised NameError on a fresh kernel, the result was
# not used here, and the file is only produced by a later cell, so it is gone.

# Sorted so the row order of the combined file is the same on every run and OS.
csv_list = sorted(glob.glob(os.path.join(folders_path, "*.csv")))

# Read everything first and concatenate once; growing a frame inside the loop
# copies all previously read rows on every iteration.
chunk_frames = [pd.read_csv(csv_path) for csv_path in tqdm(csv_list)]
if chunk_frames:
    im_df = pd.concat(chunk_frames, axis='rows', ignore_index=True)
else:
    # No chunk files found: keep the original result (an empty frame).
    im_df = pd.DataFrame()

# im_df.to_csv('/mnt/disks/datasets/wide_images/images_df.csv')
im_df.to_csv(os.path.join(DATA_DIR, "images_df.csv"))



# Read images dataframe, add features and save

In [None]:
# Load the combined images dataframe from images_df.csv under DATA_DIR.
images_df = pd.read_csv(os.path.join(DATA_DIR, "images_df.csv"))

# images_df = pd.read_csv('/mnt/disks/datasets/wide_images/images_df.csv')


In [None]:
# Number of image rows loaded.
len(images_df)

In [None]:
# Column names available in the images dataframe.
images_df.columns

In [None]:
# Keep only the first 20 rows for quick experiments. An explicit copy is taken
# because the next cell adds a column to this frame; writing into a slice of
# the loaded frame triggers pandas' SettingWithCopyWarning.
images_df = images_df.head(20).copy()

In [None]:
# ADD "NUM WEED TAGS" TO IMAGES_DF AND SAVE
def _count_weed_tags(stats_repr):
    """Return the summed ``tagsCount`` of the weed-category entries of one image.

    ``stats_repr`` is the text of one ``stats`` cell: the repr of a list of
    dicts, each providing ``category`` and (for weed entries) ``tagsCount``.
    Returns 0 when the image has no weed entries.
    """
    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute code the way eval would.
    image_stats_list = ast.literal_eval(stats_repr)
    return sum(
        cat['tagsCount']
        for cat in image_stats_list
        if cat['category'] == categories_dict['weed']
    )


# One value per row, assigned positionally. Looking rows up by imageID for every
# image is quadratic and gives all rows sharing an imageID the last computed
# count; indexing stats by loop counter also assumes a default 0..n-1 index.
images_df["num_weed_tags"] = [
    _count_weed_tags(stats_repr) for stats_repr in tqdm(images_df["stats"])
]
print("saving csv")
images_df.to_csv(os.path.join(DATA_DIR, "images_df_new.csv"))
# images_df.to_csv("/mnt/disks/datasets/wide_images/images_df_new.csv")

# Checkups

In [None]:

# Reload the images dataframe from images_df_new.csv under DATA_DIR.
full_im_df = pd.read_csv(os.path.join(DATA_DIR, "images_df_new.csv"))
# full_im_df = pd.read_csv("/mnt/disks/datasets/wide_images/images_df_new.csv")

In [None]:
# Number of distinct order ids in orders_list.
len(np.unique(orders_list))

In [None]:
# CHECK WHAT ORDERS ARE IN "ORDERS_LIST" AND NOT IN "IMAGE_DF"

print(f"There are {len(np.unique(full_im_df['orderID']))} orders in im_df and {len(orders_list)} orders in orders_list")

# Distinct order ids that actually have image rows.
orders_list_from_im_df = np.unique(full_im_df['orderID'])
# Requested orders for which no image row exists.
not_in_im_df = list(set(orders_list) - set(orders_list_from_im_df))

In [None]:
# Orders whose 'Too Early' field is empty.
orders_df.loc[orders_df['Too Early'].isna()]

In [None]:
# '# Of Images' reported by Jira for the orders that have no image rows.
orders_df.loc[orders_df['Order ID'].isin(not_in_im_df), '# Of Images']

In [None]:
# Columns available for the orders that have no image rows.
orders_df.loc[orders_df['Order ID'].isin(not_in_im_df)].columns

# Experiments - delete after

In [None]:
# GET WEEDS CATEGORY ID
# Print each top-level category id next to its position in the hierarchy.
categories_hierarchy = env.eti_api.get_categories_hierarchy()
for i in range(len(categories_hierarchy)):
    print(f"index {i}, category: {categories_hierarchy[i]['id']}")

# GET LIST OF SUBCATEGORIES IDS
# Entry 9 of the hierarchy is the one used here as the weeds category.
weeds_subcategories_ids = [
    sub_category['id'] for sub_category in categories_hierarchy[9]['subCategories']
]
weeds_subcategories_ids

In [None]:
# Inspect the sub-category records of hierarchy entry 9.
categories_hierarchy[9]['subCategories']

In [None]:
# Continue with the reloaded frame; this is a plain rebinding, no copy is made.
images_df = full_im_df

In [None]:
# Sanity check: does the first stats entry of row 6 carry the weed category value?
# NOTE(review): eval executes the cell text as Python; ast.literal_eval would be
# safer if the stats strings are plain literals - confirm.
eval(images_df['stats'][6])[0]['category'] == categories_dict['weed']

In [None]:
# Number of stats entries stored for row 19.
# NOTE(review): eval executes the cell text as Python - see ast.literal_eval.
len(eval(images_df['stats'][19]))

In [None]:
# Python type of the configured weed category value.
type(categories_dict['weed'])

In [None]:
# Number of rows in images_df at this point.
len(images_df)

In [None]:
# Keep only the first 100 rows for the experiment below. An explicit copy is
# taken because the next cell adds columns to this frame; without it the frame
# is a slice of full_im_df and pandas emits SettingWithCopyWarning.
images_df = images_df.head(100).copy()

In [None]:
# CREATE COLUMNS: "WEED_TYPES_IDS_LIST", "AREAPERCENTAGE", "INFERENCETAGSCOUNT" AND SAVE IMAGES_DATAFRAME

# Per-image lists, collected positionally (one entry per row of images_df) so
# the result does not depend on images_df having a default 0..n-1 index.
weed_types_per_image = []
area_percentage_per_image = []
inference_tags_per_image = []

for stats_repr in tqdm(images_df["stats"]):
    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute code the way eval would.
    weed_stats = [
        cat for cat in ast.literal_eval(stats_repr)
        if cat['category'] == categories_dict['weed']
    ]
    weed_types_per_image.append([cat['subCategory'] for cat in weed_stats])
    area_percentage_per_image.append([cat['areaPercentage'] for cat in weed_stats])
    inference_tags_per_image.append([cat['inferenceTagsCount'] for cat in weed_stats])

# The lists are stored as their string form, as before, so the frame looks the
# same in memory as it does after being written to and read back from CSV.
images_df["weed_types_ids_list"] = [str(values) for values in weed_types_per_image]
images_df["weed_areaPercentage"] = [str(values) for values in area_percentage_per_image]
images_df["weed_inferenceTagsCount"] = [str(values) for values in inference_tags_per_image]

# FILTER DATAFRAME - TAKE ONLY IMAGES THAT HAVE WEEDS (AT LEAST ONE WEED-CATEGORY ENTRY)
# The mask is built from the lists already in memory instead of re-parsing the
# strings that were just created; a boolean ndarray is applied positionally.
has_weeds = np.array([len(values) > 0 for values in weed_types_per_image], dtype=bool)
weeds_images_df = images_df[has_weeds].reset_index(drop=True)
# weeds_images_df.to_csv("/mnt/disks/datasets/wide_images/weeds_images_df.csv")
weeds_images_df.to_csv(os.path.join(DATA_DIR, "weeds_images_df.csv"))

# For the in-memory frame, turn the id column back into real lists.
weeds_images_df['weed_types_ids_list'] = weeds_images_df['weed_types_ids_list'].apply(ast.literal_eval)


In [None]:
# READ WEEDS IMAGES DF
# weeds_images_df = pd.read_csv("/mnt/disks/datasets/wide_images/weeds_images_df.csv")
weeds_images_df = pd.read_csv(os.path.join(DATA_DIR, "weeds_images_df.csv"))
# The id lists are stored in the CSV as text. literal_eval turns them back into
# lists and, unlike eval, accepts only Python literals, so file content cannot
# execute code.
weeds_images_df['weed_types_ids_list'] = weeds_images_df['weed_types_ids_list'].apply(ast.literal_eval)

In [None]:
def create_cat_dict(cat_list):
    """Build id -> type and id -> name lookups from a list of category records.

    Each record must provide ``'id'`` and ``'type'``; ``'name'`` is optional and
    is replaced by ``'NoName'`` when the key is absent.

    Returns:
        A ``(types_by_id, names_by_id)`` tuple of dicts keyed by record id.
    """
    types_by_id, names_by_id = {}, {}
    for record in cat_list:
        record_type = record['type']
        record_id = record['id']
        types_by_id[record_id] = record_type
        names_by_id[record_id] = record['name'] if 'name' in record else 'NoName'
    return types_by_id, names_by_id
# Lookups for the sub-categories of hierarchy entry 9.
cat_dict_types, cat_dict_names = create_cat_dict(categories_hierarchy[9]['subCategories'])

# Count how often each weed type id occurs across all images (one row per id
# after explode).
unique_weed_types, count_unique_weed_types = np.unique(
    weeds_images_df['weed_types_ids_list'].explode(),
    return_counts=True,
)
# The five most frequent ids, in ascending order of count, and their names.
top_weed_types = unique_weed_types[count_unique_weed_types.argsort()[-5:]]
top_weed_types_names = [cat_dict_names[weed_type_id] for weed_type_id in top_weed_types]
# print(top_weed_types_names)

In [None]:
# Distinct weed type ids found in weeds_images_df.
unique_weed_types

In [None]:
# Occurrence count for each entry of unique_weed_types (same order).
count_unique_weed_types

In [None]:
# The three most frequent weed type ids, in ascending order of count.
unique_weed_types[np.argsort(count_unique_weed_types)[-3:]]

In [None]:
# Bar chart of how often each weed type id occurs; the x positions are the ids
# themselves, so the bar width is expressed in id units.
bar_width = 50
ax = plt.gca()
ax.bar(unique_weed_types, count_unique_weed_types, width=bar_width)
ax.set_xlabel('Unique Values')
ax.set_ylabel('Counts')
plt.show()