---
# Imports

In [1]:
import os
import pandas as pd
from tqdm import tqdm

---
# Setup

In [2]:
DATA_PATH = os.path.join('..', 'data')

assert os.path.exists(DATA_PATH), f"Data path {DATA_PATH} does not exist."

---
# Data Import

In [3]:
ANNOTATION_PATH = os.path.join(DATA_PATH, 'annot.csv')
annotation_df = pd.read_csv(ANNOTATION_PATH)

In [4]:
annotation_df

Unnamed: 0.1,Unnamed: 0,id,image_id,bbox,utf8_string,points,area
0,0,a4ea732cd3d5948a_1,a4ea732cd3d5948a,"[525.83, 3.4, 197.64, 33.94]",Performance,"[525.83, 3.4, 723.47, 7.29, 722.76, 36.99, 525...",6707.90
1,1,a4ea732cd3d5948a_2,a4ea732cd3d5948a,"[534.67, 64.68, 91.22, 38.19]",Sport,"[535.73, 64.68, 623.41, 67.51, 625.89, 102.87,...",3483.69
2,2,a4ea732cd3d5948a_3,a4ea732cd3d5948a,"[626.95, 63.62, 96.52, 31.82]",Watch,"[626.95, 63.62, 721.7, 63.62, 723.47, 95.44, 6...",3071.27
3,3,a4ea732cd3d5948a_4,a4ea732cd3d5948a,"[577.4, 141.87, 147.13, 43.1]",...period.,"[580.02, 143.61, 724.53, 141.87, 723.66, 184.9...",6341.30
4,4,a4ea732cd3d5948a_5,a4ea732cd3d5948a,"[391.03, 163.9, 60.82, 38.65]",.,"[395.2, 163.9, 451.85, 191.94, 445.59, 202.55,...",2350.69
...,...,...,...,...,...,...,...
1052349,1052349,0ebbecdc46b78d42_15,0ebbecdc46b78d42,"[267.47, -0.14, 28.18, 27.47]",.,"[295.65, -0.14, 295.65, 27.33, 267.47, 27.03, ...",774.10
1052350,1052350,a37e1fb026b80a6d_1,a37e1fb026b80a6d,"[331.69, 462.84, 417.31, 201.08]",RÖR,"[331.69, 466.97, 749.0, 462.84, 749.0, 659.79,...",83912.69
1052351,1052351,a37e1fb026b80a6d_2,a37e1fb026b80a6d,"[876.75, 285.63, 36.98, 10.28]",Moderna,"[876.75, 287.61, 912.34, 285.63, 913.73, 293.9...",380.15
1052352,1052352,a37e1fb026b80a6d_3,a37e1fb026b80a6d,"[913.53, 282.86, 32.23, 11.27]",Museet,"[913.53, 284.84, 944.77, 282.86, 945.76, 292.3...",363.23


---
# Data Cleaning

## Drop the first column

In [5]:
annotation_df.drop(columns=['Unnamed: 0'], inplace=True)

## Split the bbox

In [6]:
annotation_df['x1'] = annotation_df['bbox'].apply(lambda row: float(row.split(',')[0].replace('[', '').replace(']', '').strip()))
annotation_df['y1'] = annotation_df['bbox'].apply(lambda row: float(row.split(',')[1].replace('[', '').replace(']', '').strip()))

annotation_df['x2'] = annotation_df['bbox'].apply(lambda row: float(row.split(',')[2].replace('[', '').replace(']', '').strip()))
annotation_df['y2'] = annotation_df['bbox'].apply(lambda row: float(row.split(',')[3].replace('[', '').replace(']', '').strip()))

In [7]:
annotation_df

Unnamed: 0,id,image_id,bbox,utf8_string,points,area,x1,y1,x2,y2
0,a4ea732cd3d5948a_1,a4ea732cd3d5948a,"[525.83, 3.4, 197.64, 33.94]",Performance,"[525.83, 3.4, 723.47, 7.29, 722.76, 36.99, 525...",6707.90,525.83,3.40,197.64,33.94
1,a4ea732cd3d5948a_2,a4ea732cd3d5948a,"[534.67, 64.68, 91.22, 38.19]",Sport,"[535.73, 64.68, 623.41, 67.51, 625.89, 102.87,...",3483.69,534.67,64.68,91.22,38.19
2,a4ea732cd3d5948a_3,a4ea732cd3d5948a,"[626.95, 63.62, 96.52, 31.82]",Watch,"[626.95, 63.62, 721.7, 63.62, 723.47, 95.44, 6...",3071.27,626.95,63.62,96.52,31.82
3,a4ea732cd3d5948a_4,a4ea732cd3d5948a,"[577.4, 141.87, 147.13, 43.1]",...period.,"[580.02, 143.61, 724.53, 141.87, 723.66, 184.9...",6341.30,577.40,141.87,147.13,43.10
4,a4ea732cd3d5948a_5,a4ea732cd3d5948a,"[391.03, 163.9, 60.82, 38.65]",.,"[395.2, 163.9, 451.85, 191.94, 445.59, 202.55,...",2350.69,391.03,163.90,60.82,38.65
...,...,...,...,...,...,...,...,...,...,...
1052349,0ebbecdc46b78d42_15,0ebbecdc46b78d42,"[267.47, -0.14, 28.18, 27.47]",.,"[295.65, -0.14, 295.65, 27.33, 267.47, 27.03, ...",774.10,267.47,-0.14,28.18,27.47
1052350,a37e1fb026b80a6d_1,a37e1fb026b80a6d,"[331.69, 462.84, 417.31, 201.08]",RÖR,"[331.69, 466.97, 749.0, 462.84, 749.0, 659.79,...",83912.69,331.69,462.84,417.31,201.08
1052351,a37e1fb026b80a6d_2,a37e1fb026b80a6d,"[876.75, 285.63, 36.98, 10.28]",Moderna,"[876.75, 287.61, 912.34, 285.63, 913.73, 293.9...",380.15,876.75,285.63,36.98,10.28
1052352,a37e1fb026b80a6d_3,a37e1fb026b80a6d,"[913.53, 282.86, 32.23, 11.27]",Museet,"[913.53, 284.84, 944.77, 282.86, 945.76, 292.3...",363.23,913.53,282.86,32.23,11.27


## Clean utf8_string

In [8]:
annotation_df['utf8_string'] = annotation_df['utf8_string'].astype(str)

In [9]:
annotation_df['utf8_string'].value_counts()

utf8_string
.             337584
the            10969
of              7327
and             6470
to              5303
               ...  
31492              1
mls                1
+57                1
4:21               1
1/minx1000         1
Name: count, Length: 179570, dtype: int64

Remove '.' words as they are noise.

In [10]:
annotation_df = annotation_df[annotation_df['utf8_string'] != '.']

Standardize to lower case.

In [11]:
annotation_df['utf8_string'] = annotation_df['utf8_string'].apply(lambda word: word.lower() if isinstance(word, str) else word)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotation_df['utf8_string'] = annotation_df['utf8_string'].apply(lambda word: word.lower() if isinstance(word, str) else word)


Remove nonalphanumeric characters.

In [12]:
annotation_df['utf8_string'] = annotation_df['utf8_string'].apply(lambda word: ''.join(char for char in word if char.isalnum() or char.isspace()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotation_df['utf8_string'] = annotation_df['utf8_string'].apply(lambda word: ''.join(char for char in word if char.isalnum() or char.isspace()))


---
# Saved Cleaned Data

In [13]:
cleaned_annotation_df = annotation_df[['id', 'image_id', 'x1', 'y1', 'x2', 'y2', 'utf8_string', 'area']].copy()

In [14]:
cleaned_annotation_df

Unnamed: 0,id,image_id,x1,y1,x2,y2,utf8_string,area
0,a4ea732cd3d5948a_1,a4ea732cd3d5948a,525.83,3.40,197.64,33.94,performance,6707.90
1,a4ea732cd3d5948a_2,a4ea732cd3d5948a,534.67,64.68,91.22,38.19,sport,3483.69
2,a4ea732cd3d5948a_3,a4ea732cd3d5948a,626.95,63.62,96.52,31.82,watch,3071.27
3,a4ea732cd3d5948a_4,a4ea732cd3d5948a,577.40,141.87,147.13,43.10,period,6341.30
5,a4ea732cd3d5948a_6,a4ea732cd3d5948a,455.64,204.45,21.97,24.82,400,545.30
...,...,...,...,...,...,...,...,...
1052347,0ebbecdc46b78d42_13,0ebbecdc46b78d42,507.16,273.18,100.70,135.01,7,13595.51
1052348,0ebbecdc46b78d42_14,0ebbecdc46b78d42,387.77,149.05,104.33,127.75,8,13328.16
1052350,a37e1fb026b80a6d_1,a37e1fb026b80a6d,331.69,462.84,417.31,201.08,rör,83912.69
1052351,a37e1fb026b80a6d_2,a37e1fb026b80a6d,876.75,285.63,36.98,10.28,moderna,380.15


In [15]:
cleaned_annotation_df.to_csv(os.path.join(DATA_PATH, 'cleaned_annot.csv'), index=False)