In [109]:
import pandas as pd

# Build a Big Dataset

Problem: <br>
The Toronto Identification Dataset is splitted into multiple csv files. This notebook tries to build a big convenient dataset for classification purposes.

In [110]:
# meta-data
data_types = ['cardboard', 'foam', 'metal', 'plastic', 'wooden-cabinet', 'wooden-shelf']
meas_numbers = 6
type_encoding = {
    'nothing':0,
    'cardboard':1, 
    'foam':2, 
    'metal':3, 
    'plastic':4, 
    'wooden-cabinet':5, 
    'wooden-shelf':6
}

In [111]:
# collecting all relevant files
df_list = []
for data_type in data_types:
    for num in range(1,meas_numbers):
        path = f"../../my_data/identification-dataset/nlos/anTag/{data_type}/data{num}/{data_type}-anTag-data{num}_data.csv"
        df = pd.read_csv(path)
        df['material'] = data_type
        df_list.append(df)

In [112]:
# splitting and relabeling the data
splitted_df_list = []
for df in df_list:
    df_an1 = df[['tdoa12', 'snr_an1', 'power_dif_an1', 'an1_rx_snr', 'an1_rx_powerdif', 'an1_tof', 'material']].copy()
    df_an2 = df[['tdoa21', 'snr_an2', 'power_dif_an2', 'an2_rx_snr', 'an2_rx_powerdif', 'an2_tof', 'material']].copy()

    df_an1.rename({'tdoa12':'tdoa', 
                'snr_an1':'snr_an', 
                'power_dif_an1':'power_dif', 
                'an1_rx_snr':'rx_snr', 
                'an1_rx_powerdif':'rx_powerdif', 
                'an1_tof':'tof'}, axis=1, inplace=True)
    df_an1['NLOS_material'] = type_encoding[df_an1['material'].unique()[0]]

    splitted_df_list.append(df_an1)
    
    df_an2.rename({'tdoa21':'tdoa',
                'snr_an2':'snr_an', 
                'power_dif_an2':'power_dif', 
                'an2_rx_snr':'rx_snr', 
                'an2_rx_powerdif':'rx_powerdif', 
                'an2_tof':'tof'}, axis=1, inplace=True)
    df_an2['NLOS_material'] = type_encoding['nothing']

    splitted_df_list.append(df_an2)

In [113]:
# builing the final df
df = pd.concat(splitted_df_list)
df.drop(columns=['material'])
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160260 entries, 0 to 2662
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tdoa           160260 non-null  float64
 1   snr_an         160260 non-null  float64
 2   power_dif      160213 non-null  float64
 3   rx_snr         119746 non-null  float64
 4   rx_powerdif    119744 non-null  float64
 5   tof            119746 non-null  float64
 6   material       160260 non-null  object 
 7   NLOS_material  160260 non-null  int64  
dtypes: float64(6), int64(1), object(1)
memory usage: 11.0+ MB


In [114]:
# storing the df 
df.to_csv('big-identification-dataset.csv', index=False,  encoding='utf-8')