In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset/data/<i>_data.tsv for i in 0 .. num_files - 1 and stack them into
# a single dataframe `df` whose leading columns are 'lasers', 'v', 'w'.
base_columns = ['lasers', 'v', 'w']
num_files = 3

# Collect the per-file frames and concatenate once after the loop. Calling
# pd.concat inside the loop re-copies every row already accumulated on each
# iteration, and seeding it with an empty object-dtype frame triggers a
# deprecation warning in recent pandas releases.
frames = []
for i in range(num_files):
    print(f'Reading dataset/data/{i}_data.tsv')
    data = pd.read_csv(f'dataset/data/{i}_data.tsv', sep='\t')
    # Each 'lasers' cell is text: strip the parentheses, split on commas and
    # convert every piece to float, giving one list of floats per row.
    data['lasers'] = data['lasers'].apply(
        lambda x: [float(value) for value in x.replace('(', '').replace(')', '').split(',')]
    )
    frames.append(data)

if frames:
    df = pd.concat(frames, ignore_index=True)
    # Keep 'lasers', 'v', 'w' as the first columns (as the empty seed frame used
    # to guarantee), followed by any additional columns found in the files.
    extra_columns = [col for col in df.columns if col not in base_columns]
    df = df.reindex(columns=base_columns + extra_columns)
else:
    # pd.concat rejects an empty list; with no files, fall back to an empty frame.
    df = pd.DataFrame(columns=base_columns)

Reading dataset/data/0_data.tsv
Reading dataset/data/1_data.tsv
Reading dataset/data/2_data.tsv


In [3]:
# Shuffle the rows of the dataframe and renumber the index from 0.
# random_state is fixed (17, the same seed passed to train_test_split below) so
# that a fresh "Restart & Run All" reproduces the same row order; without it the
# shuffled files written by later cells differ on every run.
df = df.sample(frac=1, random_state=17).reset_index(drop=True)

In [4]:
# Spread each row's list in df['lasers'] across 180 columns named
# laser_0 ... laser_179 (one value per column).
laser_columns = [f'laser_{i}' for i in range(180)]
lasers = pd.DataFrame(df['lasers'].tolist(), columns=laser_columns)
print(lasers.head())

# Per-column count of entries that are +inf or -inf.
inf_mask = lasers.isin([np.inf, -np.inf])
print(inf_mask.sum())

# Summary statistics for the last four laser columns.
last_four = ['laser_176', 'laser_177', 'laser_178', 'laser_179']
print(lasers[last_four].describe())

    laser_0   laser_1   laser_2   laser_3   laser_4   laser_5   laser_6  \
0  1.445742  1.447176  1.449060  1.451396  1.454188  1.457442  1.461160   
1  0.090631  0.090649  0.090694  0.082322  1.526117  1.530020  1.534413   
2  0.090631  0.090648  0.090693  0.082325  1.512301  1.513481  1.515123   
3  0.090747  0.090765  0.090811  0.081970  1.854123  1.855409  1.857268   
4  0.088716  0.088727  0.088765  0.088830  0.088923  0.089044  0.089192   

    laser_7   laser_8   laser_9  ...  laser_170  laser_171  laser_172  \
0  1.465352  1.470016  1.475166  ...   1.667622   1.664600   1.662099   
1  1.539303  1.544705  1.550620  ...   1.596252   1.593861   1.591969   
2  1.517243  1.519831  1.522900  ...   1.620029   1.614709   1.609917   
3  1.859704  1.862720  1.866454  ...   1.273189   1.268899   1.265020   
4  0.089369  0.089573  0.089807  ...   0.092485   0.092236   0.092016   

   laser_173  laser_174  laser_175  laser_176  laser_177  laser_178  laser_179  
0   1.660122   1.658658   1.6

In [5]:
# Substitute a finite value for every +inf / -inf entry.
# NOTE(review): 20 is presumably the sensor's maximum range - confirm.
inf_replacement = 20
lasers = lasers.replace([np.inf, -np.inf], inf_replacement)

# Write the cleaned laser columns to 'dataset/shuffle_lasers.tsv'
# (tab-separated, without the index column).
lasers.to_csv('dataset/shuffle_lasers.tsv', sep='\t', index=False)

In [6]:
# Take the 'w' column of the shuffled dataframe as its own Series.
w = df['w']

# Show count / mean / std / quartiles for w.
w_summary = w.describe()
print(w_summary)

# Write w to 'dataset/shuffle_w.tsv' (tab-separated, without the index column).
w.to_csv('dataset/shuffle_w.tsv', sep='\t', index=False)

count    12022.000000
mean         0.074695
std          1.122612
min         -1.869818
25%         -1.207970
50%          0.587583
75%          1.138162
max          1.696602
Name: w, dtype: float64


In [7]:
# Features are the laser columns; the target is w.
X, y = lasers, w

# Hold out 20% of the rows for testing; random_state=17 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Write each part of the split to its own tab-separated file (no index column).
split_outputs = {
    'dataset/X_train_shuffled.tsv': X_train,
    'dataset/X_test_shuffled.tsv': X_test,
    'dataset/y_train_shuffled.tsv': y_train,
    'dataset/y_test_shuffled.tsv': y_test,
}
for out_path, split_part in split_outputs.items():
    split_part.to_csv(out_path, sep='\t', index=False)