In [25]:
import pandas as pd

# Baseline MLP Binary Classification Experiment on Big Data

In [26]:
PATH = "../../my_data/identification-dataset/my_custom_data/big-identification-dataset.csv"
df = pd.read_csv(PATH)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160260 entries, 0 to 160259
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tdoa           160260 non-null  float64
 1   snr_an         160260 non-null  float64
 2   power_dif      160213 non-null  float64
 3   rx_snr         119746 non-null  float64
 4   rx_powerdif    119744 non-null  float64
 5   tof            119746 non-null  float64
 6   material       160260 non-null  object 
 7   NLOS_material  160260 non-null  int64  
dtypes: float64(6), int64(1), object(1)
memory usage: 9.8+ MB


In [28]:
df.head()

Unnamed: 0,tdoa,snr_an,power_dif,rx_snr,rx_powerdif,tof,material,NLOS_material
0,-0.211129,145.678574,10.307022,215.300003,11.049919,5.009119,cardboard,1
1,-0.197054,134.321426,10.824654,192.0,11.397888,5.027886,cardboard,1
2,-0.197054,134.321426,10.824654,236.149994,9.84436,5.06542,cardboard,1
3,-0.248663,104.10714,11.064209,236.149994,9.84436,5.06542,cardboard,1
4,-0.182979,113.25,10.954155,185.0,10.330307,5.027886,cardboard,1


In [29]:
df.describe()

Unnamed: 0,tdoa,snr_an,power_dif,rx_snr,rx_powerdif,tof,NLOS_material
count,160260.0,160260.0,160213.0,119746.0,119744.0,119746.0,160260.0
mean,-0.016624,120.816046,12.867739,198.812671,11.032414,5.043572,1.752352
std,2.262903,39.969214,3.813719,31.198747,1.248188,0.022086,2.127614
min,-373.351807,6.208333,-17.027435,0.118671,-35.275032,4.952817,0.0
25%,-0.211129,109.790176,11.160553,177.791672,10.306816,5.027886,0.0
50%,-0.011729,127.75,12.240959,198.449997,10.761261,5.041961,0.5
75%,0.211129,144.125,12.946327,222.399994,11.368462,5.056036,4.0
max,4.077143,326.200012,33.626175,330.0,36.156631,5.14518,6.0


In [30]:
df['NLOS_material'].unique()

array([1, 0, 2, 3, 4, 5, 6])

## Classification

In [32]:
# Import modules from Scikit-learn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics   # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [33]:
# Create a pipeline object for the model
pipe_MLP = make_pipeline(StandardScaler(),
                         MLPClassifier(solver='adam',
                                       hidden_layer_sizes=(100,100,100,), # 3 hidden layers with (100x100x100) neurons
                                       random_state=0,
                                       max_iter=500,           # TODO: tune it later
                                       verbose=True
                                       )
                         )

In [34]:
# Extracted features 
X = df[['tdoa', 'snr_an', 'power_dif', 'rx_snr', 'rx_powerdif', 'tof', 'NLOS_material']]
y = df['NLOS_material'] # Labels

In [35]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [36]:
# Train the whole pipeline
pipe_MLP.fit(X_train, y_train)

Iteration 1, loss = 0.20224145
Iteration 2, loss = 0.00559494
Iteration 3, loss = 0.00334631
Iteration 4, loss = 0.00217787
Iteration 5, loss = 0.00138537
Iteration 6, loss = 0.00184994
Iteration 7, loss = 0.00024131
Iteration 8, loss = 0.00019795
Iteration 9, loss = 0.00016824
Iteration 10, loss = 0.00014952
Iteration 11, loss = 0.00015504
Iteration 12, loss = 0.00011742
Iteration 13, loss = 0.00011020
Iteration 14, loss = 0.00010493
Iteration 15, loss = 0.00010810
Iteration 16, loss = 0.00009944
Iteration 17, loss = 0.00009780
Iteration 18, loss = 0.00009567
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500,
                               random_state=0, verbose=True))])

In [37]:
# Make predictions on the test data
y_pred = pipe_MLP.predict(X_test)

In [38]:
# Caluclate the accuracy on test data predicitons
print(f'Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100}%')

Test Accuracy: 100.0%
