In [38]:
import pandas as pd
import numpy as np

In [39]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [40]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [41]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [42]:
from datetime import datetime

In [43]:
from scipy.fft import fft

In [44]:
# Load the dataset

In [45]:
# Location of the training CSV. A raw string keeps the Windows backslashes
# literal; in a normal string "\C" and "\T" are invalid escape sequences that
# newer Python versions warn about.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
file_path = r'I:\CSE 499 Data Preprocess\Train_1.csv'

In [46]:
# Read the CSV at file_path into a DataFrame using pandas' default parsing options
data = pd.read_csv(filepath_or_buffer=file_path)

In [47]:
# Preview the first rows, then summarize column dtypes and non-null counts.
print(data.head())
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()

   Unnamed: 0      time       x       y       z   Latitude   Longitude
0         0.0  0.000453  2.1292  4.1400  1.5651  23.766323  90.3567911
1         1.0  0.010498  4.2303  2.1790  4.6071  23.766345   90.356765
2         2.0  0.020455  5.8740  0.2259  5.5304  23.766329  90.3566848
3         3.0  0.030455  6.6837 -2.0443  3.2507  23.766326  90.3566339
4         4.0  0.040455  6.6045 -4.1571  0.9183  23.766952  90.3566392
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4059 non-null   float64
 1   time        4059 non-null   float64
 2   x           4059 non-null   float64
 3   y           4059 non-null   float64
 4   z           4059 non-null   float64
 5   Latitude    4059 non-null   float64
 6   Longitude   4059 non-null   object 
dtypes: float64(6), object(1)
memory usage: 355.6+ KB
None


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0.1,Unnamed: 0,time,x,y,z,Latitude
count,4059.0,4059.0,4059.0,4059.0,4059.0,4059.0
mean,2029.0,20.290463,0.004477,0.455471,-0.766847,23.767244
std,1171.8767,11.718766,1.745974,2.683144,2.838453,0.000663
min,0.0,0.000453,-5.7073,-7.0086,-15.7191,23.765891
25%,1014.5,10.145476,-1.12985,-1.27675,-2.76055,23.766532
50%,2029.0,20.290197,-0.1017,0.4021,-0.9651,23.767269
75%,3043.5,30.435453,0.99015,2.176,1.05195,23.767748
max,4058.0,40.580455,10.0488,17.676599,11.9389,23.76897


In [49]:
# Boolean mask of missing cells (True = missing); only the first rows are shown
data.isnull().head()

Unnamed: 0.1,Unnamed: 0,time,x,y,z,Latitude,Longitude
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False


In [50]:
# Count the missing entries in each column
missing_values = data.isnull().sum(axis=0)
missing_values

Unnamed: 0    2441
time          2441
x             2441
y             2441
z             2441
Latitude      2441
Longitude     2441
dtype: int64

In [51]:
# Show every row that has at least one missing value
data.loc[data.isnull().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,time,x,y,z,Latitude,Longitude
4059,,,,,,,
4060,,,,,,,
4061,,,,,,,
4062,,,,,,,
4063,,,,,,,
...,...,...,...,...,...,...,...
6495,,,,,,,
6496,,,,,,,
6497,,,,,,,
6498,,,,,,,


In [52]:
# Handle missing values by dropping: discard each row that contains any missing value
data = data.dropna(axis=0, how='any')

In [53]:
# Standardize the three raw accelerometer axes and store the result in new acc_* columns.
# NOTE(review): the scaler is fit on every remaining row; if a train/test split
# follows, fitting before the split lets test-set statistics leak into training - confirm.
scaler = StandardScaler().fit(data[['x', 'y', 'z']])
data[['acc_x', 'acc_y', 'acc_z']] = scaler.transform(data[['x', 'y', 'z']])

In [54]:
# Derived feature: Euclidean norm of the raw (unscaled) x, y, z readings
data['accel_magnitude'] = sum(data[axis] ** 2 for axis in ('x', 'y', 'z')) ** 0.5

In [55]:
# Clean the dataset: remove fully duplicated rows.
# Rebinding the result (instead of inplace=True) avoids mutating the frame in
# place; the surviving rows keep their original index labels.
data = data.drop_duplicates()

In [56]:
# Remove outliers with the 1.5 * IQR rule on the standardized axes:
# a row is dropped as soon as any of its three values falls outside the fences.
acc_scaled = data[['acc_x', 'acc_y', 'acc_z']]
Q1 = acc_scaled.quantile(0.25)
Q3 = acc_scaled.quantile(0.75)
IQR = Q3 - Q1
too_low = acc_scaled < (Q1 - 1.5 * IQR)
too_high = acc_scaled > (Q3 + 1.5 * IQR)
data = data[~(too_low | too_high).any(axis=1)]

# Show the first rows that survive the filter
print(data.head())


   Unnamed: 0      time       x       y       z   Latitude   Longitude  \
0         0.0  0.000453  2.1292  4.1400  1.5651  23.766323  90.3567911   
5         5.0  0.050497  2.9136 -3.6812 -2.8109  23.766733  90.3566838   
6         6.0  0.060458 -1.4917 -1.3324 -5.2929  23.766930  90.3566928   
7         7.0  0.070453 -3.2652  0.3540 -6.1232  23.766820  90.3566281   
8         8.0  0.080452 -2.4945  0.2309 -4.0987  23.766894  90.3566893   

      acc_x     acc_y     acc_z  accel_magnitude  
0  1.217077  1.373383  0.821657         4.911479  
5  1.666395 -1.541915 -0.720218         5.471879  
6 -0.857035 -0.666417 -1.594746         5.658202  
7 -1.872926 -0.037823 -1.887300         6.948412  
8 -1.431456 -0.083707 -1.173972         4.803664  
