In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.seasonal import seasonal_decompose
import folium
from folium.plugins import HeatMap
from branca.colormap import linear
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Review note: the data file was referenced by an absolute path - a relative path is recommended instead

In [None]:
# Load the dataset that every later cell works on.
# NOTE(review): the frame is named X_train but is read from 'X_test.csv' - confirm this is the intended file.
X_train=pd.read_csv('X_test.csv')

In [None]:
# Preview the first rows to sanity-check the load.
X_train.head()

In [None]:
# List the column names; they are described in the "Data interpretation" section below.
X_train.columns

# Data interpretation
### Time
- **time**: Time when the data was collected.

### Orientation
- **seconds_elapsed_orientation**: Time elapsed since the orientation measurement started.
- **qz_orientation, qy_orientation, qx_orientation, qw_orientation**: Components of the quaternion representing the device's orientation in 3D space.
- **roll_orientation, pitch_orientation, yaw_orientation**: Angles representing the roll, pitch, and yaw of the device.

### Location
- **seconds_elapsed_location**: Time elapsed since the location measurement started.
- **bearingAccuracy_location**: Accuracy of bearing measurement.
- **speedAccuracy_location**: Accuracy of speed measurement.
- **verticalAccuracy_location, horizontalAccuracy_location**: Accuracy of vertical and horizontal location measurement.
- **speed_location**: Speed of the device's movement.
- **bearing_location**: Direction of device movement.
- **altitude_location**: Altitude above sea level.
- **longitude_location, latitude_location**: Geographic coordinates.

### Total Acceleration
- **seconds_elapsed_total_acceleration**: Duration of total acceleration measurement.
- **z_total_acceleration, y_total_acceleration, x_total_acceleration**: Components of the total acceleration in the XYZ axes.

### Magnetometer
- **seconds_elapsed_magnetometer**: Duration of magnetometer measurement.
- **z_magnetometer, y_magnetometer, x_magnetometer**: Components of the magnetic field in the XYZ axes.

### Gyroscope
- **seconds_elapsed_gyroscope**: Duration of gyroscope measurement.
- **z_gyroscope, y_gyroscope, x_gyroscope**: Components of angular velocity in the XYZ axes.

### Pedometer
- **seconds_elapsed_pedometer**: Duration of pedometer measurement.
- **steps_pedometer**: Number of steps taken by the user.

### GPS Location
- **seconds_elapsed_location_gps**: Duration of GPS location measurement.
- **bearingAccuracy_location_gps, speedAccuracy_location_gps, verticalAccuracy_location_gps, horizontalAccuracy_location_gps**: Accuracy parameters of GPS location measurement.
- **speed_location_gps, bearing_location_gps, altitude_location_gps, longitude_location_gps, latitude_location_gps**: GPS location parameters.

### Network Location
- **seconds_elapsed_location_network**: Duration of network location measurement.
- **bearingAccuracy_location_network, speedAccuracy_location_network, verticalAccuracy_location_network, horizontalAccuracy_location_network**: Accuracy parameters of network location measurement.
- **speed_location_network, bearing_location_network, altitude_location_network, longitude_location_network, latitude_location_network**: Network location parameters.

### Gravity
- **seconds_elapsed_gravity**: Duration of gravity measurement.
- **z_gravity, y_gravity, x_gravity**: Components of the gravitational vector in the XYZ axes.


## Stats and distributions

In [None]:
# Descriptive statistics for X_train; include='all' is passed so that no column is filtered out by dtype.
summary_stats = X_train.describe(include='all')
# Bare last expression so the notebook renders the table.
summary_stats

In [None]:
# Narrow the summary statistics to a handful of z-axis / qz channels, one per sensor group.
selected_columns = [
    'qz_orientation',
    'z_total_acceleration',
    'z_gravity',
    'z_magnetometer',
    'z_accelerometer',
    'z_gyroscope',
]
summary_table = summary_stats[selected_columns]
print(summary_table)

In [None]:
# Histogram of every column, laid out on a grid that is six panels wide.
# The number of rows is derived from the column count (ceiling division), so
# the cell keeps working if columns are added or removed; a fixed 11x6 grid
# raises once there are more than 66 columns.
GRID_COLS = 6
hist_columns = list(X_train.columns)
grid_rows = -(-len(hist_columns) // GRID_COLS)

fig = plt.figure(figsize=(20, 40))

for position, column in enumerate(hist_columns, start=1):
    ax = fig.add_subplot(grid_rows, GRID_COLS, position)
    ax.hist(X_train[column], bins=20, color='skyblue', edgecolor='black')
    ax.set(title=column, xlabel='Value', ylabel='Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Looking for outliers: one box plot per column on a grid six panels wide.
# Grid rows are computed from the column count instead of being fixed at 11.
GRID_COLS = 6
box_columns = list(X_train.columns)
grid_rows = -(-len(box_columns) // GRID_COLS)

fig = plt.figure(figsize=(20, 40))

for position, column in enumerate(box_columns, start=1):
    ax = fig.add_subplot(grid_rows, GRID_COLS, position)
    sns.boxplot(data=X_train[column], ax=ax)
    ax.set_title(column)

# Lay out once after all panels exist; calling tight_layout() inside the loop
# re-solved the whole layout for every added panel without changing the result.
fig.tight_layout()
plt.show()

# Review note: outliers are visible - removing them is recommended

In [None]:
# NOTE(review): stray expression - this cell only displays a one-element list and does not touch X_train.
# No other visible cell uses this name; possibly 'speedAccuracy_location' was meant - confirm or remove.
['speed_location_accuracy']

In [None]:
# Pairwise correlation between all columns, rendered as an annotated heatmap.
correlation = X_train.corr()

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Count distinct values in the network-location columns that look constant.
# One loop replaces four copy-pasted print statements; the printed text is unchanged.
network_columns = (
    "bearingAccuracy_location_network",
    "speedAccuracy_location_network",
    "speed_location_network",
    "bearing_location_network",
)
for network_column in network_columns:
    print(f"The number of unique values of a variable {network_column}:", X_train[network_column].nunique())

We see that the "bearingAccuracy_location_network", "speedAccuracy_location_network", "speed_location_network" and "bearing_location_network" variables each contain a single unique value (0), so they carry no information and can be removed.

In [None]:
# NOTE(review): this drop is disabled, so the four constant network columns stay in X_train
# and are still part of the input to the PCA and ICA cells further down.
#columns_to_remove = ["bearingAccuracy_location_network", "speedAccuracy_location_network", "speed_location_network", "bearing_location_network"]
#X_train = X_train.drop(columns_to_remove, axis=1)

In [None]:
# Show the per-sensor elapsed-time columns side by side for comparison.
elapsed_time_columns = [
    "seconds_elapsed_orientation",
    "seconds_elapsed_location",
    "seconds_elapsed_total_acceleration",
    "seconds_elapsed_magnetometer",
    "seconds_elapsed_accelerometer",
    "seconds_elapsed_location_gps",
    "seconds_elapsed_gyroscope",
    "seconds_elapsed_pedometer",
    "seconds_elapsed_location_network",
    "seconds_elapsed_gravity",
]
X_train[elapsed_time_columns]


The columns showing the time elapsed since the start of the measurement on different devices show almost the same data, which can be seen in the correlation matrix. The exceptions are the "seconds_elapsed_location_network" and "seconds_elapsed_pedometer" columns, which are different from the others. This is probably due to the device's lack of Internet access and inaccurate step measurement.
We will average the values of the remaining columns to reduce the dimensions of the matrix.

In [None]:
# NOTE(review): left disabled - the drop below would remove "seconds_elapsed_location",
# which later plotting cells still read, so enabling it as-is would break them.
#X_train["seconds_elapsed"] = X_train[["seconds_elapsed_orientation", "seconds_elapsed_location", "seconds_elapsed_total_acceleration", "seconds_elapsed_magnetometer",
#        "seconds_elapsed_accelerometer", "seconds_elapsed_location_gps", "seconds_elapsed_gyroscope", "seconds_elapsed_gravity"]].mean(axis=1)
#
#X_train = X_train.drop(["seconds_elapsed_orientation", "seconds_elapsed_location", "seconds_elapsed_total_acceleration", "seconds_elapsed_magnetometer",
#        "seconds_elapsed_accelerometer", "seconds_elapsed_location_gps", "seconds_elapsed_gyroscope", "seconds_elapsed_gravity"], axis=1)


In [None]:
# Re-inspect the frame. NOTE(review): the two transformation cells above are commented out,
# so this shows the same data as the first preview.
X_train.head()

In [None]:
# Compare the two speed readings: 'speed_location' versus 'speed_location_gps'.
speed_location = X_train['speed_location']
speed_location_gps = X_train['speed_location_gps']

# Point-by-point agreement between the two columns.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(speed_location, speed_location_gps, alpha=0.5)
ax.set_xlabel('Speed Location')
ax.set_ylabel('Speed Location GPS')
ax.set_title('Comparison of Speed Location and Speed Location GPS')
ax.grid(True)
plt.show()

# Overlaid distributions of both columns.
fig, ax = plt.subplots(figsize=(10, 4))
for speed_series, speed_label in (
    (speed_location, 'Speed Location'),
    (speed_location_gps, 'Speed Location GPS'),
):
    ax.hist(speed_series, bins=20, alpha=0.5, label=speed_label)
ax.set_xlabel('Speed')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Speed Location and Speed Location GPS')
ax.legend()
ax.grid(True)
plt.show()

## Timeseries

In [None]:
# Interpret the raw 'time' values as nanosecond timestamps and summarise the resulting datetimes.
# NOTE(review): unit='ns' assumes 'time' holds epoch nanoseconds - confirm against the data source.
time_to_date= pd.to_datetime(X_train['time'], unit='ns')
time_to_date.describe()

In [None]:
# Column listing again, as a reference for choosing the series to decompose below.
X_train.columns

In [None]:
# Number of consecutive samples that seasonal_decompose treats as one cycle.
# The previous value, period=1, reduces the moving-average filter to a single
# sample, so "Trend" came out identical to "Observed" and the plot showed two
# coinciding lines.
# NOTE(review): 50 is a starting point - tune it to the sensors' sampling rate.
DECOMPOSITION_PERIOD = 50

selected_columns = [
    'qz_orientation', 'qy_orientation', 'qx_orientation',
    'speed_location', 'bearing_location', 'altitude_location',
    'longitude_location', 'latitude_location',
    'z_total_acceleration', 'y_total_acceleration', 'x_total_acceleration',
]

# Collect one frame per column and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration.
decomp_frames = []
for col in selected_columns:
    result = seasonal_decompose(X_train[col], model='additive', period=DECOMPOSITION_PERIOD)
    temp_df = pd.DataFrame({
        'Date': X_train.index,  # row index is used as the x axis
        'Observed': result.observed,
        'Trend': result.trend,
        'Column': col
    })
    decomp_frames.append(temp_df)

decomp_df = pd.concat(decomp_frames, ignore_index=True)

# Long format: one row per (Date, Column, Component) so plotly can colour by component.
decomp_melted = decomp_df.melt(id_vars=['Date', 'Column'], value_vars=['Observed', 'Trend'], var_name='Component', value_name='Value')

fig = px.line(decomp_melted, x='Date', y='Value', color='Component', facet_col='Column', facet_col_wrap=3, height=800)
fig.update_layout(title_text='Seasonal Decomposition of Time Series (Observed and Trend)', title_x=0.5)
fig.show()


In [None]:
import matplotlib as mpl

# Set the Agg renderer's path chunk size before the many line plots drawn below.
mpl.rcParams.update({'agg.path.chunksize': 10000})

def downsample_data(data, target_size=1000):
    """Thin `data` to at most `target_size` evenly spaced rows.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Any object that supports ``len()`` and ``.iloc`` slicing.
    target_size : int, default 1000
        Upper bound on the number of rows returned; must be at least 1.

    Returns
    -------
    same type as `data`
        `data` itself when it already has at most `target_size` rows,
        otherwise every ``step``-th row starting from the first one.

    Raises
    ------
    ValueError
        If `target_size` is smaller than 1.
    """
    if target_size < 1:
        raise ValueError("target_size must be at least 1")

    n_rows = len(data)
    if n_rows <= target_size:
        return data

    # Ceiling division. Flooring the ratio (e.g. 1999 rows / 1000 -> step 1)
    # kept up to almost twice `target_size` rows.
    step = int(-(-n_rows // target_size))
    return data.iloc[::step]

# Number of consecutive samples that seasonal_decompose treats as one cycle.
# The previous value, period=1, made the trend identical to the observed series.
# NOTE(review): 50 is a starting point - tune it to the sensors' sampling rate.
DECOMPOSITION_PERIOD = 50

decomposed_data = {}

# Thin every column to a plottable number of rows.
X_train_downsampled = X_train.apply(downsample_data)

# seasonal_decompose needs at least two full cycles, so cap the period at half
# the number of available rows.
decomposition_period = max(2, min(DECOMPOSITION_PERIOD, len(X_train_downsampled) // 2))

for column in X_train_downsampled.columns:
    decomposition = seasonal_decompose(X_train_downsampled[column], model='additive', period=decomposition_period)
    decomposed_data[column] = decomposition

# One figure per column with trend and observed series stacked on a shared x axis.
# Previously each column produced two separate 12x10 figures that each used only
# a quarter of their canvas and were never closed.
for column, decomposition in decomposed_data.items():
    fig, (ax_trend, ax_observed) = plt.subplots(2, 1, figsize=(12, 6), sharex=True)

    decomposition.trend.plot(ax=ax_trend)
    ax_trend.set_title(f'Trend Component - {column}')
    ax_trend.set_ylabel('Trend Component')

    decomposition.observed.plot(ax=ax_observed)
    ax_observed.set_title(f'Observed Data - {column}')
    ax_observed.set_xlabel('Time')
    ax_observed.set_ylabel('Observed Data')

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; this loop creates one per column



# Review note: the plots are not explained and no conclusions are drawn from them - they take up space unnecessarily

## Geovisualization

In [None]:

# Marker map: a reproducible random sample of positions plus the four extreme points.
coordinate_columns = ['latitude_location', 'longitude_location']

# Centre the map on the mean position.
m = folium.Map(location=[X_train['latitude_location'].mean(), X_train['longitude_location'].mean()], zoom_start=10)

northmost_point = X_train.loc[X_train['latitude_location'].idxmax(), coordinate_columns]
southmost_point = X_train.loc[X_train['latitude_location'].idxmin(), coordinate_columns]
eastmost_point = X_train.loc[X_train['longitude_location'].idxmax(), coordinate_columns]
westmost_point = X_train.loc[X_train['longitude_location'].idxmin(), coordinate_columns]

# 50 random data points, drawn in a single seeded call so the map is identical
# on every run and no row is picked twice (capped for frames shorter than 50 rows).
random_records = X_train.sample(n=min(50, len(X_train)), random_state=42)
for latitude, longitude in zip(random_records['latitude_location'], random_records['longitude_location']):
    folium.Marker([latitude, longitude], tooltip='Random Location').add_to(m)

# Extreme points are highlighted in red.
extreme_points = {
    'Northmost Point': northmost_point,
    'Southmost Point': southmost_point,
    'Eastmost Point': eastmost_point,
    'Westmost Point': westmost_point,
}
for tooltip, point in extreme_points.items():
    folium.Marker(
        [point['latitude_location'], point['longitude_location']],
        tooltip=tooltip,
        icon=folium.Icon(color='red'),
    ).add_to(m)

m.save("map.html")
m


### Heatmap

In [None]:
# Density heatmap of every recorded position, centred on the mean coordinate.
heatmap_center = [X_train['latitude_location'].mean(), X_train['longitude_location'].mean()]
map_heatmap = folium.Map(location=heatmap_center, zoom_start=10)

# (latitude, longitude) pairs in row order.
location_data = list(zip(X_train['latitude_location'], X_train['longitude_location']))

heatmap = HeatMap(location_data)
heatmap.add_to(map_heatmap)

map_heatmap.save("heatmap.html")

### Speed map

In [None]:
# Speed map: every 50th row drawn as a circle coloured by 'speed_location'.
sampled_data = X_train[::50]

map_speed = folium.Map(location=[sampled_data['latitude_location'].mean(), sampled_data['longitude_location'].mean()], zoom_start=10)

# Use the vectorised Series.min()/max(): the Python builtins iterate element by
# element and their result depends on where NaN values sit in the column.
speed_min = sampled_data['speed_location'].min()
speed_max = sampled_data['speed_location'].max()
speed_colormap = linear.YlOrRd_09.scale(speed_min, speed_max)
speed_colormap.caption = 'Speed (m/s)'

for lat, lon, speed in zip(sampled_data['latitude_location'], sampled_data['longitude_location'], sampled_data['speed_location']):
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        color=None,  # no outline; only the fill carries the speed colour
        fill=True,
        fill_color=speed_colormap(speed),
        fill_opacity=0.7,
        popup=f'Speed: {speed} m/s'
    ).add_to(map_speed)

# Attach the colour scale as the map legend.
map_speed.add_child(speed_colormap)

map_speed.save("speed_map_gradient.html")

In [None]:
# Polar scatter of bearing (angle) against elapsed time (radius), using every n-th row.
n = 10

# 'bearing_location' is converted from degrees to radians for the polar axes.
theta = np.deg2rad(X_train['bearing_location'][::n])
r = X_train['seconds_elapsed_location'][::n]

# The GPS bearing is paired with the GPS elapsed time. The radius was previously
# taken from 'seconds_elapsed_location', a copy-paste of the line above.
theta_gps = np.deg2rad(X_train['bearing_location_gps'][::n])
r_gps = X_train['seconds_elapsed_location_gps'][::n]

fig = plt.figure(figsize=(20, 8))

ax1 = fig.add_subplot(1, 2, 1, projection='polar')
ax1.scatter(theta, r, color='b', alpha=0.5, label='Bearing Location')
ax1.set_title('Bearing Location over Time (Sampled)', va='bottom')
ax1.legend(loc='upper right')

ax2 = fig.add_subplot(1, 2, 2, projection='polar')
ax2.scatter(theta_gps, r_gps, color='r', alpha=0.5, label='Bearing Location GPS')
ax2.set_title('Bearing Location GPS over Time (Sampled)', va='bottom')
ax2.legend(loc='upper right')

plt.show()

In [None]:
# NOTE(review): third listing of the same columns - X_train has not been modified since the first one.
X_train.columns

# Review note: error - something is wrong with the code; the plots add no information

In [None]:
# Speed and the three total-acceleration components plotted against elapsed location time.
elapsed_location = X_train['seconds_elapsed_location']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(elapsed_location, X_train['speed_location'], color='blue', alpha=0.7)
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Speed (m/s)')
ax.set_title('Speed vs. Time')
ax.grid(True)
plt.show()

# Same x axis; one colour per acceleration axis, drawn in x, y, z order.
acceleration_traces = (
    ('x_total_acceleration', 'red', 'X Acceleration'),
    ('y_total_acceleration', 'green', 'Y Acceleration'),
    ('z_total_acceleration', 'blue', 'Z Acceleration'),
)
fig, ax = plt.subplots(figsize=(10, 6))
for acceleration_column, trace_color, trace_label in acceleration_traces:
    ax.scatter(elapsed_location, X_train[acceleration_column], color=trace_color, label=trace_label, alpha=0.7)
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Total Acceleration (m/s^2)')
ax.set_title('Total Acceleration vs. Time')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Pedometer step count plotted against the pedometer's own elapsed-time column.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(X_train['seconds_elapsed_pedometer'], X_train['steps_pedometer'], label='Steps', color='blue')

ax.set_xlabel('Time')
ax.set_ylabel('Steps')
ax.set_title('Steps over Time')
ax.legend()
ax.grid(True)
plt.show()

The step count versus time graph shows negative time. This is due to the step measurement device not starting correctly. However, it can be seen that the number of steps increases linearly with time, which suggests a constant speed during walking.

## PCA

# Review note: the explained variance should be interpreted before choosing n_components

# Review note: major error - the data is not normalized before PCA

In [None]:
# Standardise every feature before PCA. PCA picks the directions of largest
# variance, so on raw values the columns with the largest numeric range
# dominate the components regardless of how informative they are.
X_train_scaled = StandardScaler().fit_transform(X_train)

# Project onto the first two principal components for a 2-D view.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_train_scaled)

pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# Seeded so the printed sample is the same on every run.
print(pca_df.sample(5, random_state=42))

In [None]:
# Scatter of the samples in the plane of the first two principal components.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', ax=ax)
ax.set_title('PCA Visualization of Data')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

In [None]:
# Share of total variance captured by each fitted component (one value per component).
print(pca.explained_variance_ratio_)

### Possible Reasons for Such a Result in PCA:

### High Variable Correlation:
If variables are highly correlated, PCA may "capture" most of the variance in one component.

### Lack of Data Diversity:
If data lacks diversity or has low variability, PCA may struggle to detect significant differences between samples.

Given the nature of the data being measurements from several sensors, they are likely to be similar, with values close to each other. Additionally, some variables are highly correlated due to repetitive input data.

After analyzing correlations and dependencies between variables and conducting column reduction, PCA will be performed again.

In [None]:
# Fit PCA with all components on standardised features to see how many are
# needed to reach 95% of the variance. Without scaling, the columns with the
# largest raw magnitudes would absorb almost all of the variance.
X_train_scaled = StandardScaler().fit_transform(X_train)

# n_components=None keeps min(n_samples, n_features) components, so the fit
# also works when there are fewer rows than columns.
pca = PCA(n_components=None)
pca.fit(X_train_scaled)

cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# argmax on the boolean mask returns the first index where the threshold is met.
optimal_num_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1

print(f"Optimal number of components to explain 95% variance: {optimal_num_components}")
print("Explained variance ratio for first few components:")
# Capped at the number of fitted components so narrow data does not raise IndexError.
for i in range(min(5, len(pca.explained_variance_ratio_))):
    print(f"PC{i+1}: {pca.explained_variance_ratio_[i]}")

In [None]:
# Independent Component Analysis on standardised features, reduced to two components.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# random_state fixes the FastICA initialisation so results are repeatable.
ica = FastICA(n_components=2, random_state=42)
X_train_ica = ica.fit_transform(X_train_scaled)

ica_df = pd.DataFrame(data=X_train_ica, columns=['IC1', 'IC2'])

# Seeded so the printed sample is the same on every run.
print(ica_df.sample(5, random_state=42))

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='IC1', y='IC2', data=ica_df, ax=ax)
ax.set_title('ICA Visualization of Data')
ax.set_xlabel('Independent Component 1')
ax.set_ylabel('Independent Component 2')
plt.show()

# mixing_ has one row per input feature; label the rows with the feature names
# so each weight can be traced back to its column instead of a bare position.
mixing_matrix = pd.DataFrame(ica.mixing_, index=X_train.columns, columns=['IC1', 'IC2'])
print("Mixing Matrix:")
print(mixing_matrix)