In [1]:
import pickle
import os
import re
import glob
import pandas as pd

In [2]:
# this is the path to the folder where you have the CSVs, NO OTHER CSVs SHOULD BE PRESENT
# please make sure this path is not inside the scope of GitHub so we do not go over on data for our repo
path = r'C:\RotorCraftData\CSV'
pattern = r'.*2023\.06\.15.*\.csv$'


In [3]:

# this imports a list of columns that was saved after the removal of variance on a single CSV, this list will be used to define which columns to read in
with open('./src/use_cols.pkl', 'rb') as f:
    use_cols = pickle.load(f)


In [4]:
# the data will be labeled using the information from the flight logs
label_table = pd.DataFrame({
    'Date': ['2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15', '2023-06-15'],  # Replace with actual dates of maneuvers
    # Replace with actual start time of maneuvers
    'StartTime': ['13:22:15.0', '13:25:25.0', '13:29:25.0', '11:56:25.0', '11:58:03.0', '11:59:51.0', '16:10:04.0', '16:11:41.0', '16:14:20.0', '13:43:12.0', '13:44:10.0', '13:45:19.0', '12:08:11.0', '12:09:31.0', '12:10:51.0', '16:34:28.0', '16:35:06.0', '16:38:26.0'],
    # Replace with actual end time of maneuvers
    'EndTime': ['13:22:25.0', '13:25:38.0', '13:29:40.0', '11:56:38.0', '11:58:24.0', '12:00:00.0', '16:10:12.0', '16:11:46.0', '16:14:29.0', '13:43:35.0', '13:44:18.0', '13:45:30.0', '12:08:35.0', '12:09:52.0', '12:11:18.0', '16:34:42.0', '16:35:27.0', '16:38:36.0'],
    'Label': ['Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'Dynamic Rollover', 'LOW-G', 'LOW-G', 'LOW-G', 'LOW-G', 'LOW-G', 'LOW-G', 'LOW-G', 'LOW-G', 'LOW-G']  # Replace with maneuver names
})

# convert date, start time, and end time columns to datetime type
label_table['Date'] = pd.to_datetime(label_table['Date'])
label_table['StartTime'] = pd.to_datetime(
    label_table['StartTime'], format='%H:%M:%S.%f').dt.strftime('%H:%M:%S.%f')
label_table['EndTime'] = pd.to_datetime(
    label_table['EndTime'], format='%H:%M:%S.%f').dt.strftime('%H:%M:%S.%f')


def combine_csv_files(csv_directory, columns_to_use, label_df):
    # get list of CSV file paths in the directory
    csv_files = [os.path.join(csv_directory, filename) for filename in os.listdir(
        csv_directory) if re.match(pattern, filename)]
    # create an empty dataframe to store the combined data
    combined_df = pd.DataFrame()

    # iterate over each CSV file
    for file in csv_files:
        # read CSV file and select desired columns
        temp_df = pd.read_csv(file, usecols=columns_to_use, names=columns_to_use, skiprows=3, skipfooter=1, engine='python')
        # drop rows that Elapsed Time are mostly null, these are the breaks in simulation
        temp_df.dropna(subset=['Elapsed Time'], inplace=True)
        # temp_df.drop(['Elapsed Time'], inplace=True)
        temp_df.dropna(inplace=True)
        # concatenate the temporary dataframe with the running dataframe
        combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

    # convert the time column on original df to correct format
    combined_df['System UTC Time'] = pd.to_datetime(
    combined_df['System UTC Time'], format='%H:%M:%S.%f').dt.strftime('%H:%M:%S.%f')
    # convert the date column on original df to correct format
    combined_df['Date'] = pd.to_datetime(combined_df['Date'])
    
    # apply the labeling to the dataset
    for _, row in label_df.iterrows():
        # extract date, start time, and end time from the current row
        date = row['Date']
        start_time = row['StartTime']
        end_time = row['EndTime']
        label = row['Label']

        # filter the existing dataset based on matching date and within start time and end time
        filter_condition = (combined_df['Date'] == date) & (
            combined_df['System UTC Time'].between(start_time, end_time))
        combined_df.loc[filter_condition, 'Label'] = label
    dummies_df = pd.get_dummies(combined_df['Label'], dummy_na=False)
    dummies_df = dummies_df.astype(int)
    combined_df = pd.concat([combined_df, dummies_df], axis=1)
    combined_df.drop(['Elapsed Time', 'Date', 'System UTC Time'], inplace=True, axis=1)
    
    return combined_df


In [7]:

# this calls the function from above that cleans and creates dummy variables for our target variables
df = combine_csv_files(path, use_cols, label_table)
# dataframe is created with one column having the incorrect type
df['Altitude(MSL)'] = df['Altitude(MSL)'].astype('float')
# this will create a pickle file with the working dataframe in your directory with the original CSV files
# you will not need to run this script again, as we will load in the dataframe from the pickle file
with open(f'{path}/working_df.pkl', 'wb') as f:
    pickle.dump(df, f)

KeyError: 'Altitude(MSL)'

In [6]:
df.shape

(487470, 32)

# PCA analysis

In [8]:
#https://www.edureka.co/blog/principal-component-analysis/
#Step 1 - import required packages
import sklearn
from sklearn.preprocessing import StandardScaler
from matplotlib import rcParams
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn.preprocessing import scale
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.cm import register_cmap
from scipy import stats
from sklearn.decomposition import PCA as sklearnPCA
import seaborn
%matplotlib inline

ModuleNotFoundError: No module named 'sklearn'

In [None]:
#Step 2- Select df. I will be using RCdf(136 var). That dataset has
# only numeric values and the date which I could convert to float. System UTC time could not be
#converted to float. I need to research more. 
RCdf.shape


In [None]:
#Step3 - formatting the data. It has been formatted already. Excluding response variables.  , 'label_0','label_Dynamic Rollover','label_LOW-G'.
# df2 has only float a#nd int data columns. Dropping object and uint8.
df2 = RCdf.drop(['label_Dynamic Rollover','label_LOW-G'], axis=1)

In [None]:
# Step 4: Standardization. It run using df2
X_std = StandardScaler().fit_transform(df2)
#X_std
pd.options.display.max_columns = None
print(X_std)

In [None]:
#Step 5: Compute covariance matrix.
#These values show the distribution magnitude and direction of multivariate data in a multidimensional space and 
#can allow you to gather information about how data spreads among two dimensions.
mean_vec = np.mean(X_std, axis=0)
cov_matrix = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
#print('Covariance matrix n%s' %cov_mat)
print(cov_matrix)

In [None]:
#Step 6: Calculate eigenvectors and eigenvalues
#Calculating eigenvectors and eigenvalues on covariance matrix.
#eigenvalue function returns two type of arrays, one dimensional array representing the eigenvalues in 
#the position of the input and another two dimensional array giving the eigenvector corresponding to the columns in the input matrix.
cov_matrix = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors n%s' %eig_vecs)

In [None]:
print('nEigenvalues n%s' %eig_vals)

In [None]:
#Step 7: Compute the feature vector.  rearrange the eigenvalues in descending order. 
#This represents the significance of the principal components in descending order:
# Visually confirm that the list is correctly sorted by decreasing eigenvalues
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])
#The top 10 eigenvalues adds up to 71% of the data. 

In [None]:
#Step 8: Use the PCA() function to reduce the dimensionality of the data set
pca = PCA()
pca.fit_transform(df2)
print(pca.explained_variance_ratio_)
len(pca.explained_variance_ratio_) 

In [None]:
#Step 9:Projecting the variance to the Principle Components
pca = PCA().fit(X_std)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.title('Scree plot of commulative expalined variance to the PCs')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()