In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns


In [31]:
# Folder that holds the recorded CSV files. Kept in a single constant so the
# notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = 'C:/Users/daral/OneDrive - Atlantic TU/year 4/Gesture UI/project1'

# CSV file names, listed in the order in which they are loaded.
CSV_FILE_NAMES = [
    'peugeot_207_02.csv',
    'peugeot_207_01.csv',
    'opel_corsa_01.csv',
    'opel_corsa_02.csv',
]

# Full paths, joined with '/' so the strings are the same on every platform.
file_paths = [f'{DATA_DIR}/{file_name}' for file_name in CSV_FILE_NAMES]

def get_columns(file_path):
    """Read only the header of a semicolon-separated CSV file.

    Parameters
    ----------
    file_path : path of the CSV file, decoded as latin1.

    Returns
    -------
    pandas.DataFrame
        A frame with zero data rows (``nrows=0``); its ``.columns`` holds the
        column names found in the file's header.
    """
    header_only = pd.read_csv(file_path, sep=';', encoding='latin1', nrows=0)
    return header_only

# Map every CSV path to the list of column names found in its header
file_columns = {
    csv_path: get_columns(csv_path).columns.tolist()
    for csv_path in file_paths
}

In [32]:
# Take the column list of the first entry in file_paths as the reference
# that the column lists of all files are compared against.
reference_columns = file_columns[file_paths[0]]

def lists_match(list1, list2):
    """Return the result of ``list1 == list2``.

    For two lists this is True only when they have the same length and equal
    elements at every position, so the comparison is order-sensitive.
    """
    are_equal = (list1 == list2)
    return are_equal

# True only if every file's column list equals the reference column list
all_match = all(
    lists_match(reference_columns, columns)
    for columns in file_columns.values()
)

print(f"All files have the same structure: {all_match}")


All files have the same structure: True


In [33]:
def concatenate_csv_files(file_paths):
    """Load several semicolon-separated CSV files and stack them row-wise.

    Parameters
    ----------
    file_paths : iterable of CSV file paths; each file is read with
        ``delimiter=';'`` and ``encoding='latin1'``.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in the order of ``file_paths``, with a fresh
        0..n-1 index (``ignore_index=True``).
    """
    frames = [
        pd.read_csv(csv_path, delimiter=';', encoding='latin1')
        for csv_path in file_paths
    ]
    return pd.concat(frames, ignore_index=True)

# Build one DataFrame from every file listed in file_paths
combined_df = concatenate_csv_files(file_paths)

# Quick sanity check: show the first 5 rows of the combined DataFrame
print(combined_df.head(n=5))

   Unnamed: 0  AltitudeVariation  VehicleSpeedInstantaneous  \
0          59          -0.599998                  24.299999   
1          60           0.099998                  27.900000   
2          61          -0.200001                  30.599998   
3          62          -0.899998                  29.699999   
4          63          -0.900002                  28.799999   

   VehicleSpeedAverage  VehicleSpeedVariance  VehicleSpeedVariation  \
0             8.720689             46.843772               6.299999   
1             9.045762             52.270798               3.600000   
2             9.405000             59.127937               2.699999   
3             9.900000             64.360674              -0.900000   
4            10.380000             68.519589              -0.900000   

   LongitudinalAcceleration  EngineLoad  EngineCoolantTemperature  \
0                   -1.2903   67.058823                      26.0   
1                   -1.4038    0.000000                 

The `.info()` method helps identify columns with missing values and shows the data type of each column.


In [34]:
# Inspect the concatenated DataFrame. In the cells shown, `df` is only a
# local variable inside concatenate_csv_files, so referring to it here
# fails with NameError after Restart Kernel -> Run All.
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446 entries, 0 to 4445
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 4446 non-null   int64  
 1   AltitudeVariation          4446 non-null   float64
 2   VehicleSpeedInstantaneous  4440 non-null   float64
 3   VehicleSpeedAverage        4446 non-null   float64
 4   VehicleSpeedVariance       4446 non-null   float64
 5   VehicleSpeedVariation      4446 non-null   float64
 6   LongitudinalAcceleration   4446 non-null   float64
 7   EngineLoad                 4446 non-null   float64
 8   EngineCoolantTemperature   4446 non-null   int64  
 9   ManifoldAbsolutePressure   4446 non-null   int64  
 10  EngineRPM                  4446 non-null   float64
 11  MassAirFlow                4446 non-null   float64
 12  IntakeAirTemperature       4446 non-null   int64  
 13  VerticalAcceleration       4446 non-null   float

The `.describe()` method shows a quick statistical summary of the numerical columns.

In [35]:
# Summarise the concatenated DataFrame. In the cells shown, `df` is only a
# local variable inside concatenate_csv_files, so referring to it here
# fails with NameError after Restart Kernel -> Run All.
combined_df.describe()


Unnamed: 0.1,Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage
count,4446.0,4446.0,4440.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0,4446.0
mean,2333.006073,0.100135,22.962508,22.975994,137.937403,-0.014014,-0.137473,39.771728,62.05623,105.55623,1138.302632,11.446912,20.624606,-0.150632,16.710177
std,1323.814689,1.797515,18.623617,14.42431,132.251407,2.258688,0.758267,25.467201,18.477506,5.390143,389.589388,5.967084,5.206478,0.560713,4.136263
min,59.0,-9.199997,0.0,0.0,0.0,-31.072817,-3.065,0.0,15.0,96.0,0.0,0.88,11.0,-2.5109,10.344559
25%,1170.25,-0.599998,5.4,11.39625,48.322445,-0.9,-0.5635,27.843138,47.0,102.0,780.5,5.58,17.0,-0.453725,12.978312
50%,2322.5,0.0,22.5,21.021283,103.170886,0.0,-0.217,36.862747,66.0,103.0,1060.5,10.36,20.0,-0.051,15.995884
75%,3467.75,0.900002,36.899998,35.459999,174.184256,0.9,0.307475,60.0,79.0,107.0,1473.5,16.629999,24.0,0.1961,20.450338
max,4622.0,11.400002,72.0,59.984998,864.046635,30.599998,2.2448,100.0,86.0,144.0,2239.0,30.99,41.0,1.5015,30.672386


In [36]:
#df['drivingStyle'].value_counts()

Drop rows that contain any missing values:

In [37]:
# df = df.dropna()


Fill missing values with the median of each column:

In [38]:
# df = df.fillna(df.median())