In [6]:
import pandas as pd

# Load one of the datasets to get an overview
# We'll start with the Peugeot dataset as an example
df_peugeot_1 = pd.read_csv('peugeot_207_01.csv', delimiter=';')

# Display the first few rows of the dataset to understand its structure
df_peugeot_1.head()

Unnamed: 0.1,Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage,roadSurface,traffic,drivingStyle
0,59,-13.399994,20.699999,6.06,115.840061,-4.5,1.0548,38.039215,31.0,101.0,798.0,6.22,15.0,-0.5982,27.919697,SmoothCondition,LowCongestionCondition,EvenPaceStyle
1,60,-13.200005,17.1,6.345,117.200816,-3.599998,0.9012,38.039215,31.0,101.0,797.5,6.27,15.0,-0.5244,26.458355,SmoothCondition,LowCongestionCondition,EvenPaceStyle
2,61,-13.200005,14.4,6.585,117.559595,-2.700001,0.9331,90.588234,32.0,101.0,797.5,6.27,15.0,-0.5068,26.458355,SmoothCondition,LowCongestionCondition,EvenPaceStyle
3,62,-11.800003,14.4,6.825,117.801222,0.0,0.9878,90.588234,32.0,101.0,726.0,8.63,15.0,-0.4882,26.004126,SmoothCondition,LowCongestionCondition,EvenPaceStyle
4,63,-19.800003,14.4,7.065,117.925697,0.0,1.1316,92.941177,32.0,101.0,784.0,8.88,15.0,-0.6712,25.295498,SmoothCondition,LowCongestionCondition,EvenPaceStyle


In [7]:
from sklearn.preprocessing import LabelEncoder

# Remove rows with missing values
df_clean = df_peugeot_1.dropna()

# Encode the 'traffic' column
le = LabelEncoder()
df_clean['traffic_encoded'] = le.fit_transform(df_clean['traffic'])

# Display the first few rows of the cleaned dataset and the encoding
df_clean[['traffic', 'traffic_encoded']].head(), le.classes_


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['traffic_encoded'] = le.fit_transform(df_clean['traffic'])


(                  traffic  traffic_encoded
 0  LowCongestionCondition                1
 1  LowCongestionCondition                1
 2  LowCongestionCondition                1
 3  LowCongestionCondition                1
 4  LowCongestionCondition                1,
 array(['HighCongestionCondition', 'LowCongestionCondition',
        'NormalCongestionCondition'], dtype=object))

In [8]:
from sklearn.model_selection import train_test_split

# Selecting features for the model
# Excluding 'traffic' and 'drivingStyle' as they are target variables and 'roadSurface' which is not relevant for this task
features = df_clean.columns.drop(['Unnamed: 0', 'traffic', 'traffic_encoded', 'drivingStyle', 'roadSurface'])
X = df_clean[features]
y = df_clean['traffic_encoded']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets to confirm the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((6553, 14), (1639, 14), (6553,), (1639,))

In [9]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the models with default parameters
svm_model = SVC()
logistic_model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
knn_model = KNeighborsClassifier()

# Train the SVM model
svm_model.fit(X_train, y_train)
# Predictions on the test set
svm_predictions = svm_model.predict(X_test)
# SVM Evaluation
svm_accuracy = accuracy_score(y_test, svm_predictions)

# Train the Logistic Regression model
logistic_model.fit(X_train, y_train)
# Predictions on the test set
logistic_predictions = logistic_model.predict(X_test)
# Logistic Regression Evaluation
logistic_accuracy = accuracy_score(y_test, logistic_predictions)

# Train the kNN model
knn_model.fit(X_train, y_train)
# Predictions on the test set
knn_predictions = knn_model.predict(X_test)
# kNN Evaluation
knn_accuracy = accuracy_score(y_test, knn_predictions)

# Display the accuracy of each model
svm_accuracy, logistic_accuracy, knn_accuracy


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.825503355704698, 0.8913971934106162, 0.87614399023795)

In [10]:
# Detailed evaluation of the best performing model (Logistic Regression)
logistic_report = classification_report(y_test, logistic_predictions, target_names=le.classes_)

# Display the classification report
print(logistic_report)


                           precision    recall  f1-score   support

  HighCongestionCondition       0.76      0.77      0.77       138
   LowCongestionCondition       0.91      0.98      0.95      1353
NormalCongestionCondition       0.63      0.18      0.28       148

                 accuracy                           0.89      1639
                macro avg       0.77      0.64      0.66      1639
             weighted avg       0.87      0.89      0.87      1639



In [11]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Balancing the dataset using SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_scaled, y)

# Splitting the balanced and scaled dataset into training and testing sets
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Train the Logistic Regression model on the balanced and scaled data for comparison
logistic_model_balanced = LogisticRegression(max_iter=1000)
logistic_model_balanced.fit(X_train_balanced, y_train_balanced)

# Predictions and evaluation
logistic_predictions_balanced = logistic_model_balanced.predict(X_test_balanced)
logistic_accuracy_balanced = accuracy_score(y_test_balanced, logistic_predictions_balanced)

# Display the new accuracy of the Logistic Regression model after balancing and scaling
logistic_accuracy_balanced




0.7986838898367048

In [12]:
# Generating the classification report for the Logistic Regression model trained on the balanced and scaled dataset
classification_report_balanced = classification_report(y_test_balanced, logistic_predictions_balanced, target_names=le.classes_)

# Display the classification report
print(classification_report_balanced)


                           precision    recall  f1-score   support

  HighCongestionCondition       0.86      0.95      0.90      1359
   LowCongestionCondition       0.78      0.71      0.74      1353
NormalCongestionCondition       0.75      0.74      0.74      1391

                 accuracy                           0.80      4103
                macro avg       0.80      0.80      0.80      4103
             weighted avg       0.80      0.80      0.80      4103



: 