<a href="https://colab.research.google.com/github/Ahmed2045/Tardiness-Prediction-Model/blob/main/TARDINESS_N.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Train, evaluate, persist and smoke-test a Support Vector Classifier.
#
# All imports live at the top of the cell. joblib is a hard dependency of
# scikit-learn, so no separate `pip install` step is needed.
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC  # Support Vector Classifier

# Read the dataset from a CSV file into a pandas DataFrame
data = pd.read_csv('dataSET-1.csv')

# Define the features and target variable
features = ['weather', 'transportation', 'start', 'traffic', 'duration', 'previous tardiness']
target = 'tardiness'

# Extract the raw features and the target variable from the dataset
X_raw = data[features]
y = data[target]

# Decide once which columns are categorical / numeric so that the very same
# columns (in the same order) are used for training and for new data.
categorical_cols = X_raw.select_dtypes(include='object').columns
numeric_cols = X_raw.select_dtypes(include=['float64', 'int64']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only on releases that lack the new one.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical features. The source index is reused so the
# column-wise concat below aligns row by row.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_raw.index,
)

# Numeric features first, then the encoded categorical features
X = pd.concat([X_raw[numeric_cols], X_encoded], axis=1)

# Split the data: 70% training, then the remaining 30% halved into
# validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the training, validation, and test sets
print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Test set shape:', X_test.shape, y_test.shape)

# Initialize and train the Support Vector Classifier
model = SVC()
model.fit(X_train, y_train)

# Accuracy on each split
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

# Display the training, validation, and test accuracies (as percentages)
print('Training Accuracy:', train_accuracy*100,"%")
print('Validation Accuracy:', val_accuracy*100,"%")
print('Test Accuracy:', test_accuracy*100,"%")

# Class-frequency-weighted precision, recall and F1 on the test set
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')

print(' Precision is :', precision)
print(' Recall is :', recall)
print(' F1 Score is :', f1)

# Save the trained model to a file.
# NOTE: only the estimator is persisted; the fitted encoder exists only in
# this session. Every model cell in this notebook writes to the same
# 'saved_model.pkl', so the file holds whichever model was trained last.
joblib.dump(model, 'saved_model.pkl')

# Load the saved model back (joblib files are pickles: only load trusted files)
loaded_model = joblib.load('saved_model.pkl')

# Read new data from 'dataTEST.csv'  # for testing
new_data_raw = pd.read_csv('dataTEST.csv')

# Encode the new data with the already-fitted encoder, selecting exactly the
# columns it was fitted on instead of re-detecting them by dtype.
new_data_encoded = pd.DataFrame(
    encoder.transform(new_data_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=new_data_raw.index,
)

# Rebuild the same column layout the model was trained on
new_data = pd.concat([new_data_raw[numeric_cols], new_data_encoded], axis=1)

# Make predictions on the new data using the loaded model
predictions = loaded_model.predict(new_data)

# Display the predictions
print(predictions)




Training set shape: (6999, 11) (6999,)
Validation set shape: (1500, 11) (1500,)
Test set shape: (1500, 11) (1500,)
Training Accuracy: 98.87126732390341 %
Validation Accuracy: 99.2 %
Test Accuracy: 98.73333333333333 %
 Precision is : 0.9875634161886553
 Recall is : 0.9873333333333333
 F1 Score is : 0.9872615718299335
[0 1]


In [None]:
# Train, evaluate, persist and smoke-test a Decision Tree classifier.
#
# All imports live at the top of the cell. joblib is a hard dependency of
# scikit-learn, so no separate `pip install` step is needed.
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Read the dataset from a CSV file into a pandas DataFrame
data = pd.read_csv('dataSET-1.csv')

# Define the features and target variable
features = ['weather', 'transportation', 'start', 'traffic', 'duration', 'previous tardiness']
target = 'tardiness'

# Extract the raw features and the target variable from the dataset
X_raw = data[features]
y = data[target]

# Decide once which columns are categorical / numeric so that the very same
# columns (in the same order) are used for training and for new data.
categorical_cols = X_raw.select_dtypes(include='object').columns
numeric_cols = X_raw.select_dtypes(include=['float64', 'int64']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only on releases that lack the new one.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical features. The source index is reused so the
# column-wise concat below aligns row by row.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_raw.index,
)

# Numeric features first, then the encoded categorical features
X = pd.concat([X_raw[numeric_cols], X_encoded], axis=1)

# Split the data: 70% training, then the remaining 30% halved into
# validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the training, validation, and test sets
print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Test set shape:', X_test.shape, y_test.shape)

# Initialize and train the Decision Tree. The seed matches the one used for
# the splits so that repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Accuracy on each split
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

# Display the training, validation, and test accuracies (as fractions)
print('Training Accuracy:', train_accuracy)
print('Validation Accuracy:', val_accuracy)
print('Test Accuracy:', test_accuracy)

# Class-frequency-weighted precision, recall and F1 on the test set
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')

print(' Precision is :', precision)
print(' Recall is :', recall)
print(' F1 Score is :', f1)

# Save the trained model to a file.
# NOTE: only the estimator is persisted; the fitted encoder exists only in
# this session. Every model cell in this notebook writes to the same
# 'saved_model.pkl', so the file holds whichever model was trained last.
joblib.dump(model, 'saved_model.pkl')

# Load the saved model back (joblib files are pickles: only load trusted files)
loaded_model = joblib.load('saved_model.pkl')

# Read new data from 'dataTEST.csv'  # for testing
new_data_raw = pd.read_csv('dataTEST.csv')

# Encode the new data with the already-fitted encoder, selecting exactly the
# columns it was fitted on instead of re-detecting them by dtype.
new_data_encoded = pd.DataFrame(
    encoder.transform(new_data_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=new_data_raw.index,
)

# Rebuild the same column layout the model was trained on
new_data = pd.concat([new_data_raw[numeric_cols], new_data_encoded], axis=1)

# Make predictions on the new data using the loaded model
predictions = loaded_model.predict(new_data)

# Display the predictions
print(predictions)




Training set shape: (6999, 11) (6999,)
Validation set shape: (1500, 11) (1500,)
Test set shape: (1500, 11) (1500,)
Training Accuracy: 1.0
Validation Accuracy: 1.0
Test Accuracy: 1.0
 Precision is : 1.0
 Recall is : 1.0
 F1 Score is : 1.0
[0 1]


In [None]:
# Train, evaluate, persist and smoke-test a Gaussian Naive Bayes classifier.
#
# All imports live at the top of the cell. joblib is a hard dependency of
# scikit-learn, so no separate `pip install` step is needed.
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder

# Read the dataset from a CSV file into a pandas DataFrame
data = pd.read_csv('dataSET-1.csv')

# Define the features and target variable
features = ['weather', 'transportation', 'start', 'traffic', 'duration', 'previous tardiness']
target = 'tardiness'

# Extract the raw features and the target variable from the dataset
X_raw = data[features]
y = data[target]

# Decide once which columns are categorical / numeric so that the very same
# columns (in the same order) are used for training and for new data.
categorical_cols = X_raw.select_dtypes(include='object').columns
numeric_cols = X_raw.select_dtypes(include=['float64', 'int64']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only on releases that lack the new one.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical features. The source index is reused so the
# column-wise concat below aligns row by row.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_raw.index,
)

# Numeric features first, then the encoded categorical features
X = pd.concat([X_raw[numeric_cols], X_encoded], axis=1)

# Split the data: 70% training, then the remaining 30% halved into
# validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the training, validation, and test sets
print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Test set shape:', X_test.shape, y_test.shape)

# Initialize and train the Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Accuracy on each split
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

# Display the training, validation, and test accuracies (as fractions)
print('Training Accuracy:', train_accuracy)
print('Validation Accuracy:', val_accuracy)
print('Test Accuracy:', test_accuracy)

# Class-frequency-weighted precision, recall and F1 on the test set
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')

print(' Precision is :', precision)
print(' Recall is :', recall)
print(' F1 Score is :', f1)

# Save the trained model to a file.
# NOTE: only the estimator is persisted; the fitted encoder exists only in
# this session. Every model cell in this notebook writes to the same
# 'saved_model.pkl', so the file holds whichever model was trained last.
joblib.dump(model, 'saved_model.pkl')

# Load the saved model back (joblib files are pickles: only load trusted files)
loaded_model = joblib.load('saved_model.pkl')

# Read new data from 'dataTEST.csv'  # for testing
new_data_raw = pd.read_csv('dataTEST.csv')

# Encode the new data with the already-fitted encoder, selecting exactly the
# columns it was fitted on instead of re-detecting them by dtype.
new_data_encoded = pd.DataFrame(
    encoder.transform(new_data_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=new_data_raw.index,
)

# Rebuild the same column layout the model was trained on
new_data = pd.concat([new_data_raw[numeric_cols], new_data_encoded], axis=1)

# Make predictions on the new data using the loaded model
predictions = loaded_model.predict(new_data)

# Display the predictions
print(predictions)


Training set shape: (6999, 11) (6999,)
Validation set shape: (1500, 11) (1500,)
Test set shape: (1500, 11) (1500,)
Training Accuracy: 0.9878554079154165
Validation Accuracy: 0.988
Test Accuracy: 0.9893333333333333
 Precision is : 0.9896823449216087
 Recall is : 0.9893333333333333
 F1 Score is : 0.9893801663065648




[0 1]


In [None]:
# Train, evaluate, persist and smoke-test a K-Nearest Neighbors classifier.
#
# All imports live at the top of the cell. joblib is a hard dependency of
# scikit-learn, so no separate `pip install` step is needed.
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

# Read the dataset from a CSV file into a pandas DataFrame
data = pd.read_csv('dataSET-1.csv')

# Define the features and target variable
features = ['weather', 'transportation', 'start', 'traffic', 'duration', 'previous tardiness']
target = 'tardiness'

# Extract the raw features and the target variable from the dataset
X_raw = data[features]
y = data[target]

# Decide once which columns are categorical / numeric so that the very same
# columns (in the same order) are used for training and for new data.
categorical_cols = X_raw.select_dtypes(include='object').columns
numeric_cols = X_raw.select_dtypes(include=['float64', 'int64']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only on releases that lack the new one.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical features. The source index is reused so the
# column-wise concat below aligns row by row.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_raw.index,
)

# Numeric features first, then the encoded categorical features
X = pd.concat([X_raw[numeric_cols], X_encoded], axis=1)

# Split the data: 70% training, then the remaining 30% halved into
# validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the training, validation, and test sets
print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Test set shape:', X_test.shape, y_test.shape)

# Initialize and train the K-Nearest Neighbors (KNN) model
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# Accuracy on each split
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

# Display the training, validation, and test accuracies (as fractions)
print('Training Accuracy:', train_accuracy)
print('Validation Accuracy:', val_accuracy)
print('Test Accuracy:', test_accuracy)

# Class-frequency-weighted precision, recall and F1 on the test set
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')

print(' Precision is :', precision)
print(' Recall is :', recall)
print(' F1 Score is :', f1)

# Save the trained model to a file.
# NOTE: only the estimator is persisted; the fitted encoder exists only in
# this session. Every model cell in this notebook writes to the same
# 'saved_model.pkl', so the file holds whichever model was trained last.
joblib.dump(model, 'saved_model.pkl')

# Load the saved model back (joblib files are pickles: only load trusted files)
loaded_model = joblib.load('saved_model.pkl')

# Read new data from 'dataTEST.csv'  # for testing
new_data_raw = pd.read_csv('dataTEST.csv')

# Encode the new data with the already-fitted encoder, selecting exactly the
# columns it was fitted on instead of re-detecting them by dtype.
new_data_encoded = pd.DataFrame(
    encoder.transform(new_data_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=new_data_raw.index,
)

# Rebuild the same column layout the model was trained on
new_data = pd.concat([new_data_raw[numeric_cols], new_data_encoded], axis=1)

# Make predictions on the new data using the loaded model
predictions = loaded_model.predict(new_data)

# Display the predictions
print(predictions)


Training set shape: (6999, 11) (6999,)
Validation set shape: (1500, 11) (1500,)
Test set shape: (1500, 11) (1500,)




Training Accuracy: 1.0
Validation Accuracy: 1.0
Test Accuracy: 1.0
 Precision is : 1.0
 Recall is : 1.0
 F1 Score is : 1.0
[0 1]


In [None]:
# Train, evaluate, persist and smoke-test a Gradient Boosting classifier.
#
# All imports live at the top of the cell. joblib is a hard dependency of
# scikit-learn, so no separate `pip install` step is needed.
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Read the dataset from a CSV file into a pandas DataFrame
data = pd.read_csv('dataSET-1.csv')

# Define the features and target variable
features = ['weather', 'transportation', 'start', 'traffic', 'duration', 'previous tardiness']
target = 'tardiness'

# Extract the raw features and the target variable from the dataset
X_raw = data[features]
y = data[target]

# Decide once which columns are categorical / numeric so that the very same
# columns (in the same order) are used for training and for new data.
categorical_cols = X_raw.select_dtypes(include='object').columns
numeric_cols = X_raw.select_dtypes(include=['float64', 'int64']).columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only on releases that lack the new one.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical features. The source index is reused so the
# column-wise concat below aligns row by row.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_raw.index,
)

# Numeric features first, then the encoded categorical features
X = pd.concat([X_raw[numeric_cols], X_encoded], axis=1)

# Split the data: 70% training, then the remaining 30% halved into
# validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the training, validation, and test sets
print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Test set shape:', X_test.shape, y_test.shape)

# Initialize and train the Gradient Boosting model. The seed matches the one
# used for the splits so that repeated runs build the same ensemble.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Accuracy on each split
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

# Display the training, validation, and test accuracies (as fractions)
print('Training Accuracy:', train_accuracy)
print('Validation Accuracy:', val_accuracy)
print('Test Accuracy:', test_accuracy)

# Class-frequency-weighted precision, recall and F1 on the test set
precision = precision_score(y_test, test_predictions, average='weighted')
recall = recall_score(y_test, test_predictions, average='weighted')
f1 = f1_score(y_test, test_predictions, average='weighted')

print(' Precision is :', precision)
print(' Recall is :', recall)
print(' F1 Score is :', f1)

# Save the trained model to a file.
# NOTE: only the estimator is persisted; the fitted encoder exists only in
# this session. Every model cell in this notebook writes to the same
# 'saved_model.pkl', so the file holds whichever model was trained last.
joblib.dump(model, 'saved_model.pkl')

# Load the saved model back (joblib files are pickles: only load trusted files)
loaded_model = joblib.load('saved_model.pkl')

# Read new data from 'dataTEST.csv'  # for testing
new_data_raw = pd.read_csv('dataTEST.csv')

# Encode the new data with the already-fitted encoder, selecting exactly the
# columns it was fitted on instead of re-detecting them by dtype.
new_data_encoded = pd.DataFrame(
    encoder.transform(new_data_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=new_data_raw.index,
)

# Rebuild the same column layout the model was trained on
new_data = pd.concat([new_data_raw[numeric_cols], new_data_encoded], axis=1)

# Make predictions on the new data using the loaded model
predictions = loaded_model.predict(new_data)

# Display the predictions
print(predictions)




Training set shape: (6999, 11) (6999,)
Validation set shape: (1500, 11) (1500,)
Test set shape: (1500, 11) (1500,)
Training Accuracy: 1.0
Validation Accuracy: 1.0
Test Accuracy: 1.0
 Precision is : 1.0
 Recall is : 1.0
 F1 Score is : 1.0
[0 1]
