### Naive Bayes

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Naive Bayes model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Predict on the test data
y_pred = nb_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Naive Bayes Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))


Naive Bayes Accuracy: 0.8406
              precision    recall  f1-score   support

           0       1.00      0.03      0.07        89
           1       1.00      1.00      1.00       130
          10       1.00      1.00      1.00       157
          11       0.75      0.95      0.84       123
          12       0.34      0.38      0.36       124
          13       0.94      1.00      0.97       167
          14       0.58      0.99      0.73       131
          15       0.99      0.97      0.98       130
          16       0.54      0.44      0.49       111
          17       0.51      1.00      0.68       137
          18       0.79      1.00      0.88       143
          19       0.94      0.97      0.96       187
           2       1.00      0.91      0.95       125
          20       0.64      0.92      0.76       134
          21       1.00      1.00      1.00       174
          22       0.99      1.00      1.00       133
          23       1.00      1.00      1.00       12

### Random Forrest Classifier

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))


Random Forest Accuracy: 0.9545
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       1.00      1.00      1.00       130
          10       1.00      1.00      1.00       157
          11       0.99      0.94      0.97       123
          12       0.94      0.99      0.96       124
          13       0.94      1.00      0.97       167
          14       1.00      0.98      0.99       131
          15       0.99      0.97      0.98       130
          16       1.00      0.32      0.48       111
          17       1.00      0.77      0.87       137
          18       1.00      0.99      1.00       143
          19       0.94      0.97      0.96       187
           2       1.00      0.91      0.95       125
          20       0.64      0.92      0.76       134
          21       1.00      1.00      1.00       174
          22       1.00      1.00      1.00       133
          23       1.00      1.00      1.00       

### Support Vector Machine (SVM)

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the SVM model
svm_model = SVC(kernel='linear', probability=True)

# Train the model
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred = svm_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))


SVM Accuracy: 0.9515
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       1.00      1.00      1.00       130
          10       1.00      1.00      1.00       157
          11       0.94      0.95      0.94       123
          12       0.94      0.94      0.94       124
          13       0.94      1.00      0.97       167
          14       1.00      0.97      0.98       131
          15       0.99      0.97      0.98       130
          16       1.00      0.29      0.45       111
          17       1.00      0.77      0.87       137
          18       1.00      0.99      1.00       143
          19       0.94      0.97      0.96       187
           2       1.00      0.91      0.95       125
          20       0.63      0.92      0.75       134
          21       1.00      1.00      1.00       174
          22       0.99      1.00      1.00       133
          23       1.00      1.00      1.00       128
      

### K-Nearest Neighbors (KNN)

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model (for KNN, it's more about storing the training data)
knn_model.fit(X_train, y_train)

# Predict on the test data
y_pred = knn_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'KNN Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))


KNN Accuracy: 0.9536
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        89
           1       1.00      1.00      1.00       130
          10       1.00      0.99      1.00       157
          11       0.99      0.94      0.97       123
          12       0.94      0.99      0.96       124
          13       0.94      1.00      0.97       167
          14       1.00      0.97      0.98       131
          15       0.99      0.97      0.98       130
          16       1.00      0.32      0.48       111
          17       1.00      0.77      0.87       137
          18       1.00      0.99      1.00       143
          19       0.94      0.97      0.95       187
           2       1.00      0.91      0.95       125
          20       0.64      0.91      0.75       134
          21       1.00      1.00      1.00       174
          22       0.99      1.00      1.00       133
          23       1.00      1.00      1.00       128
      

### Neural Network (MLP Classifier) with Scikit-learn

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the MLP (Neural Network) model
mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)

# Train the model
mlp_model.fit(X_train, y_train)

# Predict on the test data
y_pred = mlp_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Neural Network (MLP) Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))


Neural Network (MLP) Accuracy: 0.9515
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       1.00      1.00      1.00       130
          10       1.00      1.00      1.00       157
          11       0.99      0.94      0.97       123
          12       0.94      0.99      0.96       124
          13       0.94      1.00      0.97       167
          14       1.00      0.98      0.99       131
          15       0.99      0.97      0.98       130
          16       0.67      0.44      0.53       111
          17       1.00      0.77      0.87       137
          18       1.00      0.99      1.00       143
          19       0.94      0.97      0.96       187
           2       1.00      0.91      0.95       125
          20       0.64      0.92      0.76       134
          21       1.00      1.00      1.00       174
          22       1.00      1.00      1.00       133
          23       1.00      1.00      1.00

### Logistic Regression

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Logistic Regression model
lr_model = LogisticRegression(multi_class='ovr', solver='liblinear')

# Train the model
lr_model.fit(X_train, y_train)

# Predict on the test data
y_pred = lr_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))




Logistic Regression Accuracy: 0.9437
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        89
           1       1.00      1.00      1.00       130
          10       0.99      1.00      1.00       157
          11       0.93      0.95      0.94       123
          12       0.94      0.92      0.93       124
          13       0.94      1.00      0.97       167
          14       1.00      0.97      0.98       131
          15       0.93      0.87      0.90       130
          16       1.00      0.29      0.45       111
          17       1.00      0.77      0.87       137
          18       1.00      0.99      1.00       143
          19       0.94      0.97      0.96       187
           2       0.99      0.91      0.95       125
          20       0.63      0.92      0.75       134
          21       1.00      1.00      1.00       174
          22       0.99      1.00      1.00       133
          23       1.00      1.00      1.00 

### Gradient Boosting Machines (XGBoost):

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the formatted data
data = pd.read_csv('all_letters_with_label.csv')

# Initialize LabelEncoder for the label column
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'].astype(str))

# Split the data into features (X) and target (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data['Label'].values      # The last column is the target

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb_model.predict(X_test)

# Display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost Accuracy: {accuracy:.4f}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(range(len(label_encoder.classes_)))))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9545
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       1.00      1.00      1.00       130
          10       1.00      1.00      1.00       157
          11       0.99      0.94      0.97       123
          12       0.94      0.99      0.96       124
          13       0.94      1.00      0.97       167
          14       1.00      0.98      0.99       131
          15       0.99      0.97      0.98       130
          16       1.00      0.32      0.48       111
          17       1.00      0.77      0.87       137
          18       1.00      0.99      1.00       143
          19       0.94      0.97      0.96       187
           2       1.00      0.91      0.95       125
          20       0.64      0.92      0.76       134
          21       1.00      1.00      1.00       174
          22       1.00      1.00      1.00       133
          23       1.00      1.00      1.00       128
  

### Previous Model Accuracies
- > Logistic Regression Accuracy: 0.8830
- > Naive Bayes Accuracy: 0.7810
- > XGBoost Accuracy: 0.9370
- > SVM Accuracy: 0.9240
- > KNN Accuracy: 0.9070
- > Neural Network (MLP) Accuracy: 0.9360
- > Random Forest Accuracy: 0.9390

### Current Model Accuracies (with new and larger data)
- > Logistic Regression Accuracy: 0.9437
- > Naive Bayes Accuracy: 0.8406
- > XGBoost Accuracy: 0.9545
- > SVM Accuracy: 0.9515
- > KNN Accuracy: 0.9536
- > Neural Network (MLP) Accuracy: 0.9515
- > Random Forest Accuracy: 0.9545
