In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder # Import OneHotEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset
DATA_URL = 'https://raw.githubusercontent.com/Asad-Shaikh786/Data-Science/refs/heads/main/Alphabets_data.csv'
df = pd.read_csv(DATA_URL)

# Data Exploration
print("Dataset Overview:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nSummary Statistics:\n", df.describe())
print("\nChecking for Missing Values:\n", df.isnull().sum())

# Data Preprocessing
# 'letter' is the class label to predict; every other column is a numeric
# feature. Selecting the target by name avoids relying on column position
# (in this CSV the label is the first column, not the last).
TARGET_COLUMN = 'letter'
y = df[TARGET_COLUMN]
X = df.drop(columns=TARGET_COLUMN)

# One-hot encode the target labels for categorical cross-entropy.
# An explicit integer dtype is requested because newer pandas releases
# default get_dummies to boolean columns.
y = pd.get_dummies(y, dtype=int)

# Split the data into training and testing sets BEFORE scaling, so that the
# scaler statistics are learned from training rows only (no test-set leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Normalize feature data: fit on the training split, reuse on the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-fitted scaler; kept so that the
# name X_scaled remains defined as in the earlier version of this script.
X_scaled = scaler.transform(X)

# Basic ANN Model
def create_model(input_dim=None, num_classes=None, hidden_units=(64, 32)):
    """Build and compile a fully connected softmax classifier.

    Args:
        input_dim: Number of input features. When omitted, the column count
            of the module-level ``X_train`` is used.
        num_classes: Width of the softmax output layer. When omitted, the
            column count of the module-level one-hot ``y_train`` is used.
        hidden_units: Sizes of the ReLU hidden layers, in order.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    # Fall back to the script's globals so that create_model() with no
    # arguments behaves exactly as before.
    if input_dim is None:
        input_dim = X_train.shape[1]
    if num_classes is None:
        num_classes = y_train.shape[1]

    stack = [layers.Input(shape=(input_dim,))]
    stack.extend(layers.Dense(units, activation='relu') for units in hidden_units)
    stack.append(layers.Dense(num_classes, activation='softmax'))

    model = keras.models.Sequential(stack)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Fit a fresh network, holding out 20% of the training rows for validation
model = create_model()
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=20,
)

# Collapse softmax outputs and one-hot targets back into class indices
y_pred = model.predict(X_test).argmax(axis=1)
y_test_labels = y_test.to_numpy().argmax(axis=1)

# Per-class sample counts over the full one-hot target frame
print("\nClass Distribution:\n", y.sum())

# Per-class metrics; zero_division=0 reports undefined ratios as 0
report = classification_report(y_test_labels, y_pred, zero_division=0)
print("\nClassification Report:\n", report)

Dataset Overview:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  20000 non-null  object
 1   xbox    20000 non-null  int64 
 2   ybox    20000 non-null  int64 
 3   width   20000 non-null  int64 
 4   height  20000 non-null  int64 
 5   onpix   20000 non-null  int64 
 6   xbar    20000 non-null  int64 
 7   ybar    20000 non-null  int64 
 8   x2bar   20000 non-null  int64 
 9   y2bar   20000 non-null  int64 
 10  xybar   20000 non-null  int64 
 11  x2ybar  20000 non-null  int64 
 12  xy2bar  20000 non-null  int64 
 13  xedge   20000 non-null  int64 
 14  xedgey  20000 non-null  int64 
 15  yedge   20000 non-null  int64 
 16  yedgex  20000 non-null  int64 
dtypes: int64(16), object(1)
memory usage: 2.6+ MB
None

Summary Statistics:
                xbox          ybox         width       height         onpix  \
count  20000.000000  20000.00000