In [4]:
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Initialize an empty list to hold the rows
all_arrays = []
n = 25000

# Loop through numbers 0 to 4999
for a in range(0, n):
    file_path = f'output/image_{a}.npy'

    # Check if the file exists
    if os.path.exists(file_path):
        try:
            # Load the .npy file
            data = np.load(file_path)
            # Append the 1D array to the list
            all_arrays.append(data)
            print(f"File found: {file_path}")
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    else:
        print(f"File not found: {file_path}")

# Convert the list of arrays into a 2D NumPy array
if all_arrays:
    x = np.vstack(all_arrays)
    # Print the resulting 2D array shape
    print(x.shape)
else:
    print("No arrays loaded.")
n=len(all_arrays)

# Example data to save
df=pd.read_csv('dataset/train.csv')  # A 5x3 array of random numbers

# Convert the NumPy array to a DataFrame
df = df.iloc[:n, :]
print(df)
# Save the DataFrame to a CSV file
# df.to_csv('output/data.csv', index=False)

y=df[['group_id','entity_name','entity_value']]
print(y)

le = LabelEncoder()

y = le.fit_transform(df['entity_name'])

print("Encoded labels:", y)
print(len(y))

xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.1,random_state=0)

# Assuming xtrain, xtest, ytrain, and ytest are already defined and loaded correctly



File found: output/image_0.npy
File found: output/image_1.npy
File found: output/image_2.npy
File found: output/image_3.npy
File found: output/image_4.npy
File found: output/image_5.npy
File found: output/image_6.npy
File found: output/image_7.npy
File found: output/image_8.npy
File found: output/image_9.npy
File found: output/image_10.npy
File found: output/image_11.npy
File found: output/image_12.npy
File found: output/image_13.npy
File found: output/image_14.npy
File found: output/image_15.npy
File found: output/image_16.npy
File found: output/image_17.npy
File found: output/image_18.npy
File found: output/image_19.npy
File found: output/image_20.npy
File found: output/image_21.npy
File found: output/image_22.npy
File found: output/image_23.npy
File found: output/image_24.npy
File found: output/image_25.npy
File found: output/image_26.npy
File found: output/image_27.npy
File found: output/image_28.npy
File found: output/image_29.npy
File found: output/image_30.npy
File found: output

In [2]:
# Check original shapes
print("Original shapes:")
print(f"xtrain shape: {xtrain.shape}")
print(f"xtest shape: {xtest.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"ytest shape: {ytest.shape}")

# Flatten ytrain and ytest to get class indices if they have extra dimensions
ytrain_flat = np.argmax(ytrain, axis=-1) if len(ytrain.shape) > 1 else ytrain  # Flatten to 1D array along the class axis
ytest_flat = np.argmax(ytest, axis=-1) if len(ytest.shape) > 1 else ytest

# Check shapes after flattening
print("After flattening:")
print(f"ytrain_flat shape: {ytrain_flat.shape}")
print(f"ytest_flat shape: {ytest_flat.shape}")

# One-hot encode the target labels
num_classes = 10  # Assuming 10 classes
ytrain = to_categorical(ytrain_flat, num_classes=num_classes)
ytest = to_categorical(ytest_flat, num_classes=num_classes)

# Check shapes after one-hot encoding
print("After one-hot encoding:")
print(f"ytrain shape: {ytrain.shape}")
print(f"ytest shape: {ytest.shape}")

# Ensure xtest and ytest have the same number of samples
print(f"xtest shape: {xtest.shape}")
print(f"ytest shape: {ytest.shape}")

# Ensure xtrain and ytrain have the same number of samples
if len(xtrain) != len(ytrain):
    print("Mismatch in number of samples between xtrain and ytrain")
    min_samples = min(len(xtrain), len(ytrain))
    xtrain = xtrain[:min_samples]
    ytrain = ytrain[:min_samples]

# Ensure xtest and ytest have the same number of samples
if len(xtest) != len(ytest):
    print("Mismatch in number of samples between xtest and ytest")
    min_samples = min(len(xtest), len(ytest))
    xtest = xtest[:min_samples]
    ytest = ytest[:min_samples]

# Build the fully connected model
model = Sequential([
    Input(shape=(xtrain.shape[1],)),  # Use Input layer instead of input_shape in Dense
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')  # Number of classes in the output layer
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(xtrain, ytrain, epochs=100, batch_size=64, validation_split=0.1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(xtest, ytest)
print(f'Test loss: {test_loss:.4f}')
print(f'Test accuracy: {test_accuracy:.4f}')

# Make predictions
predictions = model.predict(xtest)

# Convert predictions to class labels
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(ytest, axis=1)

# Calculate the F1 score
f1 = f1_score(true_classes, predicted_classes, average='weighted')
print(f'F1 Score: {f1:.4f}')


Original shapes:
xtrain shape: (22, 100352)
xtest shape: (3, 100352)
ytrain shape: (22,)
ytest shape: (3,)
After flattening:
ytrain_flat shape: (22,)
ytest_flat shape: (3,)
After one-hot encoding:
ytrain shape: (22, 10)
ytest shape: (3, 10)
xtest shape: (3, 100352)
ytest shape: (3, 10)
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.2884 - val_accuracy: 0.6667 - val_loss: 7.7571
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 540ms/step - accuracy: 0.9474 - loss: 1.3098 - val_accuracy: 0.6667 - val_loss: 2.2396
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 523ms/step - accuracy: 0.9474 - loss: 0.3323 - val_accuracy: 0.3333 - val_loss: 16.2952
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 535ms/step - accuracy: 0.0526 - loss: 24.1320 - val_accuracy: 0.6667 - val_loss: 4.2689
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[

In [3]:
# Check original shapes
print("Original shapes:")
print(f"xtrain shape: {xtrain.shape}")
print(f"xtest shape: {xtest.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"ytest shape: {ytest.shape}")

# Flatten ytrain and ytest to get class indices if they have extra dimensions
ytrain_flat = np.argmax(ytrain, axis=-1) if len(ytrain.shape) > 1 else ytrain  # Flatten to 1D array along the class axis
ytest_flat = np.argmax(ytest, axis=-1) if len(ytest.shape) > 1 else ytest

# Check shapes after flattening
print("After flattening:")
print(f"ytrain_flat shape: {ytrain_flat.shape}")
print(f"ytest_flat shape: {ytest_flat.shape}")

# One-hot encode the target labels
num_classes = 10  # Assuming 10 classes
ytrain = to_categorical(ytrain_flat, num_classes=num_classes)
ytest = to_categorical(ytest_flat, num_classes=num_classes)

# Check shapes after one-hot encoding
print("After one-hot encoding:")
print(f"ytrain shape: {ytrain.shape}")
print(f"ytest shape: {ytest.shape}")

# Ensure xtest and ytest have the same number of samples
print(f"xtest shape: {xtest.shape}")
print(f"ytest shape: {ytest.shape}")

# Ensure xtrain and ytrain have the same number of samples
if len(xtrain) != len(ytrain):
    print("Mismatch in number of samples between xtrain and ytrain")
    min_samples = min(len(xtrain), len(ytrain))
    xtrain = xtrain[:min_samples]
    ytrain = ytrain[:min_samples]

# Ensure xtest and ytest have the same number of samples
if len(xtest) != len(ytest):
    print("Mismatch in number of samples between xtest and ytest")
    min_samples = min(len(xtest), len(ytest))
    xtest = xtest[:min_samples]
    ytest = ytest[:min_samples]

# Build the fully connected model
model = Sequential([
    Input(shape=(xtrain.shape[1],)),  # Use Input layer instead of input_shape in Dense
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')  # Number of classes in the output layer
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(xtrain, ytrain, epochs=250, batch_size=256, validation_split=0.1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(xtest, ytest)
print(f'Test loss: {test_loss:.4f}')
print(f'Test accuracy: {test_accuracy:.4f}')

# Make predictions
predictions = model.predict(xtest)

# Convert predictions to class labels
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(ytest, axis=1)

# Calculate the F1 score
f1 = f1_score(true_classes, predicted_classes, average='weighted')
print(f'F1 Score: {f1:.4f}')


Original shapes:
xtrain shape: (22, 100352)
xtest shape: (3, 100352)
ytrain shape: (22, 10)
ytest shape: (3, 10)
After flattening:
ytrain_flat shape: (22,)
ytest_flat shape: (3,)
After one-hot encoding:
ytrain shape: (22, 10)
ytest shape: (3, 10)
xtest shape: (3, 100352)
ytest shape: (3, 10)
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.8633 - val_accuracy: 0.6667 - val_loss: 19.7147
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 505ms/step - accuracy: 0.9474 - loss: 3.3144 - val_accuracy: 0.6667 - val_loss: 18.1678
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 509ms/step - accuracy: 0.9474 - loss: 3.0111 - val_accuracy: 0.6667 - val_loss: 10.0868
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 527ms/step - accuracy: 0.9474 - loss: 1.6175 - val_accuracy: 0.3333 - val_loss: 2.3341
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 