<a href="https://colab.research.google.com/github/AndresS0103/neural-network-data-warehouse-project/blob/main/ML-Warehouse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# Read the laptop table as CSV from the Google Sheets export URL.
file_path = 'https://docs.google.com/spreadsheets/d/1yhA6DEStLdrOy4ZWppEMynCUgy6XfPBXkY1cG_NQQ4I/export?format=csv'
laptop_data = pd.read_csv(file_path)

# Fail fast when a column used later in this file is absent from the sheet.
expected_columns = [
    'Company',
    'Product',
    'TypeName',
    'Ram',
    'PrimaryStorageType',
    'CPU_model',
    'GPU_model',
    'Price_euros',
]
missing_columns = [
    name for name in expected_columns
    if name not in laptop_data.columns
]
if missing_columns:
    raise ValueError(f"Missing expected columns: {missing_columns}")

# Define price categories
def categorize_price(price):
    """Map a numeric price to one of three price-tier labels.

    Returns 'Barata' for prices below 700, 'Moderada' for prices from 700 up
    to (but not including) 1500, and 'Cara' for everything else. A value for
    which both comparisons are false, such as NaN, therefore yields 'Cara'.
    """
    # Guard clauses: each tier is only checked once the cheaper ones are ruled out.
    if price < 700:
        return 'Barata'
    if price < 1500:
        return 'Moderada'
    return 'Cara'

# Label every row with its price tier.
laptop_data['Price_Category'] = laptop_data['Price_euros'].apply(categorize_price)

# Generate textual representation for each laptop
laptop_data['Description'] = (
    laptop_data['Company'] + " " + laptop_data['Product'] + " " +
    laptop_data['TypeName'] + ", " + laptop_data['Ram'].astype(str) + "GB RAM, " +
    laptop_data['PrimaryStorageType'] + ", " + laptop_data['CPU_model'] + ", " +
    laptop_data['GPU_model']
)

# Normalize text: lowercase, then remove every character other than a-z, 0-9
# and space (punctuation is deleted, not replaced by a space).
laptop_data['Description'] = laptop_data['Description'].str.lower().str.replace('[^a-z0-9 ]', '', regex=True)

# Features are the normalized descriptions; targets are the price tiers.
X = laptop_data['Description']
y = laptop_data['Price_Category']

# Map categories to integer ids, then one-hot encode them.
category_to_id = {'Barata': 0, 'Moderada': 1, 'Cara': 2}
y = y.map(category_to_id)
y = to_categorical(y, num_classes=3)

# Split the raw text BEFORE fitting the tokenizer, so the vocabulary is learned
# from the training rows only. Words that occur only in test rows then map to
# the OOV token, exactly as unseen words will at prediction time.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tokenize text (vocabulary from the training split only)
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_text)

# Every sequence is padded/truncated to this many tokens.
maxlen = 50

X_train = pad_sequences(tokenizer.texts_to_sequences(X_train_text), padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test_text), padding='post', maxlen=maxlen)

# Whole-dataset encodings, kept under their original names for any other code
# that reads them; they are not used for training or evaluation.
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, padding='post', maxlen=maxlen)

# Define model: token embedding -> average over the sequence -> dense hidden
# layer -> 3-way softmax (one output per price tier).
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument and emits a warning when it is given.
model = tf.keras.Sequential([
    # +1 because the fitted word_index does not use id 0, which pad_sequences
    # uses as its default padding value.
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model; stop once validation loss has not improved for 3 epochs and
# restore the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate model on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Classification report: collapse probability rows / one-hot rows to class ids.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)
print(classification_report(
    y_test_classes,
    y_pred_classes,
    # Pin the label set so the three target_names always line up, even when a
    # class is absent from both the true and the predicted labels.
    labels=[0, 1, 2],
    target_names=['Barata', 'Moderada', 'Cara'],
    # Report 0 (without a warning) for a class that is never predicted.
    zero_division=0,
))

# Prediction function
def predict_laptop(description):
    """Predict the price tier for a free-text laptop description.

    The text is first normalized the same way the training descriptions are:
    lowercased, then every character other than a-z, 0-9 and space is removed.
    Without this step the tokenizer's default filters would replace
    punctuation with spaces, so a token such as "x5-Z8350" would be split in
    two here while the training data contains it as "x5z8350".

    Args:
        description: Laptop description string, e.g.
            "Asus Vivobook E200HA Netbook, 2GB RAM, Flash Storage, ...".

    Returns:
        A tuple ``(category, probabilities)``: ``category`` is 'Barata',
        'Moderada' or 'Cara', and ``probabilities`` is the model's output row
        for this description, indexed 0 = Barata, 1 = Moderada, 2 = Cara.
    """
    # Mirror the training-time regex '[^a-z0-9 ]' applied after lowercasing.
    allowed_chars = set('abcdefghijklmnopqrstuvwxyz0123456789 ')
    normalized = ''.join(ch for ch in description.lower() if ch in allowed_chars)

    seq = tokenizer.texts_to_sequences([normalized])
    padded = pad_sequences(seq, padding='post', maxlen=maxlen)
    pred_prob = model.predict(padded)
    # A single description was passed, so take the first (only) row.
    category_id = pred_prob.argmax(axis=-1)[0]
    id_to_category = {0: 'Barata', 1: 'Moderada', 2: 'Cara'}
    return id_to_category[category_id], pred_prob[0]





Epoch 1/15




20/20 ━━━━━━━━━━━━━━━━━━━━ 2s 28ms/step - accuracy: 0.4260 - loss: 1.0856 - val_accuracy: 0.4313 - val_loss: 1.0552
Epoch 2/15
20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.4520 - loss: 1.0582 - val_accuracy: 0.4313 - val_loss: 1.0499
Epoch 3/15
20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.4242 - loss: 1.0623 - val_accuracy: 0.4313 - val_loss: 1.0366
Epoch 4/15
20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.4600 - loss: 1.0320 - val_accuracy: 0.4313 - val_loss: 1.0156
Epoch 5/15
20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.4477 - loss: 1.0116 - val_accuracy: 0.4313 - val_loss: 0.9900
Epoch 6/15
20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.4665 - loss: 0.9778 - val_accuracy: 0.5000 - val_loss: 0.9505
Epoch 7/15
20/20 ━━━━━━━━━━━━━━━━━━━━

In [14]:
# Example prediction

# Alternative input to try instead of the one below:
#example_description = "Razer Blade Pro Gaming, 32GB RAM, SSD, Core i7 7820HK, GeForce GTX 1080"

example_description = (
    "Asus Vivobook E200HA Netbook, 2GB RAM, Flash Storage, "
    "Atom x5-Z8350, HD Graphics 400"
)

# predict_laptop returns the predicted label and the per-class probabilities.
category, probabilities = predict_laptop(example_description)

print(f"Predicted Category: {category}")
print(f"Probabilities: {probabilities}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Predicted Category: Barata
Probabilities: [0.7886081  0.18725556 0.02413639]
