# Project Part 3

[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/ciscoyslas/deeplearningmodelyslas/edit)


_This notebook builds a deep learning model that predicts a craft beer's ABV (alcohol by volume) from its style, name, IBU, and container size in ounces._

## 1. Introduction/Background



## 2. Exploratory Data Analysis



In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [8]:
# Read the craft-beer table once; both the training frame and the held-out
# test frame are derived from this single read.
DATA_PATH = '/kaggle/input/craft-cans/beers.csv'
MODEL_COLUMNS = ['abv', 'ibu', 'ounces']  # regression target + numeric features

beers_raw = pd.read_csv(DATA_PATH)

# Keep only rows where the target and both numeric features are present.
# MinMaxScaler passes NaN through unchanged, and a single NaN target or
# feature turns the mean-squared-error loss into NaN during training.
# (Imputing 'ibu' instead of dropping rows is an alternative if too many
# rows are lost here.)
beers_complete = beers_raw.dropna(subset=MODEL_COLUMNS)

# Hold out 20% of the rows for the final evaluation. test_df must not share
# rows with beers_df, otherwise the test score only measures how well the
# model memorised its own training data.
beers_df, test_df = train_test_split(beers_complete, test_size=0.2, random_state=42)

In [9]:
# Pre-trained Word2Vec vectors, stored in the binary word2vec format
word2vec_path = '/kaggle/input/word-embeddings/GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


def _vocab_ids(text):
    """Map each whitespace-separated token of `text` to its Word2Vec index.

    The value is passed through str() first, so non-string cells are
    tokenised by their string form. Tokens that are not in the Word2Vec
    vocabulary are silently dropped.
    """
    vocab = word2vec_model.key_to_index
    return [vocab[token] for token in str(text).split() if token in vocab]


# 'style' column -> fixed-width matrix of vocabulary indices
# NOTE(review): pad_sequences pads with 0 by default, and 0 is presumably also
# a real vocabulary index — confirm whether that collision matters here.
max_sequence_length_style = 50
style_token_ids = beers_df['style'].apply(_vocab_ids).tolist()
text_data_style = pad_sequences(style_token_ids, maxlen=max_sequence_length_style)

# 'name' column -> fixed-width matrix of vocabulary indices
max_sequence_length_name = 20
name_token_ids = beers_df['name'].apply(_vocab_ids).tolist()
text_data_name = pad_sequences(name_token_ids, maxlen=max_sequence_length_name)

# Rescale the numeric features to the [0, 1] range
numeric_columns = beers_df[['ibu', 'ounces']].values
numerical_data = MinMaxScaler().fit_transform(numeric_columns)

In [10]:
# Assemble the model inputs: style and name index sequences side by side,
# the scaled numeric features, and the ABV regression target.
X_text = np.concatenate([text_data_style, text_data_name], axis=1)
X_num = numerical_data
y = beers_df['abv'].values

# Define input layers
text_length = max_sequence_length_style + max_sequence_length_name
input_text = Input(shape=(text_length,))
input_num = Input(shape=(X_num.shape[1],))

# Word embedding layer for text data.
# The layer is sized from the pre-trained matrix and frozen: without loading
# the Word2Vec vectors it would start from random values and the whole
# vocabulary-by-dimension matrix would have to be learned from this dataset.
pretrained_vectors = word2vec_model.vectors
word_embedding = Embedding(
    input_dim=pretrained_vectors.shape[0],
    output_dim=pretrained_vectors.shape[1],
    trainable=False,
)
embedding_layer = word_embedding(input_text)  # calling the layer builds its weight matrix
word_embedding.set_weights([pretrained_vectors])  # overwrite the random init with Word2Vec
flatten_layer = Flatten()(embedding_layer)

# Concatenate the flattened text data and numerical data
concatenated_layer = Concatenate()([flatten_layer, input_num])

# Dense layers for the combined data; one linear output unit for the ABV value
dense1 = Dense(128, activation='relu')(concatenated_layer)
output_layer = Dense(1)(dense1)

In [1]:
# Two-input regression model: (token indices, scaled numeric features) -> ABV
model = Model(inputs=[input_text, input_num], outputs=output_layer)
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))

# 80/20 train/validation split, applied to all three arrays in lock-step
(
    X_train_text, X_val_text,
    X_train_num, X_val_num,
    y_train, y_val,
) = train_test_split(X_text, X_num, y, test_size=0.2, random_state=42)

# Fit for 10 epochs and report the validation loss after each one
train_inputs = [X_train_text, X_train_num]
val_inputs = [X_val_text, X_val_num]
model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=10,
    batch_size=16,
)


NameError: name 'Model' is not defined

In [None]:
# Evaluate the model on a test set
def _test_token_ids(text):
    """Return the Word2Vec index of every in-vocabulary token of `text`."""
    vocab = word2vec_model.key_to_index
    return [vocab[word] for word in str(text).split() if word in vocab]


# Encode the text columns exactly as the training features were encoded
X_test_text_style = pad_sequences(
    test_df['style'].apply(_test_token_ids).tolist(),
    maxlen=max_sequence_length_style,
)
X_test_text_name = pad_sequences(
    test_df['name'].apply(_test_token_ids).tolist(),
    maxlen=max_sequence_length_name,
)
X_test_text = np.concatenate([X_test_text_style, X_test_text_name], axis=1)

# Scale the numeric features with the min/max of the TRAINING frame. Fitting a
# new scaler on the test frame would map the same raw value to a different
# model input than the one used during training.
feature_columns = ['ibu', 'ounces']
train_scaler = MinMaxScaler().fit(beers_df[feature_columns].values)
X_test_num = train_scaler.transform(test_df[feature_columns].values)

# Reports the compiled loss (mean squared error) on the test rows
model.evaluate([X_test_text, X_test_num], test_df['abv'].values)
