<a href="https://colab.research.google.com/github/Akshitkt001/Music_GenAI/blob/main/text_to_music.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install tensorflow



In [3]:
!pip install keras



In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, Concatenate
from tensorflow.keras.models import Model

# Hyperparameters
text_sequence_length = 50  # Length of input text sequences
music_sequence_length = 100  # Length of music sequences
embedding_dim = 128  # Dimension of word embeddings
lstm_units = 256  # Number of LSTM units
output_dim = 128  # Number of units in the Dense output layer

# Define vocabulary size and music feature dimensions
vocab_size = 10000  # Replace this with the actual size of your vocabulary
music_feature_dim = 128  # Replace this with the actual dimension of your music features

# Define input layers
text_input = Input(shape=(text_sequence_length,), name='text_input')
music_features_input = Input(shape=(music_sequence_length, music_feature_dim), name='music_features_input')

# Text embedding layer
text_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(text_input)

# LSTM layers
lstm_layer1 = LSTM(lstm_units, return_sequences=True)(text_embedding)
lstm_layer2 = LSTM(lstm_units, return_sequences=True)(lstm_layer1)
lstm_layer3 = LSTM(lstm_units, return_sequences=True)(lstm_layer2)
lstm_layer4 = LSTM(lstm_units)(lstm_layer3)  # Last layer without return sequences

# RepeatVector to match time steps for music generation
repeat_layer = RepeatVector(music_sequence_length)(lstm_layer4)

# Concatenate text features with music features
concatenated = Concatenate(axis=-1)([repeat_layer, music_features_input])

# LSTM layer for music generation
music_lstm = LSTM(lstm_units, return_sequences=True)(concatenated)

# Output layer for music generation
music_output = Dense(output_dim)(music_lstm)

# Define the model
model = Model(inputs=[text_input, music_features_input], outputs=music_output)

# Compile the model (specify optimizer and loss function)
model.compile(optimizer='adam', loss='mean_squared_error')

# Print model summary
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 50)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 50, 128)      1280000     ['text_input[0][0]']             
                                                                                                  
 lstm_9 (LSTM)                  (None, 50, 256)      394240      ['embedding_2[0][0]']            
                                                                                                  
 lstm_10 (LSTM)                 (None, 50, 256)      525312      ['lstm_9[0][0]']                 
                                                                                              

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, Concatenate
from tensorflow.keras.models import Model
import numpy as np

# Hyperparameters
text_sequence_length = 50
music_sequence_length = 100
embedding_dim = 128
lstm_units = 256
output_dim = 128
vocab_size = 10000
music_feature_dim = 128

# Define input layers
text_input = Input(shape=(text_sequence_length,), name='text_input')
music_features_input = Input(shape=(music_sequence_length, music_feature_dim), name='music_features_input')

# Text embedding layer
text_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(text_input)

# LSTM layers for text analysis
lstm_layer1 = LSTM(lstm_units, return_sequences=True)(text_embedding)
lstm_layer2 = LSTM(lstm_units, return_sequences=True)(lstm_layer1)
lstm_layer3 = LSTM(lstm_units, return_sequences=True)(lstm_layer2)
lstm_layer4 = LSTM(lstm_units)(lstm_layer3)

# RepeatVector to match time steps for music generation
repeat_layer = RepeatVector(music_sequence_length)(lstm_layer4)

# Concatenate text features with music features
concatenated = Concatenate(axis=-1)([repeat_layer, music_features_input])

# LSTM layer for music generation
music_lstm = LSTM(lstm_units, return_sequences=True)(concatenated)

# Output layer for music generation
music_output = Dense(output_dim)(music_lstm)

# Define the model
model = Model(inputs=[text_input, music_features_input], outputs=music_output)

# Compile the model (specify optimizer and loss function)
model.compile(optimizer='adam', loss='mean_squared_error')

# Print model summary
model.summary()

# Generate some dummy data for demonstration
num_samples = 1000
dummy_text_data = np.random.randint(0, vocab_size, size=(num_samples, text_sequence_length))
dummy_music_features = np.random.random((num_samples, music_sequence_length, music_feature_dim))
dummy_music_labels = np.random.random((num_samples, music_sequence_length, output_dim))

# Train the model
model.fit(x=[dummy_text_data, dummy_music_features], y=dummy_music_labels, epochs=10)

# Save the trained model
model.save('text_to_music_model.h5')

# Load the trained model
loaded_model = tf.keras.models.load_model('text_to_music_model.h5')

# Now you can use the loaded model for text-to-music generation


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 50)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 50, 128)      1280000     ['text_input[0][0]']             
                                                                                                  
 lstm_14 (LSTM)                 (None, 50, 256)      394240      ['embedding_3[0][0]']            
                                                                                                  
 lstm_15 (LSTM)                 (None, 50, 256)      525312      ['lstm_14[0][0]']                
                                                                                            

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define your preprocess_text function
def preprocess_text(input_text, tokenizer, max_seq_length):
    # Tokenize the input text
    tokenized_text = tokenizer.texts_to_sequences([input_text])

    # Pad the tokenized sequence
    padded_text = pad_sequences(tokenized_text, maxlen=max_seq_length, padding='post')

    return padded_text

# Load the trained model
loaded_model = tf.keras.models.load_model('text_to_music_model.h5')

# Load or create a tokenizer based on your training data
# tokenizer = ...  # Load your tokenizer
max_seq_length = 50  # Maximum sequence length used in training

# Get user input
user_input = input("Enter a text: ")

# Preprocess the text input using the defined function
processed_text = preprocess_text(user_input, tokenizer, max_seq_length)

# Generate music features using the loaded model
dummy_music_features = np.random.random((1, music_sequence_length, music_feature_dim))  # You might replace this with real music features

generated_music = loaded_model.predict([processed_text, dummy_music_features])

# Print or visualize the generated music features
print(generated_music)



Enter a text: love is in the air


NameError: ignored

In [10]:
# Sample texts for demonstration
train_texts = [
    "Once upon a time in a land far away...",
    "Roses are red, violets are blue...",
    "It was the best of times, it was the worst of times...",
    "To be or not to be, that is the question...",
    # Add more sample texts here
]

# Assuming you have corresponding music features for each text
# Create dummy music features for demonstration
num_samples = len(train_texts)
music_sequence_length = 100
music_feature_dim = 128
train_music_features = np.random.random((num_samples, music_sequence_length, music_feature_dim))

# Rest of your code...
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Generate dummy data for demonstration
num_samples = 1000
train_texts = [f"sample text {i}" for i in range(num_samples)]
train_music_features = np.random.random((num_samples, music_sequence_length, music_feature_dim))
train_music_labels = np.random.random((num_samples, music_sequence_length, output_dim))

# Load or create a tokenizer based on your training data
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Define your preprocess_text function
def preprocess_text(input_text, tokenizer, max_seq_length):
    tokenized_text = tokenizer.texts_to_sequences([input_text])
    padded_text = pad_sequences(tokenized_text, maxlen=max_seq_length, padding='post')
    return padded_text

# Load the trained model
loaded_model = tf.keras.models.load_model('text_to_music_model.h5')

# Maximum sequence length used in training
max_seq_length = 50

# Get user input
user_input = input("Enter a text: ")

# Preprocess the text input using the defined function
processed_text = preprocess_text(user_input, tokenizer, max_seq_length)

# Generate music features using the loaded model
dummy_music_features = np.random.random((1, music_sequence_length, music_feature_dim))

generated_music = loaded_model.predict([processed_text, dummy_music_features])

# Print or visualize the generated music features
print(generated_music)


Enter a text: Roses are red, violets are blue you are beautiful just like you.
[[[0.43315443 0.43535122 0.5254298  ... 0.42742342 0.48194125 0.41093925]
  [0.49938145 0.48479247 0.50733525 ... 0.47366348 0.4932505  0.4884351 ]
  [0.5022191  0.49300835 0.50166124 ... 0.495313   0.4740264  0.5034019 ]
  ...
  [0.5000275  0.49020535 0.4987788  ... 0.49359336 0.49113816 0.51012945]
  [0.49886018 0.49719608 0.5035875  ... 0.4929995  0.47955528 0.50978965]
  [0.49083853 0.4962536  0.51700926 ... 0.48702016 0.4895042  0.5014919 ]]]


In [4]:
!pip install pydub


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [12]:
from pydub import AudioSegment

# Assuming `generated_music` contains the generated music features

# Scale the generated music features to the desired audio range
scaled_music = (generated_music - generated_music.min()) / (generated_music.max() - generated_music.min())
scaled_music = scaled_music * 2 - 1  # Map to the range [-1, 1]

# Convert scaled music to audio
audio_data = scaled_music.flatten()
audio_data = (audio_data * 32767).astype(np.int16)  # Convert to 16-bit PCM format

# Create an AudioSegment object
audio_segment = AudioSegment(
    audio_data.tobytes(),
    frame_rate=44100,  # Adjust the frame rate as needed
    sample_width=2,    # 16-bit audio
    channels=1          # Mono audio
)

# Save the audio segment as a WAV file
audio_filename = 'generated_music.wav'
audio_segment.export(audio_filename, format='wav')

print(f"Generated music saved as {audio_filename}")


Generated music saved as generated_music.wav


In [1]:
import numpy as np
from pydub import AudioSegment

# Assuming `loaded_model` and `preprocess_text` function are defined

# User input
user_input = input("Enter a text: ")

# Preprocess the text input
processed_text = preprocess_text(user_input, tokenizer, max_seq_length)

# Generate music features using the loaded model
dummy_music_features = np.random.random((1, music_sequence_length, music_feature_dim))  # Replace with real music features

# Generate multiple segments of music
num_segments = 5  # Number of segments to generate
generated_segments = []

for _ in range(num_segments):
    generated_music = loaded_model.predict([processed_text, dummy_music_features])
    generated_segments.append(generated_music)

# Rest of your code...


ModuleNotFoundError: ignored

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, Concatenate
from tensorflow.keras.models import Model
import numpy as np
from pydub import AudioSegment

# Hyperparameters
text_sequence_length = 50
music_sequence_length = 100
embedding_dim = 128
lstm_units = 256
output_dim = 128
vocab_size = 10000
music_feature_dim = 128

# Define input layers
text_input = Input(shape=(text_sequence_length,), name='text_input')
music_features_input = Input(shape=(music_sequence_length, music_feature_dim), name='music_features_input')

# Text embedding layer
text_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(text_input)

# LSTM layers for text analysis
lstm_layer1 = LSTM(lstm_units, return_sequences=True)(text_embedding)
lstm_layer2 = LSTM(lstm_units, return_sequences=True)(lstm_layer1)
lstm_layer3 = LSTM(lstm_units, return_sequences=True)(lstm_layer2)
lstm_layer4 = LSTM(lstm_units)(lstm_layer3)

# RepeatVector to match time steps for music generation
repeat_layer = RepeatVector(music_sequence_length)(lstm_layer4)

# Concatenate text features with music features
concatenated = Concatenate(axis=-1)([repeat_layer, music_features_input])

# LSTM layer for music generation
music_lstm = LSTM(lstm_units, return_sequences=True)(concatenated)

# Output layer for music generation
music_output = Dense(output_dim)(music_lstm)

# Define the model
model = Model(inputs=[text_input, music_features_input], outputs=music_output)

# Compile the model (specify optimizer and loss function)
model.compile(optimizer='adam', loss='mean_squared_error')

# Print model summary
model.summary()

# Generate some dummy data for demonstration
num_samples = 1000
dummy_text_data = np.random.randint(0, vocab_size, size=(num_samples, text_sequence_length))
dummy_music_features = np.random.random((num_samples, music_sequence_length, music_feature_dim))
dummy_music_labels = np.random.random((num_samples, music_sequence_length, output_dim))

# Train the model
model.fit(x=[dummy_text_data, dummy_music_features], y=dummy_music_labels, epochs=10)

# Save the trained model
model.save('text_to_music_model.h5')

# Load the trained model
loaded_model = tf.keras.models.load_model('text_to_music_model.h5')

# Sample texts for demonstration
train_texts = [
    "Once upon a time in a land far away...",
    "Roses are red, violets are blue...",
    "It was the best of times, it was the worst of times...",
    "To be or not to be, that is the question...",
    # Add more sample texts here
]

# Assuming you have corresponding music features for each text
# Create dummy music features for demonstration
num_samples = len(train_texts)
music_sequence_length = 100
music_feature_dim = 128
train_music_features = np.random.random((num_samples, music_sequence_length, music_feature_dim))

# User input
user_input = input("Enter a text: ")

# Preprocess the text input
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)
max_seq_length = text_sequence_length
processed_text = tokenizer.texts_to_sequences([user_input])
processed_text = tf.keras.preprocessing.sequence.pad_sequences(processed_text, maxlen=max_seq_length, padding='post')

# Generate music features using the loaded model
dummy_music_features = np.random.random((1, music_sequence_length, music_feature_dim))  # Replace with real music features

# Generate multiple segments of music
num_segments = 5  # Number of segments to generate
generated_segments = []

for _ in range(num_segments):
    generated_music = loaded_model.predict([processed_text, dummy_music_features])
    generated_segments.append(generated_music)

# Combine the generated music segments into a single audio segment
combined_audio_segment = AudioSegment.silent(duration=0)  # Initialize an empty audio segment

# Convert and add each generated music segment
for segment in generated_segments:
    scaled_music = (segment - segment.min()) / (segment.max() - segment.min())
    scaled_music = scaled_music * 2 - 1
    audio_data = scaled_music.flatten()
    audio_data = (audio_data * 32767).astype(np.int16)

    audio_segment = AudioSegment(
        audio_data.tobytes(),
        frame_rate=44100,  # Adjust the frame rate as needed
        sample_width=2,    # 16-bit audio
        channels=1          # Mono audio
    )

    combined_audio_segment += audio_segment

# Export the combined audio segment as a WAV file
combined_audio_filename = 'combined_music.wav'
combined_audio_segment.export(combined_audio_filename, format='wav')

print(f"Combined music saved as {combined_audio_filename}")


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 50)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 128)      1280000     ['text_input[0][0]']             
                                                                                                  
 lstm (LSTM)                    (None, 50, 256)      394240      ['embedding[0][0]']              
                                                                                                  
 lstm_1 (LSTM)                  (None, 50, 256)      525312      ['lstm[0][0]']                   
                                                                                              