<a href="https://colab.research.google.com/github/AutoImaginary/SingingAI/blob/main/SingingAI_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *__SingingAI__*：Google Colabで動くAIシンガーソングライター
### 作詞：ChatGPT（モデル：gpt-3.5-turbo-16k） (https://github.com/openai/openai-python)
### 作曲：Re-creation of Creations（ROC） (https://github.com/microsoft/muzic/tree/main/roc)
### 歌：NEUTRINO (https://studio-neutrino.com/)
### 伴奏：GETMusic (https://github.com/microsoft/muzic/tree/main/getmusic)
### 編曲：MusicGen (https://github.com/facebookresearch/audiocraft)
<br>

#### その他使用・参考プログラム
##### オーディオファイルのMIDI変換：Basic Pitch (https://github.com/spotify/basic-pitch)
##### エフェクト：pedalboard (https://github.com/spotify/pedalboard)
##### MIDIをMusicXMLに変換 (https://qiita.com/shimajiroxyz/items/a675075b44cf4c0a7487)
###### etc...

In [None]:
#@title ライブラリのインストール 所要時間：約10分
# Fetch the two source repositories and install the Python requirements.
!git clone https://github.com/microsoft/muzic.git
!git clone https://github.com/AutoImaginary/SingingAI.git
!pip install -r /content/SingingAI/requirements_colab.txt
!pip install midiutil fairseq cnsenti

# Download ROC.db and checkpoint_best.pt; both are moved from /content into
# /content/muzic/roc further down in this cell.
import gdown
gdown.download('https://drive.google.com/uc?id=1xk_oDVS6vfy4seVTZKQinN47KvIBS4Wx', 'ROC.db', quiet=False)
gdown.download('https://drive.google.com/uc?id=1pYexjuM-WN5DhdtfAklseXLF6v5Rrtht', 'checkpoint_best.pt', quiet=False)

# Text-processing and OpenAI client packages.
# NOTE(review): the extras spec transformers['ja'] relies on the shell
# stripping the quotes - confirm it resolves to transformers[ja].
!pip install transformers
!pip install transformers['ja']
!pip install sentencepiece
!pip install pykakasi
!pip install sudachidict_full
!pip install openai

# Reinstall miditoolkit from scratch; its midi/parser.py is overwritten with
# the SingingAI copy near the end of this cell.
!pip uninstall -y miditoolkit
!pip install miditoolkit

# Audio / MIDI tooling (fluidsynth, note-seq, basic-pitch, pedalboard, audiocraft, ...).
!pip install pyfluidsynth
!apt install fluidsynth
!pip install note-seq
!pip install einops
!pip install midi2audio
!apt-get update -qq && apt-get install -qq libfluidsynth2 build-essential libasound2-dev libjack-dev fluidsynth fluid-soundfont-gm
!pip install basic-pitch
!pip install pedalboard
# NOTE(review): audiocraft is installed twice (release, then git head), and
# `pip install -e .` runs in the current directory without any preceding cd -
# confirm whether a project actually lives there.
!pip install -U audiocraft  # stable release
!pip install -U git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft  # bleeding edge
!pip install -e .
!pip install mutagen

# Patch the muzic checkout with the SingingAI versions of the GETMusic scripts
# and download the GETMusic checkpoint and the soundfont.
# NOTE(review): nothing in this cell uses the noriakihanya/getmusic clone;
# the files are copied into /content/muzic/getmusic instead - confirm it is needed.
import shutil
!git clone https://github.com/noriakihanya/getmusic.git
shutil.copy("/content/SingingAI/track_generation.py","/content/muzic/getmusic/")
shutil.copy("/content/SingingAI/lr_scheduler.py","/content/muzic/getmusic/getmusic/engine/lr_scheduler.py")
gdown.download('https://drive.google.com/uc?id=1jDxs22GBx9V1Pv-8phn0pur_adBAuRnc', 'checkpoint.pth', quiet=False)
gdown.download('https://drive.google.com/uc?id=1Ntj27eliZ1eIab_5iIFtDNlRH1s46tt3', 'MuseScore_General.sf2', quiet=False)

# Put the ROC database/checkpoint in place and install the patched scripts.
# NOTE(review): the miditoolkit target path hard-codes python3.10 - this breaks
# silently if the runtime uses a different Python version.
shutil.move("/content/ROC.db","/content/muzic/roc/")
shutil.move("/content/checkpoint_best.pt","/content/muzic/roc/music-ckps/")
shutil.copy("/content/SingingAI/lyrics_to_melody.py","/content/muzic/roc/")
shutil.copy("/content/SingingAI/parser.py","/usr/local/lib/python3.10/dist-packages/miditoolkit/midi/parser.py")

# Download NEUTRINO and four voice libraries, unpack them, and move each voice
# model directory into /content/NEUTRINO/model/.
gdown.download('https://drive.google.com/uc?id=1HxqSPFr4us1Kh9CR_Jsug3Pa-RQforb4', 'NEUTRINO.zip', quiet=False)
gdown.download('https://drive.google.com/uc?id=1C-fAAnGyZPCp2E9cBzskwL8YwrOKIAtN', 'SEVEN.zip', quiet=False)
gdown.download('https://drive.google.com/uc?id=1F9Iayg-lj_WlUSwFmJ6SobXr1n7Q60PZ', 'ZUNDAMON.zip', quiet=False)
gdown.download('https://drive.google.com/uc?id=1-vqDdnS9yFoiLmbnjh8f4e81CZPlzssn', 'NAKUMO.zip', quiet=False)
gdown.download('https://drive.google.com/uc?id=1jE5RXJl0tWU79-9KxT-7otcYuDfpqDnl', 'METAN.zip', quiet=False)
!unzip NEUTRINO.zip
!unzip SEVEN.zip
!unzip ZUNDAMON.zip
!unzip NAKUMO.zip
!unzip METAN.zip
shutil.move("/content/No.7（NEUTRINO-Library）/SEVEN","/content/NEUTRINO/model/")
shutil.move("/content/ずんだもん（NEUTRINO-Library）/ZUNDAMON","/content/NEUTRINO/model/")
shutil.move("/content/ナクモ（NEUTRINO-Library）/NAKUMO","/content/NEUTRINO/model/")
shutil.move("/content/四国めたん（NEUTRINO-Library）/METAN","/content/NEUTRINO/model/")

In [None]:
#@title ライブラリのインポート
import pandas as pd
import re
import jaconv
import copy
import xml.etree.ElementTree as ET
import itertools
from sudachipy import tokenizer
from sudachipy import dictionary
from typing import Union, List, Tuple, Optional, Dict
from typing_extensions import TypedDict
import json
from pathlib import Path

import mido
from mido.midifiles.midifiles import *
from mido.midifiles.midifiles import _dbg
from mido.midifiles.meta import meta_charset, _charset

import note_seq
import fluidsynth

import random
import glob
import os
import shutil

class XFMidiFile(mido.MidiFile):
    @staticmethod
    def has_xfkm_chunk(filepath):
        """Report whether a MIDI file contains an XF Karaoke Message chunk.

        Args:
            filepath: Either a filesystem path (``str`` or ``os.PathLike``) or
                an already opened, seekable binary file object. A file object
                is read from its current position to the end and then rewound
                to where it was.

        Returns:
            bool: ``True`` when the byte sequence ``b'XFKM'`` occurs in the data.
        """
        # isinstance (rather than ``type(...) is str``) so that pathlib.Path
        # objects and str subclasses are treated as paths, not as file objects.
        if isinstance(filepath, (str, os.PathLike)):
            with open(filepath, "rb") as f:
                buf = f.read()
        else:
            # Hand the stream back to the caller at its original position.
            start_pos = filepath.tell()
            buf = filepath.read()
            filepath.seek(start_pos)

        # The chunk is detected purely by its 4-byte type tag.
        return b'XFKM' in buf

    # override
    def _load(self, infile):
        """Load the MIDI header and tracks, then any XF chunks that follow.

        The first part reads the file header and ``num_tracks`` tracks with the
        helpers star-imported from ``mido.midifiles.midifiles``. The remainder
        of the stream is then searched for ``XFIH`` and ``XFKM`` chunk tags;
        each chunk found is parsed with ``read_xf_track`` and stored on
        ``self.xfih`` / ``self.xfkm`` (both stay ``None`` when absent).

        Args:
            infile: Binary file object; ``tell``, ``seek`` and ``read`` are
                called on it.
        """
        if self.debug:
            infile = DebugFileWrapper(infile)

        with meta_charset(self.charset):
            if self.debug:
                _dbg('Header:')

            (self.type,
             num_tracks,
             self.ticks_per_beat) = read_file_header(infile)

            if self.debug:
                _dbg('-> type={}, tracks={}, ticks_per_beat={}'.format(
                    self.type, num_tracks, self.ticks_per_beat))
                _dbg()

            for i in range(num_tracks):
                if self.debug:
                    _dbg('Track {}:'.format(i))

                self.tracks.append(read_track(infile,
                                              debug=self.debug,
                                              clip=self.clip))
                # TODO: used to ignore EOFError. I hope things still work.

            # Everything below was added to support the XF format.

            self.xfih = None # holds the XF Information Header track
            self.xfkm = None # holds the XF Karaoke Message track

            # Remember where the regular MIDI tracks end.
            mtrk_end_index = infile.tell()
            # Read the rest of the stream in one go.
            rest_buf = infile.read()

            # Load the XF Information Header if present.
            # NOTE(review): the tag is located by a plain byte search, so the
            # first occurrence of these 4 bytes anywhere in the remaining data
            # is taken as the chunk start - confirm this cannot match payload.
            header = b'XFIH'
            if header in rest_buf:
                start_index = rest_buf.index(header)
                infile.seek(mtrk_end_index+start_index) # move infile to the chunk start
                if self.debug:
                    _dbg('Track {}:'.format(header))

                self.xfih = self.read_xf_track(infile,
                                        debug=self.debug,
                                        clip=self.clip)
            # Load the XF Karaoke Message chunk if present.
            header = b'XFKM'
            if header in rest_buf:
                start_index = rest_buf.index(header)
                infile.seek(mtrk_end_index+start_index) # move infile to the chunk start
                if self.debug:
                    _dbg('Track {}:'.format(header))

                self.xfkm = self.read_xf_track(infile,
                                        debug=self.debug,
                                        clip=self.clip)

            # Just in case, put infile back at the end of the MIDI tracks.
            infile.seek(mtrk_end_index)
            # End of the XF format additions.

    # 新たに定義。read_trackを一部変えただけ
    @staticmethod
    def read_xf_track(infile, debug=False, clip=False):
        """Read one XF chunk (``XFIH`` or ``XFKM``) as a ``MidiTrack``.

        Per the original author's note this is ``read_track`` with only the
        chunk-name check and its error message changed.

        Args:
            infile: Binary file object positioned at the chunk header.
            debug: When true, progress is reported through ``_dbg``.
            clip: Forwarded to ``read_sysex`` / ``read_message``.

        Returns:
            MidiTrack: The messages read from the chunk, in file order.

        Raises:
            IOError: If the chunk name is neither ``XFIH`` nor ``XFKM``, or if
                a running-status byte appears before any status byte.
        """
        track = MidiTrack()

        name, size = read_chunk_header(infile)

        # original check: if name != b'MTrk':
        if name not in (b'XFIH', b'XFKM'):# header condition rewritten for XF chunks
            # original: raise IOError('no MTrk header at start of track')
            raise IOError('no XF header at start of track')# message changed accordingly

        if debug:
            _dbg('-> size={}'.format(size))
            _dbg()

        start = infile.tell()
        last_status = None

        while True:
            # End of track reached.
            if infile.tell() - start == size:
                break

            if debug:
                _dbg('Message:')

            delta = read_variable_int(infile)

            if debug:
                _dbg('-> delta={}'.format(delta))

            status_byte = read_byte(infile)

            if status_byte < 0x80:
                # Data byte where a status byte was expected: running status,
                # so reuse the previous status and keep this byte as data.
                if last_status is None:
                    raise IOError('running status without last_status')
                peek_data = [status_byte]
                status_byte = last_status
            else:
                if status_byte != 0xff:
                    # Meta messages don't set running status.
                    last_status = status_byte
                peek_data = []

            if status_byte == 0xff:
                msg = read_meta_message(infile, delta)

            elif status_byte in [0xf0, 0xf7]:
                # TODO: I'm not quite clear on the difference between
                # f0 and f7 events.
                msg = read_sysex(infile, delta, clip)
            else:
                msg = read_message(infile, status_byte, peek_data, delta, clip)

            track.append(msg)

            if debug:
                _dbg('-> {!r}'.format(msg))
                _dbg()

        return track

    @staticmethod
    def get_xflyricinfo(filepath):
        """Extract the lyric information header stored in the XFKM chunk.

        The data is searched for the ``XFKM`` chunk tag and, after it, for the
        first ``FF 07`` meta event. That event is laid out as
        ``FF 07 <len> <text>`` and its text is expected to hold four
        colon-separated fields.

        Args:
            filepath: Either a filesystem path (``str`` or ``os.PathLike``) or
                a seekable binary file object, which is rewound to its
                original position after reading.

        Returns:
            tuple: ``(id, melody_channel, offset, lang)`` as strings, or four
            ``None`` values when the file has no XFKM chunk or the chunk has
            no readable ``FF 07`` event.

        Raises:
            ValueError: If the header text does not split into exactly four
                fields.
        """
        # isinstance so that pathlib.Path objects are opened, not treated as streams.
        if isinstance(filepath, (str, os.PathLike)):
            with open(filepath, "rb") as f:
                buf = f.read()
        else:
            start_pos = filepath.tell()
            buf = filepath.read()
            filepath.seek(start_pos)

        missing = (None, None, None, None)

        # Locate the XF Karaoke Message chunk.
        chunk_pos = buf.find(b'XFKM')
        if chunk_pos < 0:
            return missing
        xfkm_bytes = buf[chunk_pos:]

        # Locate the information header event; report "missing" (instead of
        # raising from bytes.index) when it is absent or cut off.
        header_pos = xfkm_bytes.find(b'\xff\x07')
        if header_pos < 0 or header_pos + 2 >= len(xfkm_bytes):
            return missing
        data_length = xfkm_bytes[header_pos + 2]  # single length byte after FF 07
        text = xfkm_bytes[header_pos + 3: header_pos + 3 + data_length]

        # No Japanese text is expected here, so decode without a special
        # charset and split on ":".
        lyric_id, melody_channel, offset, lang = text.decode().split(":")
        return lyric_id, melody_channel, offset, lang

class Note(TypedDict, total=False):
    """A single note or rest as passed between the MIDI2MusicXML helpers.

    All keys are optional (``total=False``); different pipeline stages add
    different keys.
    """
    # Kind of entry; the converter distinguishes "note_on" and "rest".
    type: str
    # presumably the MIDI channel and velocity of the note-on message - TODO confirm
    channel: int
    velocity: int
    # MIDI note number (converted to step/octave when writing MusicXML).
    note: int
    # Start and length in ticks.
    start_time: int
    duration_time: int
    # Start (relative to the measure) and length in divisions.
    start_division: int
    duration_division: int
    # Lyric text: raw value, displayed surface, and hiragana pronunciation.
    lyric_raw: str
    lyric_surface: str
    lyric_pronunciation: str
    # Tie markers ("start" / "stop") set when a note is split at a bar line.
    tie_type: List[str]

class Measure(TypedDict, total=False):
    """One measure of the score; all keys are optional (``total=False``)."""
    # Tick position at which the measure begins.
    start_time: int
    # Index of the measure within the song.
    measure_id: int
    # presumably the measure length in ticks / divisions - TODO confirm against callers
    duration_time: int
    duration_division: int
    # Notes and rests contained in the measure.
    notes: List[Note]
    # The time-information section this measure belongs to.
    time_information: dict

class TimeInformation(TypedDict, total=False):
    """A stretch of the song with constant tempo and time signature.

    All keys are optional (``total=False``).
    """
    # Time signature.
    numerator: int
    denominator: int
    ticks_per_beat: int
    # Tick position at which this section begins.
    start_time: int
    # Tempo value as carried by the set_tempo message.
    tempo: int
    notated_32nd_notes_per_beat: int
    # Derived measure length in ticks.
    ticks_per_measure: int
    # Index of the first measure of this section.
    start_measure_id: int
    # Note value used as the division unit (e.g. 32 for a 32nd note) and the
    # quantities derived from it.
    division_note: int
    divisions_per_measure: int
    ticks_per_division: int
    # Length of the section in ticks and in measures; filled in once the
    # section's end is known.
    duration_time: int
    measure_num: int

class MIDI2MusicXML:
    DEFAULT_TICKS_PER_BEAT = 480
    DEFAULT_TEMPO = 60000
    DEFAULT_NUMERATOR = 4
    DEFAULT_DENOMINATOR = 4
    DEFAULT_NOTATED_32ND_NOTES_PER_BEAT = 8
    DEFUALT_DIVISION_NOTE = 32

    @staticmethod
    def _add_start_time(messages, *, start_key = "start_time", deltatime_key = "time"):
            # 開始時間を付与
            deltatimes = [v[deltatime_key] for v in messages]
            start_times_iter = itertools.accumulate(deltatimes)
            messages = copy.deepcopy(messages)
            messages_with_start_time = []
            for v, s in zip(messages, start_times_iter):
              v[start_key] = s
              messages_with_start_time.append(v)
            return messages_with_start_time

    @staticmethod
    def _get_duration_times(note_ons, note_offs):
        note_ons = [("on", pitch,start_time) for pitch, start_time in note_ons]
        note_offs = [("off", pitch,start_time) for pitch, start_time in note_offs]
        # note_onとoffをまとめてstart_timeの昇順
        notes = sorted(note_ons+note_offs, key=lambda x: x[-1])

        duration_times = []
        start_times = []
        pitch_to_note_pos = {}
        for i, (type_, pitch, start_time) in enumerate(notes):
            if type_ == "on":
                duration_times.append(-1)
                start_times.append(start_time)
                pitch_to_note_pos[pitch] = len(start_times) - 1
            elif type_ == "off":
                if pitch in pitch_to_note_pos:
                    pos = pitch_to_note_pos[pitch]
                    duration_times[pos] = start_time - start_times[pos]
            # このケースは発生しないはず
                else:
                    print("there is no note corresponding this note_off")
                    continue
        return duration_times

    @classmethod
    def _add_duration_time(cls, note_on_and_offs, *
                              , start_key = "start_time"
                              , pitch_key = "note"
                              , type_key = "type"
                              , duration_key = "duration_time"
                              , note_on_type = "note_on"
                              , note_off_type = "note_off"):
        # 各noteの持続時間を計算
        note_ons = [(v[pitch_key], v[start_key]) for v in note_on_and_offs if v[type_key] == note_on_type]
        note_offs = [(v[pitch_key], v[start_key]) for v in note_on_and_offs if v[type_key] == note_off_type]
        duration_times = cls._get_duration_times(note_ons, note_offs)
        notes = [v.copy() for v in note_on_and_offs if v[type_key] == note_on_type]
        notes_with_duration_time = []
        for note, duration_time in zip(notes, duration_times):
            note[duration_key] = duration_time
            notes_with_duration_time.append(note)
        return notes_with_duration_time

    # messagesからtime_informationの情報を取得
    @classmethod
    def _get_time_informations(cls, messages, *
                                  , ticks_per_beat = None
                                  , tempo = None
                                  , numerator = None
                                  , denominator = None
                                  , notated_32nd_notes_per_beat = None
                                  , division_note = None
                               ):
        ticks_per_beat = ticks_per_beat or cls.DEFAULT_TICKS_PER_BEAT
        tempo = tempo or cls.DEFAULT_TEMPO
        numerator = numerator or cls.DEFAULT_NUMERATOR
        denominator = denominator or cls.DEFAULT_DENOMINATOR
        notated_32nd_notes_per_beat = notated_32nd_notes_per_beat or cls.DEFAULT_NOTATED_32ND_NOTES_PER_BEAT
        division_note = division_note or cls.DEFUALT_DIVISION_NOTE

        time_informations: List[TimeInformation] = []

        time_information: TimeInformation = {
          "numerator": numerator
          , "denominator": denominator
          , "ticks_per_beat": ticks_per_beat
          , "start_time": 0
          , "tempo": tempo
          , "notated_32nd_notes_per_beat": notated_32nd_notes_per_beat
          , "ticks_per_measure": int(ticks_per_beat / notated_32nd_notes_per_beat * 32 / denominator * numerator)
          , "start_measure_id": 0
          , "division_note": division_note
          , "divisions_per_measure": int(division_note / denominator * numerator)
          , "ticks_per_division": int(ticks_per_beat / notated_32nd_notes_per_beat * 32 / division_note)
        }

        time_informations.append(time_information)

        for m in messages:
          if m["type"] == "set_tempo":
            new_time_information = time_informations[-1].copy()
            new_time_information["tempo"] = m["tempo"]
            new_time_information["start_time"] = m["start_time"]
            if new_time_information["start_time"] == time_informations[-1]["start_time"]:
              time_informations[-1] = new_time_information
            else:
              duration_time = m["start_time"] - time_informations[-1]["start_time"]
              time_informations[-1]["duration_time"] = duration_time
              measure_num = int(duration_time/time_informations[-1]["ticks_per_measure"])
              time_informations[-1]["measure_num"] = measure_num
              new_time_information["start_measure_id"] = time_informations[-1]["start_measure_id"] + measure_num
              time_informations.append(new_time_information)
          elif m["type"] == "time_signature":
            """todo 必ず全部あるとは限らない？"""
            new_time_information = time_informations[-1].copy()
            new_time_information["numerator"] = m["numerator"]
            new_time_information["denominator"] = m["denominator"]
            new_time_information["notated_32nd_notes_per_beat"] = m["notated_32nd_notes_per_beat"]
            new_time_information["ticks_per_measure"] = int(new_time_information["ticks_per_beat"] / m["notated_32nd_notes_per_beat"] * 32  * m["numerator"] / m["denominator"])
            new_time_information["divisions_per_measure"] = int( new_time_information["division_note"]  * m["numerator"]/ m["denominator"])
            new_time_information["ticks_per_division"] = int(new_time_information["ticks_per_beat"] / new_time_information["notated_32nd_notes_per_beat"] * 32 / new_time_information["division_note"])
            new_time_information["start_time"] = m["start_time"]
            if new_time_information["start_time"] == time_informations[-1]["start_time"]:
              time_informations[-1] = new_time_information
            else:
              duration_time = m["start_time"] - time_informations[-1]["start_time"]
              time_informations[-1]["duration_time"] = duration_time
              measure_num = int(duration_time/time_informations[-1]["ticks_per_measure"])
              time_informations[-1]["measure_num"] = measure_num
              new_time_information["start_measure_id"] = time_informations[-1]["start_measure_id"] + measure_num
              time_informations.append(new_time_information)
        duration_time = m["start_time"] - time_informations[-1]["start_time"]
        time_informations[-1]["duration_time"] = duration_time
        measure_num = int(duration_time/time_informations[-1]["ticks_per_measure"])
        time_informations[-1]["measure_num"] = measure_num + 1

        return time_informations

    # notesが空の状態のmeasureのリストを取得する
    @staticmethod
    def _get_measures(time_informations):
        measures: List[Measure] = []
        for t in time_informations:
          for i in range(t["measure_num"]):
            measure_id = t["start_measure_id"] + i
            measure: Measure = {
              "measure_id": measure_id
              , "start_time": t["start_time"] + i * t["ticks_per_measure"]
              , "time_information": t
            }
            measures.append(measure)
        return measures

    @staticmethod
    def _split_notes_by_measure(notes, measure_end_times, *
                                , start_key = "start_time"
                                ):
        note_measures = [[] for _ in measure_end_times]
        measure_id = 0
        for note in notes:
          if measure_id == len(measure_end_times) - 1:
            note_measures[measure_id].append(note)
          else:
            while note[start_key] >= measure_end_times[measure_id]:
              measure_id += 1
            note_measures[measure_id].append(note)
        return note_measures

    @staticmethod
    def _add_tie(note_measures: List[List[Note]], measure_end_times: List[int], *
                                  , start_key = "start_time"
                                  , duration_key = "duration_time"
                                  , tie_type_key = "tie_type"
                                  , stop_type = "stop"
                                  , start_type = "start"
                                  ) -> List[List[Note]]:
        note_measures_with_tie: List[List[Note]] = [[] for _ in measure_end_times]
        for i, (notes, measure_end_time) in enumerate(zip(note_measures, measure_end_times)):
          for j, note in enumerate(notes):
            if j != len(notes) - 1:
                note_measures_with_tie[i].append(note)
                continue
            # 最後の音符がmeasureをまたぐとき
            if note[start_key] + note[duration_key] > measure_end_time:
                last_note = copy.deepcopy(note)
                last_note[duration_key] = measure_end_time - note[start_key]
                last_note[tie_type_key] = last_note.get(tie_type_key, []) + [start_type]
                note_measures_with_tie[i].append(last_note)

                rest_note = copy.deepcopy(note)
                rest_note[start_key] = measure_end_time
                rest_note[duration_key] = note[start_key] + note[duration_key] - measure_end_time
                rest_note[tie_type_key] = [stop_type]
                note_measures_with_tie[j+1].append(rest_note)
                #note_measures_with_tie[i+1].append(rest_note)
            # またがないときは単に追加
            else:
                note_measures_with_tie[i].append(note)
        return note_measures_with_tie

    @staticmethod
    def _add_division(note_measures: List[List[Note]], measure_start_times: List[int], measure_ticks_per_divisions: List[int], *
                                    , start_time_key = "start_time"
                                    , start_division_key = "start_division"
                                    , duration_time_key = "duration_time"
                                    , duraiton_division_key = "duration_division"
                                      ) -> List[List[Note]]:
        note_measures_with_division = [[] for _ in measure_start_times]
        #print(note_measures)
        for i, (notes, measure_start_time, measure_ticks_per_division) in enumerate(zip(note_measures, measure_start_times, measure_ticks_per_divisions)):
          tick2division = lambda x: round(x/measure_ticks_per_division)
          #tick2division = lambda x: math.ceil(x/measure_ticks_per_division)
          for note in notes:
            note[start_division_key] = tick2division(note[start_time_key] - measure_start_time)
            note[duraiton_division_key] = tick2division(note[duration_time_key])
            #print(note["duration_time"],note["duration_division"], measure_ticks_per_division)
            note_measures_with_division[i].append(note)
        return note_measures_with_division

    @staticmethod
    def _add_rest(notes: List[Note], divisions_per_measure: List[int], *
                          , start_key = "start_division"
                          , type_key = "type"
                          , rest_type = "rest"
                          , duration_key = "duration_division"
                          ):
        last_division = 0
        measure_with_rest = []
        for note in notes:
          diff_division = note[start_key] - last_division
          if diff_division > 0:
            rest = {}
            rest[type_key] = rest_type
            rest[duration_key] = diff_division
            rest[start_key] = last_division
            measure_with_rest.append(rest)
          measure_with_rest.append(note)
          last_division = note[start_key] + note[duration_key]
        if last_division < divisions_per_measure:
          rest = {}
          rest[type_key] = rest_type
          rest[duration_key] = divisions_per_measure - last_division
          rest[start_key] = last_division
          measure_with_rest.append(rest)

        return measure_with_rest

    @classmethod
    def _get_lyric(cls, start_time, lyric_dict):
        if start_time in lyric_dict:
          raw_text = lyric_dict[start_time]
          surface, pronunciation = cls._format_lyric(raw_text) # lryicの修正
          return raw_text, surface, pronunciation
        else:
          return None, None, None

    @classmethod
    def _add_lyric(cls, note, lyric_dict, *
                          , note_type = "note_on"
                          , type_key = "type"
                          , start_key = "start_time"
                          , lyric_raw_key = "lyric_raw"
                          , lyric_surface_key = "lyric_surface"
                          , lyric_pronunciation_key = "lyric_pronunciation"
                          ):
        note = copy.deepcopy(note)
        if note[type_key] != note_type:
          pass
        else:
          raw, surface, pronunciation = cls._get_lyric(note[start_key], lyric_dict)
          if raw is not None:
            note[lyric_raw_key], note[lyric_surface_key], note[lyric_pronunciation_key] = raw, surface, pronunciation
        return note

    @staticmethod
    def _format_lyric(text):
        """Split a raw lyric into its surface form and hiragana pronunciation.

        Everything from the first ``]`` on is dropped and katakana is turned
        into hiragana. If the text contains ``[``, the part before it is the
        surface and the part after it the pronunciation; otherwise both are
        the whole text. Finally every character of the pronunciation that is
        not hiragana or the long-vowel mark is removed.
        """
        # Drop the closing bracket (and anything behind it), then katakana -> hiragana.
        hira = jaconv.kata2hira(text.partition("]")[0])

        if "[" in hira:
            surface, pronunciation = hira.split("[")[:2]
        else:
            surface = pronunciation = hira

        # Keep only hiragana and the long-vowel mark in the pronunciation.
        pronunciation = re.sub("[^\u3041-\u309Fー]", "", pronunciation)
        return surface, pronunciation

    @staticmethod
    def _get_fixed_pronunciations(note_measures, *
                        , surface_key = "lyric_surface"
                        ):
        """Find particles whose sung pronunciation differs from their spelling.

        The lyric surfaces of all notes are concatenated and tokenised with
        Sudachi (full dictionary, split mode A). Every token that is the
        particle "は" or "へ" yields a correction to "わ" / "え" for the note
        holding the token's first character.

        Args:
            note_measures: Notes grouped per measure; notes without a
                (non-empty) ``surface_key`` are skipped.
            surface_key: Key holding a note's lyric surface.

        Returns:
            list: ``[pronunciation, measure_id, note_id]`` entries, in text
            order. Empty when there are no lyrics at all.
        """
        # Spelling of the particle -> how it is pronounced.
        particle_readings = {"は": "わ", "へ": "え"}

        # For every character of the concatenated text, remember which note it
        # came from (list index == character position).
        char_owner = []
        pieces = []
        for measure_id, measure in enumerate(note_measures):
            for note_id, note in enumerate(measure):
                surface = note.get(surface_key, "")
                if not surface:
                    continue
                pieces.append(surface)
                char_owner.extend([(measure_id, note_id)] * len(surface))
        text = "".join(pieces)
        if not text:
            # Nothing to analyse; skip loading the dictionary altogether.
            return []

        tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
        mode = tokenizer.Tokenizer.SplitMode.A

        pronunciations = []
        surface_pos = 0
        for token in tokenizer_obj.tokenize(text, mode):
            surface = token.surface()
            reading = particle_readings.get(surface)
            if reading is not None and token.part_of_speech()[0] == "助詞":
                measure_id, note_id = char_owner[surface_pos]
                pronunciations.append([reading, measure_id, note_id])
            surface_pos += len(surface)
        return pronunciations

    @staticmethod
    def _add_rule_base_lyric(note_measures, *
                                        , lyric_raw_key = "lyric_raw"
                                        , lyric_surface_key = "lyric_surface"
                                        , lyric_pronunciation_key = "lyric_pronunciation"
                                        , type_key = "type"
                                        , note_type = "note_on"
                                        ):
        note_measures = copy.deepcopy(note_measures)
        notes_flatten = []
        for measure_id, notes in enumerate(note_measures):
          notes_flatten += [(measure_id, note) for note in notes]
        temp = [[] for _ in note_measures]
        for i, (measure_id, note) in enumerate(notes_flatten):
          if lyric_raw_key in note:
            temp[measure_id].append(note)
          elif note[type_key] != note_type:
            temp[measure_id].append(note)
          else:
            if i == 0 or notes_flatten[i-1][1][type_key] != note_type:
              note[lyric_raw_key] = ""
              note[lyric_surface_key] = "あ"
              note[lyric_pronunciation_key] = "あ"
            else:
              #note[lyric_raw_key] = ""
              #note[lyric_surface_key] = "ー"
              #note[lyric_pronunciation_key] = "ー"
              note[type_key] = "rest"
            temp[measure_id].append(note)
        return temp

    @staticmethod
    def _get_template_xml(path):
        tree = ET.parse(path)
        root = tree.getroot()
        template = {
          "note": copy.deepcopy(root.find("./part//note"))
          , "rest": copy.deepcopy(root.find(".//note/rest/.."))
          , "attribute": copy.deepcopy(root.find("./part//attributes"))
          , "measure": copy.deepcopy(root.find("./part//measure"))
          , "tree": tree
        }
        # measureの中身を空にする
        template["measure"].clear()
        return template

    @staticmethod
    def _get_attribute_xml(attribute_xml_template, time_information: TimeInformation,tempo):
        t = time_information
        attribute = copy.deepcopy(attribute_xml_template)
        # beats_per_minuteを計算

        attribute.find("./divisions").text = str(t["notated_32nd_notes_per_beat"])
        """
        # beat情報があれば更新する
        beats = measure.find(".//attributes//time/beats")
        beattype = measure.find(".//attributes//time/beat-type")
        """
        attribute.find(".//time/beats").text = str(t["numerator"])
        attribute.find(".//time/beat-type").text = str(t["denominator"])
        # beats_per_minuteを計算
        #beats_per_minute = int(60/t["tempo"]*1e6)
        beats_per_minute = tempo
        attribute.find(".//sound").attrib["tempo"] = str(beats_per_minute)
        return attribute

    # MIDIのnote番号をmusicxmlの音階に変換
    @staticmethod
    def _note_to_pitch(note_number: int) -> Tuple[str, int]:
        # 60がC4
        octave = note_number // 12 - 1
        step_num = note_number % 12
        steps = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        return steps[step_num], octave

    @classmethod
    def _get_note_xml(cls, note_xml_template, note: Note):
        note_xml = copy.deepcopy(note_xml_template)
        step, octave = cls._note_to_pitch(int(note["note"]))
        # octaveは基本0以上のはずだが念の為チェック
        if octave < 0:
          print("warning: octave is too small. reset to 0")
          octave = 0
        if octave > 9:
          print("warning: octave is too large. reset to 9")
          octave = 9
        note_xml.find(".//step").text = str(step[0]) # シャープの可能性があるので最初の１文字だけ
        note_xml.find(".//octave").text = str(octave)
        # シャープの判定
        is_sharp = (len(step) == 2)
        if is_sharp:
          ET.SubElement(note_xml.find(".//pitch"), "alter")
          note_xml.find(".//alter").text = "1"
        # タイの処理
        if "tie_type" in note:
          ties = note["tie_type"]
          for tie in ties:
            ET.SubElement(note_xml, "tie")
            note_xml.find(".//tie").attrib["type"] = tie
        # 歌詞の追加
        #print(note)
        note_xml.find(".//lyric/text").text = note["lyric_pronunciation"]
        # durationの追加
        note_xml.find(".//duration").text = str(note["duration_division"] * random.choice([0.75,1.0]))
        # 追加
        return note_xml

    @staticmethod
    def _get_rest_xml(rest_xml_template, note):
        note_xml = copy.deepcopy(rest_xml_template)
        note_xml.find("./duration").text = str(note["duration_division"])
        return note_xml

    @classmethod
    def _get_musicxml_tree(cls, measures: List[Measure], template_path: str
                          ) -> ET.ElementTree:
        template = cls._get_template_xml(template_path)

        tree = ET.parse(template_path)
        root = tree.getroot()
        # partの中身を空にする
        part = root.find("./part")
        part.clear()

        time_information = None
        for measure in measures:
          measure_xml = copy.deepcopy(template["measure"])
          if measure["time_information"] != time_information:
            time_information = measure["time_information"]
            attribute = cls._get_attribute_xml(template["attribute"], time_information, tempo)
            measure_xml.append(attribute)

          for note in measure["notes"]:
            if note["type"] == "note_on":
              note_xml = cls._get_note_xml(template["note"], note)
              # 追加
              measure_xml.append(note_xml)
            elif note["type"] == "rest":
              rest_xml = cls._get_rest_xml(template["rest"], note)
              # 追加
              measure_xml.append(rest_xml)
            else:
              # このケースは存在しないはず
              pass
          # partに追加
          part.append(measure_xml)
        return tree

    @staticmethod
    def _get_musicxml_header_string(template_path: str
                                ) -> str:
        # 最初の２行を取り出す
        with open(template_path) as f:
          header = "\n".join(f.read().splitlines()[:2])
        return header

    @classmethod
    def _get_musicxml_string(cls, measures: List[Measure], template_path: str) -> str:
        tree = cls._get_musicxml_tree(measures, template_path)
        tree_string = ET.tostring(tree.getroot(), encoding="unicode")
        header_string = cls._get_musicxml_header_string(template_path)
        return header_string + "\n" + tree_string

# Create the working directories used by the notebook (no error if they exist).
for _workdir in (
    "/content/00_tmp/",
    "/content/01_input",
    "/content/02_output_midi",
    "/content/03_output_mp3",
):
    os.makedirs(_workdir, exist_ok=True)

print("完了")

# ChatGPTによる歌詞の生成

In [3]:
#@title 歌詞の入力・ひらがな変換
from ipywidgets import *
from IPython.display import display, clear_output
from time import sleep
from datetime import datetime
import openai
import pykakasi
import re
import os

# Text box for the OpenAI API key; it is read when the "AI作詞" button is clicked.
open_apikey = Text(value='',
            placeholder='API KEY',
            description='OpenAI:',
            disabled=False,
            layout=Layout(width='60%', height='40px'))

# Writer persona that is inserted into the prompt.
text_wrt = Text(value='YOASOBI',
            placeholder='入力',
            description='AI ライター:',
            disabled=False,
            layout=Layout(width='60%', height='40px'))

# Theme of the song that is inserted into the prompt.
text_thm = Text(value='',
            placeholder='入力',
            description='テーマ:',
            disabled=False,
            layout=Layout(width='60%', height='40px'))

# Lyrics: filled by the AI button, and used as input of the hiragana conversion.
lyrics = Textarea(value='',
            placeholder='入力',
            description='歌詞:',
            disabled=False,
            layout=Layout(width='60%', height='200px'))

# Hiragana version of the lyrics; its content is what the "決定" button saves.
hira = Textarea(value='',
            placeholder='',
            description='ひらがな:',
            disabled=False,
            layout=Layout(width='60%', height='400px'))

# Lyrics generation: button, output area and progress bar.
button_lyr = widgets.Button(description="AI作詞")
output_lyr = widgets.Output()  # output area
fp_lyr = FloatProgress(min=0, max=100, step=0.1, description='処理状況:', layout=Layout(width='60%', height='40px'))

# Hiragana conversion: button, output area and progress bar.
button_hira = widgets.Button(description="ひらがな変換")
output_hira = widgets.Output()  # output area
fp_hira = FloatProgress(min=0, max=100, step=0.1, description='処理状況:', layout=Layout(width='60%', height='40px'))

# Confirmation button and the output area that receives its completion message.
button_hira_re = widgets.Button(description="決定")
output_done = widgets.Output()

def on_button_lyr_clicked(b):
    """Ask the OpenAI chat model for lyrics and put them into the ``lyrics`` box.

    The prompt is built from the writer persona (``text_wrt``) and the theme
    (``text_thm``); the API key is read from ``open_apikey``. Both the legacy
    (openai<1.0) and the current (openai>=1.0) client interfaces are supported.
    If the request fails, the error is printed into ``output_lyr`` and the
    lyrics box is left unchanged.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    output_lyr.clear_output(True)  # discard the output of the previous click

    # The progress bar is only animated around the request; it does not
    # measure the request itself.
    for i in range(0, 51):
        sleep(0.1)
        fp_lyr.value = i

    model_name = "gpt-3.5-turbo-16k"
    messages = [{"role": "system", "content": f'あなたは{text_wrt.value}です。{text_thm.value}をテーマにした歌詞を作ってください'}]

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: the module-level ChatCompletion API no longer works.
            client = openai.OpenAI(api_key=open_apikey.value)
            completion = client.chat.completions.create(model=model_name, messages=messages)
            generated = completion.choices[0].message.content
        else:
            # openai<1.0: legacy module-level interface.
            openai.api_key = open_apikey.value
            completion = openai.ChatCompletion.create(model=model_name, messages=messages)
            generated = completion["choices"][0]["message"]["content"]
    except Exception as err:  # UI callback boundary: report instead of losing the error
        fp_lyr.value = 0
        with output_lyr:
            print(f"歌詞の生成に失敗しました: {err}")
        return

    for j in range(50, 101):
        sleep(0.1)
        fp_lyr.value = j

    with output_lyr:
        # The content may be None in the current client; treat it as empty text.
        lyrics.value = (generated or "").strip()

def on_button_hira_clicked(b):
    """Convert the text of the ``lyrics`` box to hiragana and show it in ``hira``.

    The lyrics are saved to a scratch file, cleaned up (whitespace-only lines
    dropped, text in half-width parentheses removed, ``、``, full-width spaces
    and line breaks turned into single spaces), converted with pykakasi, and
    finally split so that every space becomes a line break.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    output_hira.clear_output(True)
    kks = pykakasi.kakasi()

    path_w = "/content/00_tmp/singingai_w.txt"

    # Mode "w" truncates an existing file, so no separate removal is needed.
    # The same encoding is used for writing and reading.
    with open(path_w, mode='w', encoding="utf-8") as f:
        f.write(lyrics.value)

    # Reading the text back line by line lets the file layer normalise the
    # line endings before the clean-up.
    pieces = []
    with open(path_w, encoding="utf-8") as f:
        for line in f:
            if line.isspace():
                continue
            line = re.sub(r'\(.+?\)', '', line)
            line = line.replace('、', ' ').replace('　', ' ').replace('\n', ' ')
            pieces.append(line)
    lyrics_s = "".join(pieces)

    # Keep the cleaned text in the scratch file; the handle is closed (and
    # therefore flushed) before the function continues.
    with open(path_w, "w", encoding="utf-8") as f:
        f.write(lyrics_s)

    # The progress bar is only animated around the conversion.
    for i in range(0, 51):
        sleep(0.1)
        fp_hira.value = i
    result = kks.convert(lyrics_s)

    for j in range(50, 101):
        sleep(0.1)
        fp_hira.value = j
    kasi = ''.join([item['hira'] for item in result])
    kasi = kasi.replace(' ', '\n')
    with output_hira:
        hira.value = kasi

@output_done.capture(clear_output=True)
def on_button_re_clicked(b):
    """Save the hiragana lyrics, prefixed with ``ja ``, as a single line.

    Line breaks in the ``hira`` box are replaced by spaces and the result is
    written to ``/content/00_tmp/singingai_ja_hira.txt``. A timestamped
    completion message is printed (captured by ``output_done``).

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    path_hira = "/content/00_tmp/singingai_ja_hira.txt"
    single_line = 'ja ' + hira.value.replace('\n', ' ')

    if os.path.isfile(path_hira):
        os.remove(path_hira)

    with open(path_hira, "w") as out_file:
        out_file.write(single_line)

    print(f'完了 {datetime.now()}')

def check_len(submit):
    """Show the character count of the changed value in the ``html`` label.

    Args:
        submit: Change notification; its ``'new'`` entry is the text whose
            length is displayed.
    """
    char_count = len(submit['new'])
    html.value = f"入力文字数：{char_count}"

# Update the character counter whenever the content of the hiragana box changes.
hira.observe(check_len, names='value')
# Label that shows the character count (initialised from the current content).
html=HTML(value="※入力文字数："+str(len(hira.value)))

# Lyrics generation section: API key, persona/theme inputs, button, output, progress bar.
display(open_apikey)
display(text_wrt, text_thm)
button_lyr.on_click(on_button_lyr_clicked)
display(button_lyr, output_lyr, fp_lyr)
display(lyrics)

# Hiragana conversion section, followed by the counter and usage notes.
display(button_hira,fp_hira,hira)
display(html)
display(HTML(value="無料版GoogleColabの場合、250文字以内推奨 ※後の処理でメモリをオーバーすることがあります"))
display(HTML(value="漢字が正しくひらがなに変換されないことがあります。その場合は訂正してください。"))
display(HTML(value="NEUTRINOでは、「ー」（伸ばし棒）がエラーとなることがあります。"))
button_hira.on_click(on_button_hira_clicked)
# Confirmation button; its handler prints into output_done.
display(button_hira_re, output_done)

button_hira_re.on_click(on_button_re_clicked)

Text(value='', description='OpenAI:', layout=Layout(height='40px', width='60%'), placeholder='API KEY')

Text(value='YOASOBI', description='AI ライター:', layout=Layout(height='40px', width='60%'), placeholder='入力')

Text(value='', description='テーマ:', layout=Layout(height='40px', width='60%'), placeholder='入力')

Button(description='AI作詞', style=ButtonStyle())

Output()

FloatProgress(value=0.0, description='処理状況:', layout=Layout(height='40px', width='60%'))

Textarea(value='', description='歌詞:', layout=Layout(height='200px', width='60%'), placeholder='入力')

Button(description='ひらがな変換', style=ButtonStyle())

FloatProgress(value=0.0, description='処理状況:', layout=Layout(height='40px', width='60%'))

Textarea(value='', description='ひらがな:', layout=Layout(height='400px', width='60%'), placeholder='')

HTML(value='※入力文字数：0')

HTML(value='無料版GoogleColabの場合、250文字以内推奨 ※後の処理でメモリをオーバーすることがあります')

HTML(value='漢字が正しくひらがなに変換されないことがあります。その場合は訂正してください。')

HTML(value='NEUTRINOでは、「ー」（伸ばし棒）がエラーとなることがあります。')

Button(description='決定', style=ButtonStyle())

Output()

# ROCによるメロディの生成

In [None]:
#@title メロディーの生成
from sys import set_coroutine_origin_tracking_depth

# Tempo selected in the Colab form. song_tempo keeps the value for later cells,
# which read song_tempo rather than tempo.
tempo = 120 #@param {type:"slider", min:80, max:160, step:1}
song_tempo = tempo

%cd /content/muzic/roc

# Remove MIDI files left in the scratch directory.
for file in glob.glob('/content/00_tmp/*.mid'):
    os.remove(file)

# Run the ROC script with a new random chord progression until a .mid file
# appears in /content/muzic/roc.
# NOTE(review): there is no attempt limit, so a failure that repeats on every
# run (e.g. a missing lyrics file) keeps this loop running forever.
while True:
    chord_list = []

    root = ["C:","C#:","D:","Eb:","E:","F:","F#:","G:","Ab:","A:","Bb:","B:"]
    chord_kind = ["","m","+","dim","7","maj7","m7","m7b5"]

    # Five distinct roots, each combined with a randomly chosen chord quality.
    root_list = random.sample(root,5)
    for l in root_list:
      chord_list.append(f'{l}{random.choice(chord_kind)}')

    # The chords are written space-separated on one line.
    chord_list = ' '.join(chord_list)

    with open("/content/00_tmp/chord.txt", "w") as f:
      f.write(chord_list)

    # $tempo is expanded by IPython from the Python variable above.
    !python /content/muzic/roc/lyrics_to_melody.py \
    --lyrics_path=/content/00_tmp/singingai_ja_hira.txt \
    --chord_path=/content/00_tmp/chord.txt \
    --db_path=/content/muzic/roc/ROC.db \
    --debug='true' \
    --sentiment='true' \
    --tempo=$tempo

    if glob.glob('/content/muzic/roc/*.mid') != []:
        break
    else:
        print("再作成...")

# Move the generated MIDI file(s) to the scratch directory.
for file in glob.glob('/content/muzic/roc/*.mid'):
    shutil.move(file, '/content/00_tmp/')

# Display the MIDI file
# filepath ends up as the last match returned by glob.
for file in glob.glob('/content/00_tmp/*.mid'):
    filepath = file

# Only referenced by the commented-out playback code below.
SAMPLE_RATE = 16000
SF2_PATH = '/content/MuseScore_General.sf2'

#melody_ns = note_seq.midi_file_to_note_sequence(filepath)

#melody_instrument = note_seq.infer_melody_for_sequence(melody_ns)
#notes = [note for note in melody_ns.notes if note.instrument == melody_instrument]
#del melody_ns.notes[:]
#melody_ns.notes.extend(sorted(notes, key=lambda note: note.start_time))

#note_seq.play_sequence(melody_ns, synth=note_seq.fluidsynth, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH)
#note_seq.plot_sequence(melody_ns)

# NEUTRINOで歌声生成

In [None]:
#@title 歌ファイルの作成

# Build MusicXML from the generated MIDI.
# When several .mid files exist, the last match returned by glob is used.
for file in glob.glob('/content/00_tmp/*.mid'):
    filepath = file

midi = mido.MidiFile(filepath)
messages = [m.dict() for m in midi.tracks[1]]
# A message with velocity 0 is treated as a note_off.
for n in messages:
    if n.get("velocity") == 0:
        n["type"] = "note_off"
messages = MIDI2MusicXML._add_start_time(messages)

# Compare the type with ==. Writing `in ("note_on")` would be a substring test,
# because parentheses without a trailing comma do not make a tuple.
notes_on = [m for m in messages
            if m["type"] == "note_on" and m.get("velocity") != 0 and m.get("channel", -1) == 0]
notes_off = [m for m in messages
             if m["type"] == "note_off" and m.get("velocity") == 0 and m.get("channel", -1) == 0]

# Pair the n-th note_on with the n-th note_off; pairs whose pitches differ are skipped.
temp1: List[Note] = []
for note_on, note_off in zip(notes_on, notes_off):
    if note_on["note"] != note_off["note"]:
        continue
    note_time = note_on["time"] if note_off["time"] == 0 else note_off["time"]
    note: Note = {"type": note_on["type"],
                  "start_time": note_on["start_time"],
                  "duration_time": note_time,
                  "note": note_on["note"]}
    temp1.append(note)

# Recompute every duration as the distance to the start of the following note,
# so that each note lasts until the next one begins.
temp2: List[Note] = []
last_index = len(temp1) - 1
for t, current in enumerate(temp1):
    if t != last_index:
        note_time = temp1[t + 1]["start_time"] - current["start_time"]
    else:
        # The last note has no successor and gets a fixed length
        # (presumably in MIDI ticks - TODO confirm).
        note_time = 1920

    note: Note = {"type": current["type"],
                  "start_time": current["start_time"],
                  "duration_time": note_time,
                  "note": current["note"]}
    temp2.append(note)

notes = temp2

# Derive the measures from the time information of the melody track messages
# and distribute the notes over them.
time_informations = MIDI2MusicXML._get_time_informations(messages)
measures = MIDI2MusicXML._get_measures(time_informations)

measure_end_times = [m["start_time"]+m["time_information"]["ticks_per_measure"] for m in measures]
note_measures = MIDI2MusicXML._split_notes_by_measure(notes, measure_end_times)

# Add tie information
# NOTE(review): measure_end_times is recomputed here with the same expression as above.
measure_end_times = [(m["start_time"]+m["time_information"]["ticks_per_measure"]) for m in measures]
note_measures = MIDI2MusicXML._add_tie(note_measures, measure_end_times)

# Add to each note its division offset from the measure start and its duration in divisions
measure_start_times = [m["start_time"] for m in measures]
measure_ticks_per_divisions = [m["time_information"]["ticks_per_division"] for m in measures]
note_measures = MIDI2MusicXML._add_division(note_measures, measure_start_times, measure_ticks_per_divisions)

# Add rests
divisions_per_measures = [m["time_information"]["divisions_per_measure"] for m in measures]
note_measures = [MIDI2MusicXML._add_rest(notes, divisions_per_measure) for notes, divisions_per_measure in zip(note_measures, divisions_per_measures)]

# Dump the intermediate result to JSON and read it straight back.
# NOTE(review): the reload replaces note_measures with the JSON-decoded copy.
with open("/content/00_tmp/singingai.json", "w") as f:
    json.dump(note_measures, f, indent=2, ensure_ascii=False)

with open("/content/00_tmp/singingai.json") as f:
  note_measures = json.load(f)

# Re-open the MIDI file and take the messages of track 0, from which the
# lyric events are collected below.
midi = mido.MidiFile(filepath,charset='utf-8')
messages = [m.dict() for m in midi.tracks[0]]
lyrics = MIDI2MusicXML._add_start_time(messages)

# Get song information
# NOTE(review): the detected melody_channel is overwritten with 0 on the next line.
_, melody_channel, _, _ = XFMidiFile.get_xflyricinfo(filepath)
melody_channel = 0

# Associate lyrics with "start_time"
d_start = []
d_text = []
for v in temp2:
    d_start.append(v['start_time'])

# Remove duplicate start times while keeping their order.
d_start = list(dict.fromkeys(d_start))

# Collect the text of the lyric events, skipping the first two messages.
for v in lyrics[2:]:
    if v["type"] == 'lyrics':
        d_text.append(v['text'])
    else:
        pass

# Pair start times and syllables by position; zip stops at the shorter list.
d_lyric = dict(zip(d_start, d_text))
print(d_lyric)
lyric_dict = d_lyric

# NOTE(review): this loop only assigns start_time and has no other visible effect.
for notes in note_measures:
  for note in notes:
    if note["type"] == "note_on":
      start_time = note["start_time"]

note_measures = [[MIDI2MusicXML._add_lyric(note, lyric_dict) for note in notes] for notes in note_measures]

# Fix pronunciations: 「は」/「へ」 become 「わ」/「え」
fixed_pronunciations = MIDI2MusicXML._get_fixed_pronunciations(note_measures)
for pronunciation, measure_id, note_id in fixed_pronunciations:
    note_measures[measure_id][note_id]["lyric_pronunciation"] = pronunciation

# Apply the rule-based lyric processing
note_measures = MIDI2MusicXML._add_rule_base_lyric(note_measures)

# Rebuild the measures, this time from track 0, and attach the notes to them.
messages = [m.dict() for m in midi.tracks[0]]
messages = MIDI2MusicXML._add_start_time(messages)
time_informations = MIDI2MusicXML._get_time_informations(messages)
measures = MIDI2MusicXML._get_measures(time_informations)
# measure_num looks suspicious...
# NOTE(review): assumes note_measures has at least len(measures) entries.
for i in range(len(measures)):
    measures[i]["notes"] = note_measures[i]

with open("/content/00_tmp/singingai_measures_with_notes.json", "w") as f:
  json.dump(measures, f, indent=2, ensure_ascii=False)

with open("/content/00_tmp/singingai_measures_with_notes.json") as f:
  measures = json.load(f)

template_path = "/content/SingingAI/template.musicxml"
# NOTE(review): template is not used again in this cell.
template = MIDI2MusicXML._get_template_xml(template_path)

# The score tempo is twice the selected song tempo.
# NOTE(review): presumably read as a global by _get_musicxml_tree, which is
# called without a tempo argument - confirm.
tempo = song_tempo * 2
tree = MIDI2MusicXML._get_musicxml_tree(measures, template_path)
ET.dump(tree)

# NOTE(review): header is not used again in this cell; _get_musicxml_string
# adds the header itself.
header = MIDI2MusicXML._get_musicxml_header_string(template_path)
musicxml_string = MIDI2MusicXML._get_musicxml_string(measures, template_path)

with open("/content/00_tmp/singingai.musicxml", "w") as f:
  f.write(musicxml_string)

# Run NEUTRINO
%cd /content/NEUTRINO
!bash /content/SingingAI/permission.sh

# Voice model selected in the Colab form.
vocal = "ZUNDAMON" #@param ["MERROW","NAKUMO", "SEVEN", "ZUNDAMON", "METAN"] {allow-input: false}

# Create a copy of Run.sh that uses the selected voice and the base name "singingai".
with open("/content/NEUTRINO/Run.sh", mode="r") as f:
  basename = f.read()
  basename = basename.replace('ModelDir=MERROW', f'ModelDir={vocal}')
  basename = basename.replace('BASENAME=sample1', 'BASENAME=singingai')

with open("/content/NEUTRINO/singingai_Run.sh", mode="w") as g:
  g.write(basename)

# Copy the generated score into NEUTRINO's MusicXML score directory.
import shutil
src_path = "/content/00_tmp/singingai.musicxml"
copy_path = "/content/NEUTRINO/score/musicxml/singingai.musicxml"
shutil.copy(src_path, copy_path)

!sh /content/NEUTRINO/singingai_Run.sh

In [7]:
#@title wavファイルの再生
import IPython.display
from pedalboard import *
from pedalboard.io import AudioFile

# Effect chain for the vocal: chorus, compressor, reverb (room size 0.25) and a 4 dB gain.
board = Pedalboard([
    Chorus(),
    Compressor(),
    Reverb(room_size=0.25),
    Gain(gain_db=4),
])

# Stream the NEUTRINO vocal through the chain into a second file. The output
# file takes its sample rate and channel count from the input.
with AudioFile("/content/NEUTRINO/output/singingai_world.wav") as dry, \
        AudioFile("/content/NEUTRINO/output/singingai_world_efct.wav", 'w',
                  dry.samplerate, dry.num_channels) as wet:
    # Process one second of audio per iteration until the input is exhausted.
    while dry.tell() < dry.frames:
        one_second = dry.read(dry.samplerate)
        # reset=False: the effects are not reset between chunks.
        wet.write(board(one_second, dry.samplerate, reset=False))

# Play the vocal without and with the effects.
for caption, wav_path in (
        ("エフェクト無し", "/content/NEUTRINO/output/singingai_world.wav"),
        ("エフェクト有り", "/content/NEUTRINO/output/singingai_world_efct.wav")):
    print(caption)
    IPython.display.display(IPython.display.Audio(wav_path))

エフェクト無し


エフェクト有り


# GETMusicによる伴奏の生成

In [None]:
#@title 伴奏トラックの生成
import glob
import os
import shutil
import mido
import mido
from mido import Message, MidiFile, MidiTrack, MetaMessage

# Delete the transcription of a previous run before basic-pitch is started.
if(os.path.isfile("/content/00_tmp/singingai_world_basic_pitch.mid")):
    os.remove("/content/00_tmp/singingai_world_basic_pitch.mid")

# Transcribe the singing voice to MIDI with Basic Pitch
# ($song_tempo is expanded by IPython from the Python variable).
!basic-pitch \
/content/00_tmp \
/content/NEUTRINO/output/singingai_world.wav \
--midi-tempo $song_tempo

# Edit the MIDI file
filepath = "/content/00_tmp/singingai_world_basic_pitch.mid"
midi = mido.MidiFile(filepath)

def dump_track(track_obj):
    """Set ``program`` to 80 on every message of the track that has one.

    Messages without a ``program`` attribute are left untouched. The track is
    modified in place and nothing is returned.
    """
    programmable = (message for message in track_obj if hasattr(message, "program"))
    for message in programmable:
        message.program = 80

def dump_smf(midi_obj):
    """Apply ``dump_track`` to every track of ``midi_obj`` (in place)."""
    for each_track in midi_obj.tracks:
        dump_track(each_track)

# Set the program of the transcribed vocal and save it as the GETMusic input file.
dump_smf(midi)
midi.save("/content/01_input/new_basic_pitch_transcription.mid")

# Run GETMusic
%cd /content/muzic/getmusic

!python /content/muzic/getmusic/track_generation.py \
--load_path /content/checkpoint.pth \
--file_path /content/01_input \
--output_midi /content/02_output_midi \
--select_condition_tracks l \
--select_content_tracks bcdglps

# Load the generated file; with several matches, the last one returned by glob stays in midi/filepath.
for file in glob.glob('/content/02_output_midi/l2*.mid'):
  filepath = file
  midi = mido.MidiFile(filepath)

def dump_track_t(track_obj):
    """Overwrite ``tempo`` on every message of the track that has one.

    The new value is ``mido.bpm2tempo(song_tempo)``, with ``song_tempo`` read
    from the notebook globals. The track is modified in place.
    """
    for tempo_message in filter(lambda message: hasattr(message, "tempo"), track_obj):
        tempo_message.tempo = mido.bpm2tempo(song_tempo)

def dump_smf_t(midi_obj):
    """Apply ``dump_track_t`` to every track of ``midi_obj`` (in place)."""
    for each_track in midi_obj.tracks:
        dump_track_t(each_track)

# Write the song tempo into the loaded MIDI object and save it back to the path it was loaded from.
dump_smf_t(midi)
midi.save(filepath)

In [10]:
#@title 伴奏を抽出・再生
import mido
from mido import Message, MidiFile, MidiTrack
from mido.midifiles import tracks
from pydub import AudioSegment
from midi2audio import FluidSynth
import IPython.display
import glob
import math

# Load the GETMusic result. With several matches, the last one returned by
# glob is used; a missing result is reported instead of silently reusing a
# `midi` object left over from an earlier cell.
generated_paths = glob.glob('/content/02_output_midi/l2*.mid')
if not generated_paths:
    raise FileNotFoundError("伴奏のMIDIファイルが見つかりません: /content/02_output_midi/l2*.mid")
midi = mido.MidiFile(generated_paths[-1])

# Copy every track except index 5 into a new file
# (presumably index 5 is the lead-melody track that the sung vocal replaces - TODO confirm).
# The source resolution is passed on explicitly: a bare MidiFile() would fall
# back to its default ticks_per_beat, which changes the timing of the copied
# tracks when the source uses a different value.
mid = MidiFile(ticks_per_beat=midi.ticks_per_beat)
mid.tracks.extend(t for index, t in enumerate(midi.tracks) if index != 5)

MIDI_FILE = '/content/02_output_midi/acp_tracks.mid'
mid.save(MIDI_FILE)

# Render the accompaniment with FluidSynth.
fs = FluidSynth(sound_font="/content/MuseScore_General.sf2")
acp_mp3 = '/content/03_output_mp3/acp.mp3'
fs.midi_to_audio(MIDI_FILE, acp_mp3)

audio = AudioSegment.from_file(acp_mp3)
# Volume (%)
volume_percent = 400
# Convert the percentage to a dB gain, apply it and re-export the file.
audio = audio + (20 * math.log10(volume_percent/100))
audio.export(acp_mp3, format='mp3')

# Load the vocal (with effects) and the accompaniment.
vocal = AudioSegment.from_file('/content/NEUTRINO/output/singingai_world_efct.wav')
acp = AudioSegment.from_file(acp_mp3)
# Mix both from position 0 and write the song.
output = vocal.overlay(acp, position=0)
output.export('/content/03_output_mp3/song.mp3', format='mp3')

print("伴奏")
IPython.display.display(IPython.display.Audio(acp_mp3))
print("歌＋伴奏")
IPython.display.display(IPython.display.Audio('/content/03_output_mp3/song.mp3'))

伴奏


歌＋伴奏


# MusicGenによるトラックの生成

In [1]:
#@title 伴奏の編曲 ※所要時間：2分ほどの歌に対して約10分
#@markdown プロンプトは自由に入力できます
import os
import shutil
import glob
import math
import librosa
import torch
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write, audio_read
from audiocraft.data.audio_utils import convert_audio
from audiocraft.utils.notebook import display_audio
from mutagen.mp3 import MP3
from pydub import AudioSegment
import IPython.display

# The accompaniment rendered earlier is the melody reference; its length
# becomes the generation duration.
file = '/content/03_output_mp3/acp.mp3'
audio_time = MP3(file)
duration = audio_time.info.length

model = MusicGen.get_pretrained('facebook/musicgen-melody')
model.set_generation_params(duration=duration)

melody_waveform, sr = torchaudio.load(file)
# Text prompt from the Colab form (free input is allowed).
description = "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 125" #@param ["An 80s driving pop song with heavy drums and synth pads in the background","A cheerful country song with acoustic guitars", "90s rock song with electric guitar and heavy drums", "lofi slow bpm electro chill with organic samples"] {allow-input: true}
descriptions = [description]
# Add a leading dimension and repeat the waveform once per description.
melody_waveform = melody_waveform.unsqueeze(0).repeat(len(descriptions), 1, 1)
output = model.generate_with_chroma(
    descriptions,
    melody_wavs=melody_waveform,
    melody_sample_rate=sr,
    progress=True, return_tokens=True
)
print("編曲")
# NOTE(review): the sample rate is hard-coded here, while audio_write below uses model.sample_rate.
display_audio(output[0], sample_rate=32000)

# After this loop output_list holds the last item of output[0]
# (with a single description there is presumably only one item - confirm).
output_list = []
for tensor in output[0]:
  output_list = tensor

audio_write('/content/03_output_mp3/acp_musicgen', output_list.detach().cpu().float(), model.sample_rate,
            format="mp3", strategy="loudness")

audio_musicgen = "/content/03_output_mp3/acp_musicgen.mp3"
audio = AudioSegment.from_file(audio_musicgen)
# Volume (%)
volume_percent = 70

# Convert the percentage to a dB gain, apply it and overwrite the file.
re_audio = audio + (20 * math.log10(volume_percent/100))
re_audio.export(audio_musicgen, format='mp3')

# Load the vocal and the arranged accompaniment.
vocal = AudioSegment.from_file('/content/NEUTRINO/output/singingai_world_efct.wav')
acp_musicgen = AudioSegment.from_file(audio_musicgen)

# Mix both from position 0 and write the final song.
output = vocal.overlay(acp_musicgen, position=0)
output.export('/content/03_output_mp3/song_musicgen.mp3', format='mp3')
print("歌＋編曲")
IPython.display.Audio('/content/03_output_mp3/song_musicgen.mp3')

Downloading state_dict.bin:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to /root/.cache/torch/hub/checkpoints/955717e8-8726e21a.th
100%|██████████| 80.2M/80.2M [00:00<00:00, 108MB/s]


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)ssion_state_dict.bin:   0%|          | 0.00/236M [00:00<?, ?B/s]

編曲


CLIPPING /content/03_output_mp3/acp_musicgen happening with proba (a bit of clipping is okay): 7.130456651793793e-05 maximum scale:  1.2456315755844116


歌＋編曲
