# Notes about API

* I feel every function in the referance API should be used at least once in the How-to guides.

In [9]:
from vespa.package import Document, Field

document = Document(
    fields=[
        Field(name = "id", type = "string", indexing = ["attribute", "summary"]),
        Field(name = "title", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "body", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "lyrics_length", type = "int", indexing = ["attribute", "summary"])
    ]
)

"""
# Questions 

    # What is indexing?, what options are available? 
    # What is index? 
    # What is "enable-bm25" and are there any other alternatives? 
        - need to have enable-bm25 to be able to use bm25 later
    # Does it have to be named "title" and "body"?
    # Difference between "attributte" and "index"?
    # What happends when i write "summary"
"""


'\n# Questions \n\n    # What is indexing?, what options are available? \n    # What is index? \n    # What is "enable-bm25" and are there any other alternatives? \n        - need to have enable-bm25 to be able to use bm25 later\n    # Does it have to be named "title" and "body"?\n'

In [10]:
document

Document([Field('id', 'string', ['attribute', 'summary'], None), Field('title', 'string', ['index', 'summary'], 'enable-bm25'), Field('body', 'string', ['index', 'summary'], 'enable-bm25'), Field('lyrics_length', 'int', ['attribute', 'summary'], None)])

nativeRank can be found here: https://docs.vespa.ai/documentation/reference/nativerank.html
bm25 can be found here: https://docs.vespa.ai/documentation/reference/bm25.html



In [11]:
from vespa.package import Schema, FieldSet, RankProfile

lyrics_schema = Schema(
    name = "lyrics",
    document = document,
    fieldsets = [FieldSet(name = "default", fields = ["title", "body", "lyrics_length"])],
    
    # Here we add the rankingprofiles, they can also be added later
    rank_profiles = [RankProfile(name = "default", first_phase = "nativeRank(title, body)"),
                    RankProfile(name = "bm25", inherits = "default", first_phase = "bm25(title)+bm25(body)")]
)

"""
# Questions 
    # Can we add fields later?
    # Can we have a second_phase?

"""


In [12]:
from vespa.package import ApplicationPackage

app_package = ApplicationPackage(name = "lyrics", schema=lyrics_schema)



In [13]:
app_package

ApplicationPackage('lyrics', Schema('lyrics', Document([Field('id', 'string', ['attribute', 'summary'], None), Field('title', 'string', ['index', 'summary'], 'enable-bm25'), Field('body', 'string', ['index', 'summary'], 'enable-bm25'), Field('lyrics_length', 'int', ['attribute', 'summary'], None)]), [FieldSet('default', ['title', 'body', 'lyrics_length'])], [RankProfile('default', 'nativeRank(title, body)', None), RankProfile('bm25', 'bm25(title)+bm25(body)', 'default')]))

In [14]:
from vespa.package import VespaCloud

            #C:\Users\User\OneDrive - NTNU\NTNU\Prosjekt oppgave NLP
path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"
file = "andre.olaisen.tmartins-ntnu.pem"


# App name in Cloud
app_name = "andre-lyrics"


vespa_cloud = VespaCloud(
    tenant="tmartins-ntnu",
    application=app_name,
    key_location=path_key + file,
    application_package=app_package
)

In [43]:
name = "lyrics_application"
path = path_key + name


app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)



CannotSendRequest: Request-sent

In [16]:
import pandas as pd

# import data 
dataset_path = "C:/Users/User/OneDrive - NTNU/NTNU/Prosjekt oppgave NLP/dataset/Song_lyrics/"

"""
Columns
    id: int
    singer_name: string
    name (album name): string
    type: string  (EP, album, compilation, ...)
    year: int  (release year)
"""
album_details_path = dataset_path + "album_details.csv"
album_details = pd.read_csv(album_details_path)

"""
Columns
    link: string
    singer_name: string
    artist: string
    song_name: string  
    lyrics: string  
"""
song_lyrics_path = dataset_path + "lyrics.csv"
song_lyrics = pd.read_csv(song_lyrics_path)

song_details_path = dataset_path + "songs_details.csv"
song_details = pd.read_csv(song_details_path)



In [17]:
album_details.rename(columns={"Unnamed: 0": "id"}, inplace=True)

print(album_details.shape)

for  index, row in album_details.iterrows():
    artist = row["singer_name"].split(" ")
    if artist[-1] == "Lyrics":
        album_details["singer_name"][index] = ' '.join(artist[:-1])

album_details.head()



(1819, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,id.1,singer_name,name,type,year
0,0,5765.0,Taylor Swift Lyrics,Taylor Swift,album,2006
1,1,6432.0,Taylor Swift Lyrics,Sounds Of The Season: The Taylor Swift Holiday...,EP,2007
2,2,6995.0,Taylor Swift Lyrics,Fearless,album,2008
3,3,10358.0,Taylor Swift Lyrics,Speak Now,album,2010
4,4,24353.0,Taylor Swift Lyrics,Red,album,2012


In [18]:
# Rename column to id
song_lyrics.rename(columns={"Unnamed: 0": "id"}, inplace=True)
song_lyrics.head()

for  index, row in song_lyrics.iterrows():
    artist = row["artist"].split(" ")
    if artist[-1] == "Lyrics":
        song_lyrics["artist"][index] = ' '.join(artist[:-1])

song_lyrics.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,link,artist,song_name,lyrics
0,0,../lyrics/backstreetboys/climbingthewalls.html,Backstreet Boys,Climbing The Walls,"\n\n[Brian:]\nClose your eyes, make a wish\nTh..."
1,1,../lyrics/westlife/howtobreakaheart.html,Westlife,How To Break A Heart,\n\r\nSince you're not worth my love\nI haven'...
2,2,../lyrics/deanmartin/iwill.html,Dean Martin,I Will,\n\r\nI don't want to be the one to say I'm go...
3,3,../lyrics/deanmartin/tellyourmotherhello.html,Dean Martin,Tell Your Mother Hello,\n\r\nSure I loved the dear money that shines ...
4,4,../lyrics/deanmartin/behonestwithme.html,Dean Martin,Be Honest With Me,\n\r\nBe honest with me dear whatever you do\n...


In [19]:
song_details.rename(columns={"Unnamed: 0": "id"}, inplace=True)

for  index, row in song_details.iterrows():
    artist = row["singer_name"].split(" ")
    if artist[-1] == "Lyrics":
        song_details["singer_name"][index] = ' '.join(artist[:-1])

song_details.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,song_id,singer_name,song_name,song_href
0,0,1,Taylor Swift,Tim McGraw,../lyrics/taylorswift/timmcgraw.html
1,1,2,Taylor Swift,Picture To Burn,../lyrics/taylorswift/picturetoburn.html
2,2,3,Taylor Swift,Teardrops On My Guitar,../lyrics/taylorswift/teardropsonmyguitar.html
3,3,4,Taylor Swift,A Place In This World,../lyrics/taylorswift/aplaceinthisworld.html
4,4,5,Taylor Swift,Cold As You,../lyrics/taylorswift/coldasyou.html


In [20]:
song_lyrics["lyrics"][111]

"\n\r\nGood morning life\nGood morning sun how are your skies above\nGee it's great to be alive and in love\n\nGood morning life\nGood morning birds sing out your happy tunes\nFeels so good because I'll be seeing her soon\n\nLast night she said she loved me\nWhat a pity to part\nI slept with both eyes open waiting for today to start\n\nGood morning life\nGood morning world how are you happiness\nAll at once I know what livin' can be\nIt's life, it's free, it's someone waiting for me\nWho'll someday be my wife, good morning life\n\nGood morning life\nGood morning life\n\nLast night she said she loved me\nWhat a pity to part\nI slept with both eyes open waiting for today to start\n\nGood morning life\nGood morning world how are you happiness\nAll at once I know what livin' can be\nAh it's life, it's free, it's someone waiting for me\nWho'll someday be my wife, good morning life\n"

In [21]:
#Create a sample of the lyrics


lyrics_sample = song_lyrics.sample(n=10000, replace=False, random_state = 10)
lyrics_sample.head()


Unnamed: 0,id,link,artist,song_name,lyrics
17288,17288,../lyrics/onerepublic/liftmeup.html,OneRepublic,Lift Me Up,\n\r\nIf I told you I was down I was down woul...
10192,10192,../lyrics/snoopdogg/theoneandonly.html,Snoop Dogg,The One And Only,"\n\r\nAw yeah, coming to you live and direct f..."
4559,4559,../lyrics/bobmarley/somuchtroubleintheworld.html,Bob Marley,So Much Trouble In The World,\n\r\nSo much trouble in the world\nSo much tr...
11679,11679,../lyrics/christinaaguilera/atlast.html,Christina Aguilera,At Last,\n\r\nAt last\nMy love has come along\nMy lone...
17241,17241,../lyrics/onerepublic/mercy.html,OneRepublic,Mercy,\n\r\nAngel of mercy\nHow did you find me?\nWh...


In [22]:
lyrics_sample[1000:1001]

Unnamed: 0,id,link,artist,song_name,lyrics
8398,8398,../lyrics/rihanna/loveeeeeeesong.html,Rihanna,Loveeeeeee Song,\n\r\nAin't nothin' wrong with it\n\nI don't w...


In [23]:

%%time

## You do not have to feed data to the app every time 

i = 0
for idx, row in lyrics_sample[0:20].iterrows():
    if i %100 == 0:
        print(i)
    response = app.feed_data_point(
        schema = "lyrics",
        data_id = str(row["id"]),
            fields = {
                "id": str(row["id"]),
                "title": str(row["song_name"]),
                "body": str(row["lyrics"]),
                "lyrics_length": len(row["lyrics"])
            }
    )
    i += 1
    

# how does one clear the data?


0
Wall time: 11 s


In [24]:
lyrics_sample["lyrics"][4559]

"\n\r\nSo much trouble in the world\nSo much trouble in the world\n\nBless my eyes this morning\nJah sun is on the rise once again\nThe way earthly thin's are goin'\nAnything can happen.\n\nYou see men sailing on their ego trip,\nBlast off on their spaceship,\nMillion miles from reality:\nNo care for you, no care for me.\n\nSo much trouble in the world;\nSo much trouble in the world.\nAll you got to do: give a little (give a little),\nGive a little (give a little), give a little (give a little)!\nOne more time, ye-ah! (give a little) Ye-ah! (give a little)\nYe-ah! (give a little) Yeah!\n\nSo you think you've found the solution,\nBut it's just another illusion!\n(So before you check out this tide),\nDon't leave another cornerstone\nStanding there behind, eh-eh-eh-eh!\nWe've got to face the day;\n(Ooh) Ooh-wee, come what may:\nWe the street people talkin',\nYeah, we the people strugglin'.\n\nNow they sitting on a time bomb; (Bomb-bomb-bomb! Bomb-bomb-bomb!)\nNow I know the time has come:

In [34]:
from vespa.query import Query, MatchFilter, OR, AND, WeakAnd, ANN, , RankProfile as Ranking


results = app.query(
    query="So much trouble in the world",
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)


results._vespa_result
results.number_documents_retrieved



4847

In [None]:
??result

In [None]:
print(len(results.hits))

for result in results.hits:
    print(result['fields']['title'])

In [None]:
# add_rank_profile: What does inherits mean? Why is this needed?
# Not able to make bm25 work

app_package.schema.add_rank_profile(
    RankProfile(name = "bm25", inherits = "default", first_phase = "bm25(title)+bm25(body)")
)


In [None]:
# After adding a new RankingProfile the app has to be redeployed

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)


In [None]:
app_package.schema

In [26]:
### Testing different matching phases

query_text = "Ain't nothin' wrong with"

results1 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=WeakAnd(hits = 5),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

results2 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

results3 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=AND(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)




In [27]:
print(results1.number_documents_retrieved)
print(results2.number_documents_retrieved)
print(results3.number_documents_retrieved)

127
3126
13


In [28]:
# Size of the corpus?
print(results1.number_documents_indexed)
print(results2.number_documents_indexed)
print(results3.number_documents_indexed)

4999
4999
4999


In [32]:
query_text = "The One And Only"

results_or_default = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 5
)

results_or_bm25 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 5
)

print(results_or_default.number_documents_retrieved)
print(results_or_bm25.number_documents_retrieved)

4853
4853


In [33]:
print(len(results.hits))
print(query_text)
print("\n")

print("Results: or , deault")
for result in results_or_default.hits:
    print(result['fields']['title'])
    print(result['fields'])
    print(result["relevance"])
    
print("\n")
    
print("Results: OR , bm25(title) + bm25(body)")
for result in results_or_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])



10
The One And Only


Results: or , deault
The One And Only
{'sddocname': 'lyrics', 'documentid': 'id:lyrics:lyrics::10192', 'id': '10192', 'title': 'The One And Only', 'body': "\n\r\nAw yeah, coming to you live and direct from the LBC\r\nWe have the one and only, Snoop D-O double G\r\nYeah, yeah, drop it\n\n[scratching]\r\nIt's the one and only D-O double G\r\nBig Snoop Dogg, it's the one and only\r\nThe one and only, D-O double-double-double G\n\n[Snoop Dogg]\r\nBig Snoop Dogg\r\nYou in the presence of a motherfucking rap star\r\nI push up laid back in a black car\r\nThough I bossed up, it ain't hard to tell that\r\nI came up hard as hell, check it out\r\nI stayed sharp and played my part\r\nAll I had was a mic, a dream and some heart\r\nMe and my moms wasn't getting along at this time\r\nAnd since pops was gone, I'm out grinding\r\nCatch a nigga praying, swearing I ain't going back to jail\r\nJudge about tired of a playa, I don't know about this bidness shit\r\nBut I'm good with thi

In [None]:
results1.hits[0]["relevance"] 

In [None]:
bm25_ranking = Query(
    match_phase=OR(),
    rank_profile=Ranking(name="bm25")
)


In [None]:
def long_text(text):
    return(len(str(text)))
    

app_package.schema.add_rank_profile(
    RankProfile(name = "long", inherits = "default", first_phase = sqrt(lyrics_length))
)



In [None]:
# After adding a new RankingProfile the app has to be redeployed

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)


In [None]:
query_text = "led zeppelin"

results_or_long = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="long")
    ),
    hits = 5
)

print(results_or_long.number_documents_retrieved)
print(results_or_long.number_documents_indexed)

In [45]:
print("Results: OR , long(body) ")
for result in results_or_long.hits:
    print(result['fields']['title'])
    print(result["relevance"])

Results: OR , long(body) 


NameError: name 'results_or_long' is not defined

In [40]:
# How do i use ANN?
# can read about it here: https://docs.vespa.ai/documentation/tutorials/text-search-semantic.html

results = app.query(
    query=query_text,
    query_model = Query(
        match_phase=ANN(query_vector = query_text,
                        embedding_model = "bert",
                        doc_vector = "body",
                       hits = 10,
                       label = ),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

TypeError: __init__() missing 1 required positional argument: 'label'