<a href="https://colab.research.google.com/github/programminghistorian/jekyll/blob/Issue-3052/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Corpus Analysis with spaCy


### Installing, Importing and Preprocessing

In [1]:
!pip install spaCy
!pip install plotly
%pip install nbformat --upgrade


Note: you may need to restart the kernel to use updated packages.


In [2]:
import spacy
!spacy download en_core_web_sm

import os
from spacy import displacy


import pandas as pd
pd.options.mode.chained_assignment = None  


import plotly.graph_objects as go
import plotly.express as px

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [54]:
texts = []
file_names = []

for _file_name in os.listdir('rap_lyrics'):
    if _file_name.endswith('.txt'):   
        texts.append(open('rap_lyrics' + '/' + _file_name, 'r', encoding='latin-1').read())        
        file_names.append(_file_name)

In [55]:
d = {'Filename':file_names,'Text':texts}

In [56]:
lyrics_df = pd.DataFrame(d)

In [57]:
lyrics_df.head()

Unnamed: 0,Filename,Text
0,Talib Kweli_lyrics.txt,\nWe sell crack to our own out the back of our...
1,CunninLynguists_lyrics.txt,\nLove ain't for the faint of heart\nStart tra...
2,Kanye West_lyrics.txt,"\nWell, it is a weepin' and a moanin' and a gn..."
3,Deniro Farrar_lyrics.txt,\nÂ­\n\nLet me give you a little inside inform...
4,Eminem_lyrics.txt,"\n""Look, I was gonna go easy on you, and not t..."


In [58]:
lyrics_df['Text'] = lyrics_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
lyrics_df.head()

Unnamed: 0,Filename,Text
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas..."
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ..."


In [59]:
metadata_df = pd.read_csv('metadata.csv')
metadata_df.head()

Unnamed: 0,Artist,File
0,Talib Kweli,Talib Kweli_lyrics.txt
1,CunninLynguists,CunninLynguists_lyrics.txt
2,Kanye West,Kanye West_lyrics.txt
3,Deniro Farrar,Deniro Farrar_lyrics.txt
4,Eminem,Eminem_lyrics.txt


In [60]:
lyrics_df['Filename'] = lyrics_df['Filename'].str.replace('.txt', '', regex=True)

metadata_df.rename(columns={"lyrics ID": "Filename"}, inplace=True)

In [62]:
print(metadata_df.columns)
print(lyrics_df.columns)

Index(['Artist', 'File'], dtype='object')
Index(['Filename', 'Text'], dtype='object')


In [63]:
merged_df = metadata_df.merge(lyrics_df, left_on='File', right_on='Filename')

In [64]:
lyrics_df.head()

Unnamed: 0,Filename,Text
0,Talib Kweli_lyrics,We sell crack to our own out the back of our h...
1,CunninLynguists_lyrics,Love ain't for the faint of heart Start traini...
2,Kanye West_lyrics,"Well, it is a weepin' and a moanin' and a gnas..."
3,Deniro Farrar_lyrics,Â­ Let me give you a little inside information...
4,Eminem_lyrics,"""Look, I was gonna go easy on you, and not to ..."


## Text Enrichment with spaCy

### Text Reduction

In [66]:
nlp = spacy.load('en_core_web_sm')

def get_token(lyrics_text):
    
    doc = nlp(lyrics_text)
    return [token.text for token in doc]

lyrics_df['Tokens'] = lyrics_df['Text'].apply(get_token)

lyrics_df.head()


Unnamed: 0,Filename,Text,Tokens
0,Talib Kweli_lyrics,We sell crack to our own out the back of our h...,"[We, sell, crack, to, our, own, out, the, back..."
1,CunninLynguists_lyrics,Love ain't for the faint of heart Start traini...,"[Love, ai, n't, for, the, faint, of, heart, St..."
2,Kanye West_lyrics,"Well, it is a weepin' and a moanin' and a gnas...","[Well, ,, it, is, a, weepin, ', and, a, moanin..."
3,Deniro Farrar_lyrics,Â­ Let me give you a little inside information...,"[Â­, Let, me, give, you, a, little, inside, in..."
4,Eminem_lyrics,"""Look, I was gonna go easy on you, and not to ...","["", Look, ,, I, was, gon, na, go, easy, on, yo..."


In [67]:
tokens = lyrics_df[['Text', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Text,Tokens
0,We sell crack to our own out the back of our h...,"[We, sell, crack, to, our, own, out, the, back..."
1,Love ain't for the faint of heart Start traini...,"[Love, ai, n't, for, the, faint, of, heart, St..."
2,"Well, it is a weepin' and a moanin' and a gnas...","[Well, ,, it, is, a, weepin, ', and, a, moanin..."
3,Â­ Let me give you a little inside information...,"[Â­, Let, me, give, you, a, little, inside, in..."
4,"""Look, I was gonna go easy on you, and not to ...","["", Look, ,, I, was, gon, na, go, easy, on, yo..."


#### Lemmatization



In [19]:
def get_lemma(tokens):
    lemmas = [token.lemma_ for token in nlp(' '.join(tokens))]
    return lemmas

lyrics_df['Lemmas'] = lyrics_df['Tokens'].apply(get_lemma)

print(f'"money" appears in the text tokens column ' + str(lyrics_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')
print(f'"money" appears in the lemmas column ' + str(lyrics_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')


"money" appears in the text tokens column 242 times.
"money" appears in the lemmas column 587 times.


### Text Annotation

In [98]:
lyrics_df = pd.DataFrame(d)

lyrics_df['Text'] = lyrics_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()


def process_batch(texts):    
    results = []

    for text in texts:        
        doc = nlp(text)                
        pos_tags = [(token.pos_, token.tag_) for token in doc]
               
        results.append(pos_tags)
    
    return results


batch_size = 50

num_batches = len(lyrics_df) // batch_size + 1
final_results = []


for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(lyrics_df))
        
    current_batch = lyrics_df['Text'].iloc[start_idx:end_idx]

    batch_results = process_batch(current_batch)
    

    final_results.extend(batch_results)


lyrics_df['POS'] = final_results

lyrics_df.head()

Unnamed: 0,Filename,Text,POS
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...,"[(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I..."
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...,"[(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN..."
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas...","[(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB..."
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...,"[(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V..."
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ...","[(PUNCT, ``), (VERB, VB), (PUNCT, ,), (PRON, P..."


#### Named Entity Recognition



In [102]:
labels = nlp.get_pipe("ner").labels


for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [103]:
import spacy
import pandas as pd


nlp = spacy.load('en_core_web_sm')


def extract_named_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

lyrics_df['Named_Entities'] = lyrics_df['Text'].apply(extract_named_entities)

lyrics_df.head()

Unnamed: 0,Filename,Text,POS,Sentiment_Analysis,Named_Entities
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...,"[(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I...",0.080774,"[(the Clones Work ', ORG), (Norman Mailer Mi, ..."
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...,"[(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN...",0.040427,"[(Visits, PRODUCT), (Love, WORK_OF_ART), (Love..."
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas...","[(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB...",0.047397,"[(Believe, ORG), (two, CARDINAL), (Lambo, PERS..."
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...,"[(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V...",-0.029976,"[(Nigga, PERSON), (36, CARDINAL), (OG, ORG), (..."
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ...","[(PUNCT, ``), (VERB, VB), (PUNCT, ,), (PRON, P...",-0.03367,"[(one, CARDINAL), (Six minutes, TIME), (Six mi..."


In [104]:
def extract_identified_entities(text):
    doc = nlp(text)
        
    identified_entities = [token.text for token in doc if token.ent_type_]
    
    return identified_entities

lyrics_df['Identified_Entities'] = lyrics_df['Text'].apply(extract_identified_entities)


lyrics_df.head()

Unnamed: 0,Filename,Text,POS,Sentiment_Analysis,Named_Entities,Identified_Entities
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...,"[(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I...",0.080774,"[(the Clones Work ', ORG), (Norman Mailer Mi, ...","[the, Clones, Work, ', Norman, Mailer, Mi, thr..."
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...,"[(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN...",0.040427,"[(Visits, PRODUCT), (Love, WORK_OF_ART), (Love...","[Visits, Love, Love'll, Brain, Studderin, four..."
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas...","[(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB...",0.047397,"[(Believe, ORG), (two, CARDINAL), (Lambo, PERS...","[Believe, two, Lambo, two, Lambo, Lamborghini,..."
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...,"[(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V...",-0.029976,"[(Nigga, PERSON), (36, CARDINAL), (OG, ORG), (...","[Nigga, 36, OG, Nigga, 16, Nigga, Denzel, Univ..."
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ...","[(PUNCT, ``), (VERB, VB), (PUNCT, ,), (PRON, P...",-0.03367,"[(one, CARDINAL), (Six minutes, TIME), (Six mi...","[one, Six, minutes, Six, minutes, Slim, Shady,..."


In [105]:
from textblob import TextBlob


def perform_sentiment_analysis(text):
    
    blob = TextBlob(text)
    
    
    sentiment_polarity = blob.sentiment.polarity
    
    return sentiment_polarity


lyrics_df['Sentiment_Analysis'] = lyrics_df['Text'].apply(perform_sentiment_analysis)

lyrics_df.head()


Unnamed: 0,Filename,Text,POS,Sentiment_Analysis,Named_Entities,Identified_Entities
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...,"[(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I...",0.080774,"[(the Clones Work ', ORG), (Norman Mailer Mi, ...","[the, Clones, Work, ', Norman, Mailer, Mi, thr..."
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...,"[(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN...",0.040427,"[(Visits, PRODUCT), (Love, WORK_OF_ART), (Love...","[Visits, Love, Love'll, Brain, Studderin, four..."
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas...","[(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB...",0.047397,"[(Believe, ORG), (two, CARDINAL), (Lambo, PERS...","[Believe, two, Lambo, two, Lambo, Lamborghini,..."
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...,"[(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V...",-0.029976,"[(Nigga, PERSON), (36, CARDINAL), (OG, ORG), (...","[Nigga, 36, OG, Nigga, 16, Nigga, Denzel, Univ..."
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ...","[(PUNCT, ``), (VERB, VB), (PUNCT, ,), (PRON, P...",-0.03367,"[(one, CARDINAL), (Six minutes, TIME), (Six mi...","[one, Six, minutes, Six, minutes, Slim, Shady,..."


In [107]:
from sklearn.feature_extraction.text import CountVectorizer


def most_frequent_words_analysis(text):
    
    vectorizer = CountVectorizer()
    
    
    X = vectorizer.fit_transform([text])
    
  
    feature_names = vectorizer.get_feature_names_out()
    
    word_counts = X.toarray().flatten()
    
    word_counts_dict = dict(zip(feature_names, word_counts))
    
    sorted_word_counts = sorted(word_counts_dict.items(), key=lambda x: x[1], reverse=True)
    
    most_frequent_words = [word for word, count in sorted_word_counts[:10]]  # Adjust the number as needed
    
    return most_frequent_words

lyrics_df['Most_Frequent_Words'] = lyrics_df['Text'].apply(most_frequent_words_analysis)

lyrics_df.head()


Unnamed: 0,Filename,Text,POS,Sentiment_Analysis,Named_Entities,Identified_Entities,Most_Frequent_Words
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...,"[(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I...",0.080774,"[(the Clones Work ', ORG), (Norman Mailer Mi, ...","[the, Clones, Work, ', Norman, Mailer, Mi, thr...","[the, you, to, it, and, of, in, we, like, is]"
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...,"[(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN...",0.040427,"[(Visits, PRODUCT), (Love, WORK_OF_ART), (Love...","[Visits, Love, Love'll, Brain, Studderin, four...","[the, to, and, you, in, my, it, of, that, with]"
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas...","[(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB...",0.047397,"[(Believe, ORG), (two, CARDINAL), (Lambo, PERS...","[Believe, two, Lambo, two, Lambo, Lamborghini,...","[the, you, and, to, it, that, my, me, in, all]"
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...,"[(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V...",-0.029976,"[(Nigga, PERSON), (36, CARDINAL), (OG, ORG), (...","[Nigga, 36, OG, Nigga, 16, Nigga, Denzel, Univ...","[the, my, to, and, you, me, nigga, it, in, that]"
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ...","[(PUNCT, ``), (VERB, VB), (PUNCT, ,), (PRON, P...",-0.03367,"[(one, CARDINAL), (Six minutes, TIME), (Six mi...","[one, Six, minutes, Six, minutes, Slim, Shady,...","[you, the, to, and, it, my, me, in, that, of]"


In [113]:
lyrics_df['Lyrics_Length'] = lyrics_df['Text'].apply(len)

lyrics_df.head()


Unnamed: 0,Filename,Text,POS,Sentiment_Analysis,Named_Entities,Identified_Entities,Most_Frequent_Words,Lyrics_Length
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...,"[(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I...",0.080774,"[(the Clones Work ', ORG), (Norman Mailer Mi, ...","[the, Clones, Work, ', Norman, Mailer, Mi, thr...","[the, you, to, it, and, of, in, we, like, is]",194625
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...,"[(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN...",0.040427,"[(Visits, PRODUCT), (Love, WORK_OF_ART), (Love...","[Visits, Love, Love'll, Brain, Studderin, four...","[the, to, and, you, in, my, it, of, that, with]",156635
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas...","[(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB...",0.047397,"[(Believe, ORG), (two, CARDINAL), (Lambo, PERS...","[Believe, two, Lambo, two, Lambo, Lamborghini,...","[the, you, and, to, it, that, my, me, in, all]",183625
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...,"[(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V...",-0.029976,"[(Nigga, PERSON), (36, CARDINAL), (OG, ORG), (...","[Nigga, 36, OG, Nigga, 16, Nigga, Denzel, Univ...","[the, my, to, and, you, me, nigga, it, in, that]",151715
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ...","[(PUNCT, ``), (VERB, VB), (PUNCT, ,), (PRON, P...",-0.03367,"[(one, CARDINAL), (Six minutes, TIME), (Six mi...","[one, Six, minutes, Six, minutes, Slim, Shady,...","[you, the, to, and, it, my, me, in, that, of]",290335


In [122]:
import pandas as pd

metadata_df = pd.read_csv('metadata.csv')

new_column_data = metadata_df['Artist']

lyrics_df.insert(1, 'Artist', new_column_data)

In [126]:
print(lyrics_df.head())


                     Filename           Artist  \
0      Talib Kweli_lyrics.txt      Talib Kweli   
1  CunninLynguists_lyrics.txt  CunninLynguists   
2       Kanye West_lyrics.txt       Kanye West   
3    Deniro Farrar_lyrics.txt    Deniro Farrar   
4           Eminem_lyrics.txt           Eminem   

                                                Text  \
0  We sell crack to our own out the back of our h...   
1  Love ain't for the faint of heart Start traini...   
2  Well, it is a weepin' and a moanin' and a gnas...   
3  Â­ Let me give you a little inside information...   
4  "Look, I was gonna go easy on you, and not to ...   

                                                 POS  Sentiment_Analysis  \
0  [(PRON, PRP), (VERB, VBP), (NOUN, NN), (ADP, I...            0.080774   
1  [(NOUN, NN), (VERB, VBP), (PART, RB), (ADP, IN...            0.040427   
2  [(INTJ, UH), (PUNCT, ,), (PRON, PRP), (AUX, VB...            0.047397   
3  [(INTJ, UH), (VERB, VB), (PRON, PRP), (VERB, V...      

### Download Enriched Dataset



In [127]:
from IPython.display import FileLink

lyrics_df.to_csv('raplyrics_with_tags.csv', index=False)

FileLink(r'raplyrics_with_tags.csv')

