<a href="https://colab.research.google.com/github/programminghistorian/jekyll/blob/Issue-3052/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Corpus Analysis with spaCy


### Installing, Importing and Preprocessing

In [5]:
!pip install spaCy
!pip install plotly
%pip install nbformat --upgrade


Note: you may need to restart the kernel to use updated packages.


In [6]:
import spacy

# Install English language model
!spacy download en_core_web_sm

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Create empty lists for file names and contents
texts = []
file_names = []

# Iterate through each file in the folder
for _file_name in os.listdir('rap_lyrics'):
# Look for only text files
    if _file_name.endswith('.txt'):
    # Append contents of each text file to text list
        texts.append(open('rap_lyrics' + '/' + _file_name, 'r', encoding='latin-1').read())
        # Append name of each file to file name list
        file_names.append(_file_name)

In [8]:

# Create dictionary object associating each file name with its text
d = {'Filename':file_names,'Text':texts}

In [9]:
# Turn dictionary into a dataframe
lyrics_df = pd.DataFrame(d)

In [10]:
lyrics_df.head()

Unnamed: 0,Filename,Text
0,Talib Kweli_lyrics.txt,\nWe sell crack to our own out the back of our...
1,CunninLynguists_lyrics.txt,\nLove ain't for the faint of heart\nStart tra...
2,Kanye West_lyrics.txt,"\nWell, it is a weepin' and a moanin' and a gn..."
3,Deniro Farrar_lyrics.txt,\nÂ­\n\nLet me give you a little inside inform...
4,Eminem_lyrics.txt,"\n""Look, I was gonna go easy on you, and not t..."


The beginnings of some of the texts may contain extra spaces (indicated by \t or \n). These characters can be replaced by a single space using the str.replace() method.

In [11]:
# Remove extra spaces from papers
lyrics_df['Text'] = lyrics_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
lyrics_df.head()

Unnamed: 0,Filename,Text
0,Talib Kweli_lyrics.txt,We sell crack to our own out the back of our h...
1,CunninLynguists_lyrics.txt,Love ain't for the faint of heart Start traini...
2,Kanye West_lyrics.txt,"Well, it is a weepin' and a moanin' and a gnas..."
3,Deniro Farrar_lyrics.txt,Â­ Let me give you a little inside information...
4,Eminem_lyrics.txt,"""Look, I was gonna go easy on you, and not to ..."


In [12]:
# Load metadata.
metadata_df = pd.read_csv('metadata.csv')
metadata_df.head()

Unnamed: 0,Artist,File
0,Talib Kweli,Talib Kweli_lyrics.txt
1,CunninLynguists,CunninLynguists_lyrics.txt
2,Kanye West,Kanye West_lyrics.txt
3,Deniro Farrar,Deniro Farrar_lyrics.txt
4,Eminem,Eminem_lyrics.txt


In [13]:
# Remove .txt from title of each paper
lyrics_df['Filename'] = lyrics_df['Filename'].str.replace('.txt', '', regex=True)

# Rename column from paper ID to Title
metadata_df.rename(columns={"lyrics ID": "Filename"}, inplace=True)

In [14]:
# 输出 metadata_df 的列名
print(metadata_df.columns)

# 输出 lyrics_df 的列名
print(lyrics_df.columns)


Index(['Artist', 'File'], dtype='object')
Index(['Filename', 'Text'], dtype='object')


In [15]:
# Merge metadata and lyrics_df into new DataFrame
# Will only keep rows where both metadata and lyrics_df are present
merged_df = metadata_df.merge(lyrics_df, left_on='File', right_on='Filename')


Let's check the head of the DataFrame again to confirm everything has worked well. Check the first five rows to make sure each has a filename, title, discipline, paper type and text (the full paper)

In [16]:
# Print DataFrame
lyrics_df.head()

Unnamed: 0,Filename,Text
0,Talib Kweli_lyrics,We sell crack to our own out the back of our h...
1,CunninLynguists_lyrics,Love ain't for the faint of heart Start traini...
2,Kanye West_lyrics,"Well, it is a weepin' and a moanin' and a gnas..."
3,Deniro Farrar_lyrics,Â­ Let me give you a little inside information...
4,Eminem_lyrics,"""Look, I was gonna go easy on you, and not to ..."


The resulting DataFrame is now ready for analysis.

## Text Enrichment with spaCy

### Creating Doc Objects



In [17]:
# Load nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Check what functions it performs
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [18]:
#Define example sentence
sentence = "This is 'an' example? sentence"

# Call the nlp model on the sentence
doc = nlp(sentence)

In [19]:
# Loop through each token in doc object
for token in doc:
    # Print text and part of speech for each
    print(token.text, token.pos_)

This PRON
is AUX
' PUNCT
an DET
' PUNCT
example NOUN
? PUNCT
sentence NOUN


In [20]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

In [25]:
lyrics_df['Doc'] = lyrics_df['Text'][:100].apply(process_text)


In [26]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each student essay
lyrics_df['Doc'] = lyrics_df['Text'].apply(process_text)

In [27]:
print(lyrics_df)

                     Filename  \
0          Talib Kweli_lyrics   
1      CunninLynguists_lyrics   
2           Kanye West_lyrics   
3        Deniro Farrar_lyrics   
4               Eminem_lyrics   
5             Ice Cube_lyrics   
6       Kendrick Lamar_lyrics   
7                Logic_lyrics   
8               Tupac1_lyrics   
9               J Cole_lyrics   
10                  NF_lyrics   
11              Eazy-E_lyrics   
12         Joey Badass_lyrics   
13              Common_lyrics   
14            Scarface_lyrics   
15       Isaiah Rashad_lyrics   
16         Royce Da 59_lyrics   
17               Jay-z_lyrics   
18   Chance The Rapper_lyrics   
19  Immortal Technique_lyrics   
20         Lupe Fiasco_lyrics   
21               Big L_lyrics   
22      Montana of 300_lyrics   
23          ASAP Rocky_lyrics   
24                 Bas_lyrics   
25   The Notorious BIG_lyrics   
26     Earl Sweatshirt_lyrics   
27           Lil Wayne_lyrics   
28            ASAP Ant_lyrics   
29        

### Text Reduction

#### Tokenization

A critical first step spaCy performs is tokenization, or the segmentation of strings into individual words and punctuation markers. Tokenization enables spaCy to parse the grammatical structures of a text and identify characteristics of each word-like part-of-speech.

To retrieve a tokenized version of each text in the DataFrame, we’ll write a function that iterates through any given Doc object and returns all functions found within it.

In [28]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    return [(token.text) for token in doc]

In [29]:
# Run the token retrieval function on the doc objects in the dataframe
lyrics_df['Tokens'] = lyrics_df['Doc'].apply(get_token)
lyrics_df.head()

Unnamed: 0,Filename,Text,Doc,Tokens
0,Talib Kweli_lyrics,We sell crack to our own out the back of our h...,"(We, sell, crack, to, our, own, out, the, back...","[We, sell, crack, to, our, own, out, the, back..."
1,CunninLynguists_lyrics,Love ain't for the faint of heart Start traini...,"(Love, ai, n't, for, the, faint, of, heart, St...","[Love, ai, n't, for, the, faint, of, heart, St..."
2,Kanye West_lyrics,"Well, it is a weepin' and a moanin' and a gnas...","(Well, ,, it, is, a, weepin, ', and, a, moanin...","[Well, ,, it, is, a, weepin, ', and, a, moanin..."
3,Deniro Farrar_lyrics,Â­ Let me give you a little inside information...,"(Â­, Let, me, give, you, a, little, inside, in...","[Â­, Let, me, give, you, a, little, inside, in..."
4,Eminem_lyrics,"""Look, I was gonna go easy on you, and not to ...","("", Look, ,, I, was, gon, na, go, easy, on, yo...","["", Look, ,, I, was, gon, na, go, easy, on, yo..."


In [30]:
tokens = lyrics_df[['Text', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Text,Tokens
0,We sell crack to our own out the back of our h...,"[We, sell, crack, to, our, own, out, the, back..."
1,Love ain't for the faint of heart Start traini...,"[Love, ai, n't, for, the, faint, of, heart, St..."
2,"Well, it is a weepin' and a moanin' and a gnas...","[Well, ,, it, is, a, weepin, ', and, a, moanin..."
3,Â­ Let me give you a little inside information...,"[Â­, Let, me, give, you, a, little, inside, in..."
4,"""Look, I was gonna go easy on you, and not to ...","["", Look, ,, I, was, gon, na, go, easy, on, yo..."


#### Lemmatization



In [31]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
lyrics_df['Lemmas'] = lyrics_df['Doc'].apply(get_lemma)

Lemmatization can help reduce noise and refine results for researchers who are conducting keyword searches. For example, let’s compare counts of the word “write” in the original Tokens column and in the lemmatized Lemmas column.

In [32]:
print(f'"money" appears in the text tokens column ' + str(lyrics_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')
print(f'"money" appears in the lemmas column ' + str(lyrics_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')

"money" appears in the text tokens column 242 times.
"money" appears in the lemmas column 585 times.


### Text Annotation

In [33]:
# Define a function to retrieve lemmas from a doc object
def get_pos(doc):
    #Return the coarse- and fine-grained part of speech text for each token in the doc
    return [(token.pos_, token.tag_) for token in doc]

# Define a function to retrieve parts of speech from a doc object
lyrics_df['POS'] = lyrics_df['Doc'].apply(get_pos)

In [34]:
# Create a list of part of speech tags
list(lyrics_df['POS'])

[[('PRON', 'PRP'),
  ('VERB', 'VBP'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('ADJ', 'JJ'),
  ('ADP', 'RP'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NNS'),
  ('PRON', 'PRP'),
  ('VERB', 'VBP'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('PRON', 'PRP'),
  ('VERB', 'VBP'),
  ('ADP', 'IN'),
  ('NOUN', 'NNS'),
  ('ADV', 'RB'),
  ('PUNCT', ','),
  ('ADP', 'IN'),
  ('PROPN', 'NNP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('SCONJ', 'IN'),
  ('PRON', 'PRP'),
  ('VERB', 'VBP'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NN'),
  ('CCONJ', 'CC'),
  ('PRON', 'PRP'),
  ('VERB', 'VBP'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADV', 'RB'),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('ADP', 'RP'),
  ('

Fortunately, spaCy has a built-in function called explain that can provide a short description of any tag of interest. If we try it on the tag IN using spacy.explain("IN"), the output reads conjunction, subordinating or preposition.

In [35]:
spacy.explain("IN")

'conjunction, subordinating or preposition'

In some cases, you may want to get only a set of part-of-speech tags for further analysis, like all of the proper nouns. A function can be written to perform this task, extracting only words which have been fitted with the proper noun tag.

In [36]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    return [token.text for token in doc if token.pos_ == 'PROPN']

# Apply function to Doc column and store resulting proper nouns in new column
lyrics_df['Proper_Nouns'] = lyrics_df['Doc'].apply(extract_proper_nouns)

In [39]:
print(len(lyrics_df))


38


In [40]:
lyrics_df.iloc[:, lyrics_df.columns.get_loc('Proper_Nouns')]


0     [Attack, Clones, Work, ', Norman, Mailer, Mi, ...
1     [Cause, Love, Love'll, Brain, Studderin, Furth...
2     [Believe, Lamborghini, Mercy, I, Lambo, Lambor...
3     [God, Nigga, OG, niggas, Nigga, New, god, Nigg...
4     [Minutes, Slim, Shady, doc, Rap, God, Rap, God...
5     [yeahâit, Y'all, N.W.A, âIt, dopeââ, â...
6     [Waaaaay, Ayy, Finesse, countin, Parmesan, D'U...
7     [Woo, wanna, wanna, Woo, lemme, God, Woo, Pain...
8     [Happiness, tha, Walk, tha, tha, Niggas, Gette...
9     [Uncle, Phil, Prophecies, Ville, Martin, Luthe...
10    [Le, Lookin, Paranoia, Shoulda, itâmy, butâ...
11    [Gangsta, Dresta, O.G., west, coast, shit, E, ...
12    [Niggas, warâI'm, martian, Brooklyn, P, -, Y...
13    [Doo, doo, doo, diggy, doo, yo, Special, luh, ...
14    [Mmm, Pictures, hell, Hustlers, Sellin, Wishin...
15    [Kaio, Ken, LA, Pour, Bowl, niggas, Hill, clif...
16    [Ohh, dyin, ahh, Parker, Lewis, Kubiak, hyenas...
17    [warnin, warnin, Hang, warnin, warnin, Sip

#### Named Entity Recognition

spaCy can tag named entities in the text, such as names, dates, organizations, and locations. Call the full list of named entities and their descriptions using this code:

In [41]:
# Get all NE labels and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description
for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


We’ll create a function to extract the named entity tags from each Doc object and apply it to the Doc objects in the DataFrame, storing the named entities in a new column:

In [43]:
# Define function to extract named entities from doc objects
def extract_named_entities(doc):
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting named entities in new column
lyrics_df['Named_Entities'] = lyrics_df['Doc'].apply(extract_named_entities)
lyrics_df['Named_Entities']

0     [ORG, PERSON, CARDINAL, ORG, PERSON, PERSON, T...
1     [PRODUCT, WORK_OF_ART, PERSON, ORG, GPE, CARDI...
2     [ORG, CARDINAL, PERSON, CARDINAL, PERSON, PERS...
3     [PERSON, CARDINAL, ORG, PERSON, CARDINAL, PERS...
4     [CARDINAL, TIME, TIME, PERSON, PERSON, CARDINA...
5     [NORP, ORG, PERSON, GPE, ORG, GPE, WORK_OF_ART...
6     [PERSON, PERSON, PRODUCT, ORG, PERSON, PERSON,...
7     [DATE, DATE, ORDINAL, DATE, TIME, DATE, PERSON...
8     [ORG, PERSON, CARDINAL, DATE, ORG, DATE, ORG, ...
9     [ORDINAL, ORDINAL, PERSON, ORG, GPE, ORG, PERS...
10    [ORDINAL, PERSON, PERSON, ORG, PERSON, ORG, DA...
11    [PERSON, ORG, GPE, ORDINAL, PERSON, GPE, PERSO...
12    [GPE, PERSON, NORP, GPE, EVENT, PERSON, PERSON...
13    [PERSON, WORK_OF_ART, TIME, PERSON, ORG, DATE,...
14    [NORP, NORP, PERSON, PERSON, PERSON, PERSON, D...
15    [PERSON, PERSON, GPE, ORG, PERSON, ORG, PERSON...
16    [CARDINAL, CARDINAL, ORG, PERSON, ORG, GPE, PE...
17    [DATE, DATE, DATE, DATE, PERSON, WORK_OF_A

We can add another column with the words and phrases identified as named entities:



In [44]:
# Define function to extract text tagged with named entities from doc objects
def extract_named_entities(doc):
    return [ent for ent in doc.ents]

# Apply function to Doc column and store resulting text in new column
lyrics_df['NE_Words'] = lyrics_df['Doc'].apply(extract_named_entities)
lyrics_df['NE_Words']

0     [(the, Clones, Work, '), (Norman, Mailer, Mi),...
1     [(Visits), (Love), (Love'll), (Brain), (Studde...
2     [(Believe), (two), (Lambo), (two), (Lambo), (L...
3     [(Nigga), (36), (OG), (Nigga), (16), (Nigga), ...
4     [(one), (Six, minutes), (Six, minutes), (Slim,...
5     [(yeahâit), (N.W.A), (âIt), (dopeââ), ...
6     [(Waaaaay), (Finesse), (Iâm, countin, '), (K...
7     [(today), (today), (first), (the, day), (a, li...
8     [(Happiness), (Shot), (Nine), (daily), (Niggas...
9     [(First), (first), (Phil), (Prophecies), (Vill...
10    [(first), (Lookin), (Paranoia), (Shoulda), (bu...
11    [(Gangsta, Dresta), (O.G.), (west, coast), (fi...
12    [(Niggas), (warâI'm), (martian), (Brooklyn),...
13    [(Queen), (Love), (night), (Chi, night, 's), (...
14    [(Pictures), (Hustlers), (Sellin), (Livin), (N...
15    [(Tithes), (Kaio, Ken), (LA), (Pour), (Bowl), ...
16    [(one), (number, one), (caterpillar), (Ohh), (...
17    [(One, day), (The, next, day), (One, day),

Let’s visualize the words and their named entity tags in a single text. Call the first text’s Doc object and use displacy.render to visualize the text with the named entities highlighted and tagged:

In [46]:
# Extract the first Doc object
doc = lyrics_df['Doc'][1]

# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)

### Download Enriched Dataset

To save the dataset of doc objects, text reductions and linguistic annotations generated with spaCy, download the final_paper_df DataFrame to your local computer as a .csv file:

In [48]:
# Save DataFrame as csv (in Google Drive)
# Use this step only to save  csv to your computer's working directory
lyrics_df.to_csv('rap_lyrics_with_spaCy_tags.csv')

## Analysis of Linguistic Annotations




### Part of Speech Analysis

In this section, we’ll analyze the part-of-speech tags extracted by spaCy to answer the first research question: Do students use certain parts-of-speech more frequently in Biology texts versus English texts, and does this signify differences in disciplinary conventions?

spaCy counts the number of each part-of-speech tag that appears in each document (for example the number of times the NOUN tag appears in a document). This is called using doc.count_by(spacy.attrs.POS). Here’s how it works on a single sentence:

In [49]:
# Create doc object from single sentence
doc = nlp("This is 'an' example? sentence")

# Print counts of each part of speech in sentence
print(doc.count_by(spacy.attrs.POS))

{95: 1, 87: 1, 97: 3, 90: 1, 92: 2}


spaCy generates a dictionary where the values represent the counts of each part-of-speech term found in the text. The keys in the dictionary correspond to numerical indices associated with each part-of-speech tag. To make the dictionary more legible, let’s associate the numerical index values with their corresponding part of speech tags. In the example below, it’s now possible to see which parts-of-speech tags correspond to which counts:

In [50]:
# Store dictionary with indexes and POS counts in a variable
num_pos = doc.count_by(spacy.attrs.POS)

dictionary = {}

# Create a new dictionary which replaces the index of each part of speech for its label (NOUN, VERB, ADJECTIVE)
for k,v in sorted(num_pos.items()):
  dictionary[doc.vocab[k].text] = v

dictionary

{'AUX': 1, 'DET': 1, 'NOUN': 2, 'PRON': 1, 'PUNCT': 3}

To get the same type of dictionary for each text in a DataFrame, a function can be created to nest the above for loop. First, we’ll create a new DataFrame for the purposes of part-of speech analysis, containing the text filenames, disciplines, and Doc objects. We can then apply the function to each Doc object in the new DataFrame. In this case (and above), we are interested in the simpler, coarse-grained parts of speech.

In [53]:
# Create new DataFrame for analysis purposes
pos_analysis_df = lyrics_df[['Filename', 'Doc']]

# Create list to store each dictionary
num_list = []

# Define a function to get part-of-speech tags and counts and append them to a new dictionary
def get_pos_tags(doc):
    dictionary = {}
    num_pos = doc.count_by(spacy.attrs.POS)
    for k, v in sorted(num_pos.items()):
        dictionary[doc.vocab[k].text] = v
    return dictionary

# Apply function to each doc object in DataFrame and create a new column 'POS_Counts'
pos_analysis_df['POS_Counts'] = pos_analysis_df['Doc'].apply(get_pos_tags)

# Display the updated DataFrame
pos_analysis_df.head()



Unnamed: 0,Filename,Doc,POS_Counts
0,Talib Kweli_lyrics,"(We, sell, crack, to, our, own, out, the, back...","{'ADJ': 1999, 'ADP': 3829, 'ADV': 1660, 'AUX':..."
1,CunninLynguists_lyrics,"(Love, ai, n't, for, the, faint, of, heart, St...","{'ADJ': 1703, 'ADP': 3024, 'ADV': 1389, 'AUX':..."
2,Kanye West_lyrics,"(Well, ,, it, is, a, weepin, ', and, a, moanin...","{'ADJ': 1852, 'ADP': 3044, 'ADV': 2089, 'AUX':..."
3,Deniro Farrar_lyrics,"(Â­, Let, me, give, you, a, little, inside, in...","{'ADJ': 1525, 'ADP': 2860, 'ADV': 1422, 'AUX':..."
4,Eminem_lyrics,"("", Look, ,, I, was, gon, na, go, easy, on, yo...","{'ADJ': 2849, 'ADP': 4970, 'ADV': 3326, 'AUX':..."


From here, we’ll take the part-of-speech counts and put them into a new DataFrame where we can calculate the frequency of each part-of-speech per document. In the new DataFrame, if a paper does not contain a particular part-of-speech, the cell will read NaN (Not a Number).

In [57]:
print(pos_analysis_df.columns)


Index(['Filename', 'Doc', 'POS_Counts'], dtype='object')


In [59]:
print(lyrics_df.columns)


Index(['Filename', 'Text', 'Doc', 'Tokens', 'Lemmas', 'POS', 'Proper_Nouns',
       'Named_Entities', 'NE_Words'],
      dtype='object')


In [62]:
pip install textblob


Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.


In [63]:
import pandas as pd
from textblob import TextBlob  # 需要安装 textblob：pip install textblob

# 示例歌词数据
rap_lyrics_data = {
    'Filename': ['song1.txt', 'song2.txt', 'song3.txt'],
    'Text': ['Lyrics of song 1', 'Lyrics of song 2', 'Lyrics of song 3']
}

rap_lyrics_df = pd.DataFrame(rap_lyrics_data)

# 使用 TextBlob 获取情感得分
rap_lyrics_df['Sentiment'] = rap_lyrics_df['Text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# 根据情感得分添加 'Mood' 标签
rap_lyrics_df['Mood'] = pd.cut(rap_lyrics_df['Sentiment'], bins=3, labels=['Negative', 'Neutral', 'Positive'])

# 显示更新后的 DataFrame
rap_lyrics_df.head()




Unnamed: 0,Filename,Text,Sentiment,Mood
0,song1.txt,Lyrics of song 1,0.0,Neutral
1,song2.txt,Lyrics of song 2,0.0,Neutral
2,song3.txt,Lyrics of song 3,0.0,Neutral


In [65]:
print(merged_df.columns)


Index(['Artist', 'File', 'Filename', 'Text'], dtype='object')


In [66]:
import pandas as pd
from textblob import TextBlob

# 示例歌词数据
rap_lyrics_data = {
    'Artist': ['Artist1', 'Artist2', 'Artist3'],
    'File': ['file1.txt', 'file2.txt', 'file3.txt'],
    'Filename': ['song1.txt', 'song2.txt', 'song3.txt'],
    'Text': ['Lyrics of song 1', 'Lyrics of song 2', 'Lyrics of song 3']
}

rap_lyrics_df = pd.DataFrame(rap_lyrics_data)

# 计算情感得分并添加 'Mood' 列
rap_lyrics_df['Sentiment'] = rap_lyrics_df['Text'].apply(lambda x: TextBlob(x).sentiment.polarity)
rap_lyrics_df['Mood'] = pd.cut(rap_lyrics_df['Sentiment'], bins=3, labels=['Negative', 'Neutral', 'Positive'])

# 显示更新后的 DataFrame
rap_lyrics_df


Unnamed: 0,Artist,File,Filename,Text,Sentiment,Mood
0,Artist1,file1.txt,song1.txt,Lyrics of song 1,0.0,Neutral
1,Artist2,file2.txt,song2.txt,Lyrics of song 2,0.0,Neutral
2,Artist3,file3.txt,song3.txt,Lyrics of song 3,0.0,Neutral


In [95]:
print(merged_df.columns)


Index(['Artist', 'File', 'Cleaned_File', 'Filename', 'Text',
       'Cleaned_Filename'],
      dtype='object')


Now you can calculate the amount of times, on average, that each part-of-speech appears in Biology versus English papers. To do so, you use the .groupby() and .mean() functions to group all part-of-speech counts from the Biology texts together and calculate the mean usage of each part-of-speech, before doing the same for the English texts. The following code also rounds the counts to the nearest whole number:

In [96]:
# 打印 DataFrame 的列名
print(merged_df.columns)



Index(['Artist', 'File', 'Cleaned_File', 'Filename', 'Text',
       'Cleaned_Filename'],
      dtype='object')


In [97]:
print("Metadata DataFrame:")
print(metadata_df.shape)
print(metadata_df.head())

print("\nLyrics DataFrame:")
print(lyrics_df.shape)
print(lyrics_df.head())


Metadata DataFrame:
(5, 3)
            Artist                        File     Cleaned_File
0      Talib Kweli      Talib Kweli_lyrics.txt      Talib Kweli
1  CunninLynguists  CunninLynguists_lyrics.txt  CunninLynguists
2       Kanye West       Kanye West_lyrics.txt       Kanye West
3    Deniro Farrar    Deniro Farrar_lyrics.txt    Deniro Farrar
4           Eminem           Eminem_lyrics.txt           Eminem

Lyrics DataFrame:
(5, 3)
                 Filename                       Text Cleaned_Filename
0      Talib Kweli_lyrics      Lyrics of Talib Kweli      Talib Kweli
1  CunninLynguists_lyrics  Lyrics of CunninLynguists  CunninLynguists
2       Kanye West_lyrics       Lyrics of Kanye West       Kanye West
3    Deniro Farrar_lyrics    Lyrics of Deniro Farrar    Deniro Farrar
4           Eminem_lyrics           Lyrics of Eminem           Eminem


In [98]:
import pandas as pd

# 示例数据
metadata_data = {
    'Artist': ['Talib Kweli', 'CunninLynguists', 'Kanye West', 'Deniro Farrar', 'Eminem'],
    'File': ['Talib Kweli_lyrics.txt', 'CunninLynguists_lyrics.txt', 'Kanye West_lyrics.txt', 'Deniro Farrar_lyrics.txt', 'Eminem_lyrics.txt']
}

lyrics_data = {
    'Filename': ['Talib Kweli_lyrics', 'CunninLynguists_lyrics', 'Kanye West_lyrics', 'Deniro Farrar_lyrics', 'Eminem_lyrics'],
    'Text': ['Lyrics of Talib Kweli', 'Lyrics of CunninLynguists', 'Lyrics of Kanye West', 'Lyrics of Deniro Farrar', 'Lyrics of Eminem']
}

# 转换为DataFrame
metadata_df = pd.DataFrame(metadata_data)
lyrics_df = pd.DataFrame(lyrics_data)

# 清理文件名并尝试匹配
metadata_df['Cleaned_File'] = metadata_df['File'].str.replace(r'_lyrics\.txt$', '', regex=True)
lyrics_df['Cleaned_Filename'] = lyrics_df['Filename'].str.replace(r'_lyrics$', '', regex=True)

# 合并DataFrame
merged_df = metadata_df.merge(lyrics_df, left_on='Cleaned_File', right_on='Cleaned_Filename')

# 显示结果
print(merged_df[['Artist', 'File', 'Filename', 'Text']])


            Artist                        File                Filename  \
0      Talib Kweli      Talib Kweli_lyrics.txt      Talib Kweli_lyrics   
1  CunninLynguists  CunninLynguists_lyrics.txt  CunninLynguists_lyrics   
2       Kanye West       Kanye West_lyrics.txt       Kanye West_lyrics   
3    Deniro Farrar    Deniro Farrar_lyrics.txt    Deniro Farrar_lyrics   
4           Eminem           Eminem_lyrics.txt           Eminem_lyrics   

                        Text  
0      Lyrics of Talib Kweli  
1  Lyrics of CunninLynguists  
2       Lyrics of Kanye West  
3    Lyrics of Deniro Farrar  
4           Lyrics of Eminem  


In [99]:
# Merge metadata and lyrics_df into new DataFrame
# Will only keep rows where both metadata and lyrics_df are present
merged_df = metadata_df.merge(lyrics_df, left_on='File', right_on='Filename')


In [100]:
average_pos_df = average_pos_df.reset_index(drop=True)
