## NLP Group Assignment

In [1]:
#Basic libraries for Python
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import seaborn as sns
sns.set_style("whitegrid")
import altair as alt

import warnings

In [2]:
#NLP required libraries
import nltk
import sklearn_crfsuite
import eli5
from sklearn import metrics
from sklearn import pipeline
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tree import *
from nltk.draw import tree
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer as TfidV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from collections import Counter

from gensim.models.word2vec import Word2Vec
from gensim.models.tfidfmodel import TfidfModel

Using TensorFlow backend.


In [3]:
# NLTK resources fetched by this notebook (the downloader reports packages
# that are already present as up-to-date instead of re-downloading them).
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'stopwords',
    'wordnet',
)

# nltk.download returns a success flag. Building the full list first makes
# sure every resource is attempted, and all(...) makes the cell output reflect
# every download rather than only the last one.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amand\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\amand\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\amand\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-

True

## Loading Data

In [58]:
# Load the lyrics dataset from a CSV file located in the notebook's working
# directory into the frame used by the rest of the notebook.
df = pd.read_csv("lyrics.csv")

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [7]:
# Non-null count per column; columns with fewer entries than the row count
# contain missing values.
df.count()

index     362237
song      362235
year      362237
artist    362237
genre     362237
lyrics    266557
dtype: int64

In [8]:
# (number of rows, number of columns) of the frame.
df.shape

(362237, 6)

In [12]:
# Summary of the genre column: count, number of distinct labels, most frequent
# label and its frequency.
df['genre'].describe()

count     362237
unique        12
top         Rock
freq      131377
Name: genre, dtype: object

In [15]:
# Number of rows per genre label, most frequent first.
df['genre'].value_counts()

Rock             131377
Pop               49444
Hip-Hop           33965
Not Available     29814
Metal             28408
Other             23683
Country           17286
Jazz              17147
Electronic        16205
R&B                5935
Indie              5732
Folk               3241
Name: genre, dtype: int64

In [24]:
# Frequency of each distinct value in the lyrics column.
# NOTE(review): the saved output of this cell lists numeric values (12.0,
# 14.0, ...) rather than lyric text, so it was presumably produced from a
# different kernel state -- confirm by re-running after Restart & Run All.
df['lyrics'].value_counts()

12.0      2311
14.0      1371
701.0      269
621.0      266
754.0      262
          ... 
4773.0       1
5840.0       1
4982.0       1
5536.0       1
4594.0       1
Name: lyrics, Length: 5333, dtype: int64

In [31]:
# Number of rows whose lyrics are missing.
df['lyrics'].isna().sum()

95680

## Preprocessing

In [59]:
# Replace every newline character in the lyrics with a single space.
# The column is overwritten in place on df.
df['lyrics'] = df['lyrics'].str.replace("\n", " ")

In [61]:
# Convert the lyrics to lowercase. The column is overwritten in place on df.
df['lyrics'] = df['lyrics'].str.lower()

In [62]:
# Punctuation signs: remove them from the lyrics, as they are not expected to
# carry any predictive power.
#
# regex=False forces a literal match for every sign. Without it the result
# depends on the installed pandas version's default for `regex`; interpreted
# as regular expressions, "?" is an invalid pattern and "." matches any
# character.
punctuation_signs = list("?:!.,;@")
for punct_sign in punctuation_signs:
    df['lyrics'] = df['lyrics'].str.replace(punct_sign, '', regex=False)

## Sentiment Analysis

In [63]:
# Work on an independent copy: a plain `df_copy = df` only creates a second
# name for the same object, so columns added to df_copy would also appear on
# df.
df_copy = df.copy()

In [69]:
# Add a 'sentiment' column filled with None as a placeholder, then preview the
# frame to confirm the new column is present.
df_copy['sentiment'] = None
df_copy.head()

Unnamed: 0,index,song,year,artist,genre,lyrics,sentiment
0,0,ego-remix,2009,beyonce-knowles,Pop,oh baby how you doing you know i'm gonna cut r...,
1,1,then-tell-me,2009,beyonce-knowles,Pop,playin' everything so easy it's like you seem ...,
2,2,honesty,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,oh oh oh i oh oh oh i [verse 1] if i wrote a b...,
4,4,black-culture,2009,beyonce-knowles,Pop,party the people the people the party it's pop...,


In [70]:
def recode_sentiment(series):
    """Map a single genre label to a coarse sentiment class.

    Despite its name, ``series`` is one scalar value (the function is used
    element-wise through ``Series.apply``), not a whole pandas Series.

    The comparison ignores case and surrounding whitespace, so the dataset's
    'Rock' label is recognised as well as 'rock'.

    Parameters
    ----------
    series : object
        One label. Non-string values (None, NaN) are treated as "not rock".

    Returns
    -------
    str
        'negative' when the label is rock, 'positive' otherwise.
    """
    if isinstance(series, str) and series.strip().lower() == 'rock':
        return 'negative'
    return 'positive'
    
# Derive the label from the genre column. Applying the function to the
# 'sentiment' column itself (which holds only None at this point) can never
# reach the 'negative' branch. Genre labels are lowercased first because the
# dataset stores them capitalised (e.g. 'Rock').
df_copy['sentiment'] = df_copy['genre'].str.lower().apply(recode_sentiment)
df_copy.head()

Unnamed: 0,index,song,year,artist,genre,lyrics,sentiment
0,0,ego-remix,2009,beyonce-knowles,Pop,oh baby how you doing you know i'm gonna cut r...,positive
1,1,then-tell-me,2009,beyonce-knowles,Pop,playin' everything so easy it's like you seem ...,positive
2,2,honesty,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,positive
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,oh oh oh i oh oh oh i [verse 1] if i wrote a b...,positive
4,4,black-culture,2009,beyonce-knowles,Pop,party the people the people the party it's pop...,positive


## PoS Tagging

In [74]:
def get_pos_features(words):
    """Build binary part-of-speech features for a list of tokens.

    The tokens are tagged with ``nltk.pos_tag``; every (token, tag) pair
    becomes a key of the form ``has(<token>)_<TAG>`` mapped to 1. Repeated
    pairs collapse into a single key.
    """
    return {
        ('has(%s)' % token) + '_' + pos: 1
        for token, pos in nltk.pos_tag(words)
    }

In [75]:
# Wrapper function for the extraction of features
def extract_features(text):
    """Wrapper that extracts the combined feature dictionary for one text.

    The text is first passed through ``processAll``; the result is then fed
    to the word, negation and part-of-speech extractors, in that order. The
    three outputs are merged into one dictionary, with later extractors
    overriding earlier ones on key collisions.
    """
    tokens = processAll(text)

    # Start from the word features, then layer the other groups on top.
    combined = dict(get_word_features(tokens))
    combined.update(get_negation_features(tokens))
    combined.update(get_pos_features(tokens))
    return combined