# BLU09 - Information Extraction

In [3]:
# importing needed packages here

import os
import re
import spacy
import hashlib
import numpy as np
import pandas as pd
import json

from tqdm import tqdm
from collections import Counter
from spacy.matcher import Matcher
from sklearn.metrics import accuracy_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from utils import remove_punctuation, remove_stopwords
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

def _hash(s):
    return hashlib.sha256(json.dumps(str(s)).encode()).hexdigest()

cpu_count = int(os.cpu_count()) if os.cpu_count() != None else 4

In [4]:
train_path = "data/train.csv"
test_path = "data/test.csv"
df_train_base = pd.read_csv(train_path)
df_test_base = pd.read_csv(test_path)

df_train_base.head()

Unnamed: 0,Lyrics,Genre
0,[Intro: Method Man w/ sample] + (Sunny valenti...,Hip Hop
1,[Sean Paul:]. Aye. It's Sean Paul 'long side. ...,Pop
2,You've changed your tune. many times since we'...,Rock
3,I got all these J's rolled up. And got all the...,Hip Hop
4,Look I'm standing naked before you. Don't you ...,Rock


In [5]:
df_train_vectorized = pd.read_csv("data/train_vectorized.csv")
df_test_vectorized = pd.read_csv("data/test_vectorized.csv")
df_train_vectorized = df_train_vectorized.drop(columns="Unnamed: 0")
df_test_vectorized = df_test_vectorized.drop(columns="Unnamed: 0")

In [6]:
df_train_vectorized_carmen = pd.read_parquet("data/features_carmen.parquet", engine="fastparquet")
df_train_vectorized_carmen.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature491,feature492,feature493,feature494,feature495,feature496,feature497,feature498,feature499,feature500
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.064383,0.016288,0.0,0.0,0.025035,0.0,628.0,0.431529,3.39172,0.011031
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040077,0.0,...,0.027639,0.020977,0.0,0.0,0.0,0.0,503.0,0.328032,3.145129,0.340005
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.12957,0.0,0.0,0.0,316.0,0.348101,3.572785,0.140278
3,0.0,0.0,0.0,0.040328,0.0,0.0,0.0,0.0,0.036805,0.0,...,0.0,0.09632,0.0,0.0,0.0,0.0,524.0,0.282443,3.396947,0.01436
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047215,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,203.0,0.408867,3.162562,0.387857


In [7]:
df_test_vectorized.shape

(3354, 100)

In [8]:
df_train = df_train_base.copy()
df_train

Unnamed: 0,Lyrics,Genre
0,[Intro: Method Man w/ sample] + (Sunny valenti...,Hip Hop
1,[Sean Paul:]. Aye. It's Sean Paul 'long side. ...,Pop
2,You've changed your tune. many times since we'...,Rock
3,I got all these J's rolled up. And got all the...,Hip Hop
4,Look I'm standing naked before you. Don't you ...,Rock
...,...,...
39048,It's the end of the movie. And your story's be...,Pop
39049,Uh oh. You live in this lane. Your body's insa...,Hip Hop
39050,Shawty swing my way put that ass all in my fac...,Hip Hop
39051,Born in caught out. Care out fear in. Gear in ...,Rock


In [9]:
nlp = spacy.load('en_core_web_md')
nlp.add_pipe("merge_entities", after="ner")
en_stopwords = nlp.Defaults.stop_words

In [10]:
# docs = list(tqdm(nlp.pipe(df_train["Lyrics"], batch_size=20, n_process=cpu_count-1), total=len(df_train["Lyrics"])))
# docs[:3]

In [11]:
# X_train, X_test, y_train, y_test = train_test_split(
#     df_train_base.drop(columns=['Genre']), 
#     df_train_base['Genre'], 
#     test_size=0.2, 
#     random_state=42,
#     stratify=df_train_base['Genre']
# )

In [12]:
# y_train

In [13]:
def get_precision_recall(y_test, y_pred):
    """Returns the precision and recall of the helpfulness class (label = 1)
    
    Parameters:
        y_test (Series): Labels corresponding to X_test
        y_pred (Series): Predictions corresponding to X_test

    Returns:
        precision (float): The precision score of the helpfulness class (1) on the test data
        recall (float): The recall score of the helpfulness class (1) on the test data
    """

    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    return precision, recall

In [14]:
def train_model_naive_bayes(X_train_vec, y_train, X_test_vec, y_test):
    """Returns a fitted Multinomial Naive Bayes model, the predictions on the test set
    and the precision and recall scores for these predictions
    
    Parameters:
        X_train_vec (Series): Vectorized text data for training
        y_train (Series): Labels corresponding to X_train
        X_test_vec (Series): Vectorized text data for testing
        y_test (Series): Labels corresponding to X_test

    Returns:
        clf (MultinomialNB): MultinomialNB classifier fitted to the vectorized training data
        y_pred (Series): The predictions computed with our classifier
        precision (float): The precision score of the helpfulness class (1) on the test data
        recall (float): The recall score of the helpfulness class (1) on the test data
    """
    
    clf =  MultinomialNB()
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    precision, recall = get_precision_recall(y_test, y_pred)

    return clf, y_pred, precision, recall


In [15]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]

class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]

In [16]:
def average_word_length(text):
    words = text.split()
    return sum(len(word) for word in words) / len(words) if words else 0

def average_sentence_length(doc):
    sentences = [sent.text for sent in doc.sents]
    return sum(len(sent.split()) for sent in sentences) / len(sentences) if sentences else 0

def count_adjectives(doc):
    adjectives = [token.text for token in doc if token.pos_ == "ADJ"]
    return len(adjectives)

def count_unique_words(doc):
    unique_words = set(token.text.lower() for token in doc if token.is_alpha)
    return len(unique_words)

swear_words = [
    "fuck", "shit", "bitch", "asshole", "dick", "cunt", "bastard",
    "motherfucker", "cock", "piss", "twat", "ass", "damn", "hell",
    "bollocks", "arsehole", "wanker", "prick", "slut", "whore", "fucking", "mufuckas", "fuckin", "motherfuckin",
    "muthafuckin", "nigga", "niggas"
] 
def count_swear_words(text):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(text.lower())
    swear_word_count = sum(word in swear_words for word in words)
    return swear_word_count

pop_words = ['love', 'kiss', 'baby', 'dance', 'oh']

def count_pop_words(text):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(text.lower())
    pop_words_count = sum(word in pop_words for word in words)
    return pop_words_count

def sentiment_score_extractor(text):
    return TextBlob(text).sentiment.polarity

def unique_word_density(text):
    return len(set(text.split())) / len(text.split()) if len(text.split()) > 0 else 0


In [17]:
docs = list(tqdm(nlp.pipe(df_train_base["Lyrics"], batch_size=20, n_process=cpu_count-1), total=len(df_train_base["Lyrics"])))
docs[:3]

100%|██████████| 39053/39053 [09:31<00:00, 68.32it/s] 


[[Intro: Method Man w/ sample] + (Sunny valentine). We got butter (8X). (The gun'll go the gun'll go.... The gun'll go...). [Raekwon]. Aiyo one thing for sure keep you of all. Keep a nice crib fly away keep to the point. Keep niggaz outta ya face who snakes. Keep bitches in they place keep the mac in a special place. Keep moving for papes keep cool keep doing what you doing. Keep it fly keep me in the crates. Cuz I will erase shit on the real note you'se a waste. It's right here for you I will lace you. Rip you and brace you put a nice W up on ya face. Word to mother you could get chased. It's nothing to taste blood on a thug if he gotta go. All I know is we be giving grace. This is a place from where we make tapes. We make 'em everywhere still in all we be making base. Y'all be making paste these little niggaz they be making shapes. Our shit is art yours is traced. [Chorus: Sunny Valentine]. This is the way that we rolling in the streets. You know when we roll we be packing that heat.

In [18]:
docs_test = list(tqdm(nlp.pipe(df_test_base["Lyrics"], batch_size=20, n_process=cpu_count-1), total=len(df_test_base["Lyrics"])))
docs_test[:3]

100%|██████████| 3354/3354 [01:08<00:00, 48.96it/s] 


[I was nineteen when I came to town they called it the Summer of Love. They were burning babies burning flags. The hawks against the doves. I took a job in the steamie down on Cauldrum Street. And I fell in love with a laundry girl who was working next to me. Oh she was a rare thing fine as a bee's wing. So fine a breath of wind might blow her away. She was a lost child oh she was running wild. She said "As long as there's no price on love I'll stay.. And you wouldn't want me any other way". Brown hair zig-zag around her face and a look of half-surprise. Like a fox caught in the headlights there was animal in her eyes. She said "Young man oh can't you see I'm not the factory kind. If you don't take me out of here I'll surely lose my mind". Oh she was a rare thing fine as a bee's wing. So fine that I might crush her where she lay. She was a lost child she was running wild. She said "As long as there's no price on love I'll stay.. And you wouldn't want me any other way". We busked around

In [30]:
def pre_process(df, docs_, vectorized):
    df_processed = df.copy()
    df_processed["Lyrics"] = df_processed["Lyrics"].apply(remove_punctuation)
    df_processed["Lyrics"] = df_processed["Lyrics"].apply(remove_stopwords, 
                                                      stopwords = en_stopwords, 
                                                      tokenizer = WordPunctTokenizer())
    # df_processed["nb_words"] = df_processed["Lyrics"].apply(lambda text: len(text.split()))
    df_processed["doc_length"] = df_processed["Lyrics"].apply(lambda text: len(text))
    df_processed['swear_words_count'] = df_processed['Lyrics'].apply(count_swear_words)
    
    df_processed['pop_words_count'] = df_processed['Lyrics'].apply(count_pop_words)
    # df_processed["avg_word_length"] = df_processed["Lyrics"].apply(average_word_length)
    
    df_processed['sentiment_score'] = df_processed['Lyrics'].apply(sentiment_score_extractor)
    
    df_processed['unique_word_density'] = df_processed['Lyrics'].apply(unique_word_density)
    
    df_processed['adj_count'] = [count_adjectives(doc) for doc in docs_]
    # df_processed["avg_sentence_length"] = [sum(len(sent) for sent in doc.sents) / len(list(doc.sents)) for doc in docs_]
    
    df_processed['unique_word_count'] = [count_unique_words(doc) for doc in docs_]
    
    df_processed = pd.concat([df_processed, vectorized], axis=1)
    
    # df_processed = pd.concat([df_processed, c_vectorized], axis=1)
    
    # df_processed = df_processed.drop(columns="Lyrics")
    
    return df_processed

In [31]:
df_processed = df_train_base.copy()
df_processed = pre_process(df_processed, docs, df_train_vectorized)
df_test_processed = pre_process(df_test_base.copy(), docs_test, df_test_vectorized)

In [21]:
df_processed.head()

Unnamed: 0,Lyrics,Genre,doc_length,swear_words_count,pop_words_count,sentiment_score,unique_word_density,adj_count,unique_word_count,0,...,90,91,92,93,94,95,96,97,98,99
0,intro method man w sample sunny valentine got ...,Hip Hop,1602,5,0,0.052575,0.737226,24,253,-0.136609,...,-0.027427,0.102448,-0.190991,-0.315929,0.16753,-0.343603,-0.142754,-0.543318,-0.231015,0.698872
1,sean paul aye sean paul long mandem called jay...,Pop,1010,1,16,0.293681,0.505051,17,144,-0.566853,...,0.17699,-0.680539,0.437265,0.02949,0.325816,-0.393507,0.217912,0.085732,0.803546,0.631783
2,youve changed tune times weve met ill recogniz...,Rock,775,0,0,0.1375,0.462121,14,106,0.445194,...,-0.150421,-0.704043,0.435571,0.133274,0.570402,-0.13881,0.194687,-0.795902,-0.058599,-0.51565
3,got js rolled got drinks poured bought bottles...,Hip Hop,1189,2,2,0.002126,0.478049,10,142,0.42876,...,0.204813,-0.295936,0.591472,0.269355,0.139109,-1.660812,-0.344285,-0.867631,-0.044837,0.059782
4,look im standing naked dont want sex scream lo...,Rock,436,0,14,0.376923,0.5,10,78,-0.530487,...,0.226589,-1.219079,0.298557,0.031436,-0.121683,0.099747,0.525335,0.087373,0.575657,0.224653


In [22]:
text_pipe = Pipeline([
    ('selector', TextSelector(key='Lyrics')),
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=6))
])

# nb_words_pipe = Pipeline([
#     ('selector', NumberSelector(key='nb_words')),
#     ('scaler', StandardScaler())
# ])

unique_word_count_pipe = Pipeline([
    ('selector', NumberSelector(key='unique_word_count')),
    ('scaler', StandardScaler())
])

swear_words_count_pipe = Pipeline([
    ('selector', NumberSelector(key='swear_words_count')),
    ('scaler', StandardScaler())
])

pop_words_count_pipe = Pipeline([
    ('selector', NumberSelector(key='pop_words_count')),
    ('scaler', StandardScaler())
])

sentiment_score_pipe = Pipeline([
    ('selector', NumberSelector(key='sentiment_score')),
    ('scaler', StandardScaler())
])

unique_word_density_pipe = Pipeline([
    ('selector', NumberSelector(key='unique_word_density')),
    ('scaler', StandardScaler())
])

doc_length_pipe = Pipeline([
    ('selector', NumberSelector(key='doc_length')),
    ('scaler', StandardScaler())
])

adj_count_pipe = Pipeline([
    ('selector', NumberSelector(key='adj_count')),
    ('scaler', StandardScaler())
])

# avg_word_length_pipe = Pipeline([
#     ('selector', NumberSelector(key='avg_word_length')),
#     ('scaler', StandardScaler())
# ])

# avg_sentence_length_pipe = Pipeline([
#     ('selector', NumberSelector(key='avg_sentence_length')),
#     ('scaler', StandardScaler())
# ])

feats = FeatureUnion([
    ('Lyrics', text_pipe),
    # ('nb_words', nb_words_pipe),
    # ('doc_length', doc_length_pipe),
    ('adj_count', adj_count_pipe),
    ('swear_words_count', swear_words_count_pipe),
    ('unique_word_count', unique_word_count_pipe),
    ('pop_word_count', pop_words_count_pipe),
    ('sentiment_score', sentiment_score_pipe),
    ('unique_word_density', unique_word_density_pipe),
    # ('drop_column', ColumnTransformer([
    #     ('drop_lyrics', FunctionTransformer(lambda X: X.drop(columns=['Lyrics']), validate=False), [])
    # ], remainder='passthrough'))
    # ('avg_word_length', avg_word_length_pipe),
    # ('avg_sentence_length', avg_sentence_length_pipe)
])

In [23]:
def predict(X_train, y_train, X_test, feats, clf):
    pipe = Pipeline([
        ('feats', feats),
        # ('feature_selection', SelectKBest(mutual_info_classif, k=500)),  
        ('clf', clf)
    ])
    
    pipe.fit(X_train, y_train)
    
    y_pred = pipe.predict(X_test)
    return pipe, y_pred
    
def improved_pipeline(feats, X_train, X_test, y_train, y_test, clf):
    """
    Creates a pipeline with the provided feature union and a Random Forest classifier.
    Fits the pipeline to the train data and makes a prediction with the test data.
    Outputs the fitted pipeline and the accuracy of the prediction.
    """
    pipe, y_pred = predict(X_train, y_train, X_test, feats, clf)
    
    f1, report, matrix = get_scores(y_pred, y_test)

    return pipe, f1, report, matrix, y_pred

In [24]:
def get_scores(y_test, y_pred):
    # Specify average='macro' for multiclass classification
    f1_score_true_label = f1_score(y_test, y_pred, average='weighted')
    classification_report_pred = classification_report(y_test, y_pred)
    confusion_matrix_pred = confusion_matrix(y_test, y_pred)

    return f1_score_true_label, classification_report_pred, confusion_matrix_pred

In [32]:
df_processed.head()

Unnamed: 0,Lyrics,Genre,doc_length,swear_words_count,pop_words_count,sentiment_score,unique_word_density,adj_count,unique_word_count,0,...,90,91,92,93,94,95,96,97,98,99
0,intro method man w sample sunny valentine got ...,Hip Hop,1602,5,0,0.052575,0.737226,24,253,-0.136609,...,-0.027427,0.102448,-0.190991,-0.315929,0.16753,-0.343603,-0.142754,-0.543318,-0.231015,0.698872
1,sean paul aye sean paul long mandem called jay...,Pop,1010,1,16,0.293681,0.505051,17,144,-0.566853,...,0.17699,-0.680539,0.437265,0.02949,0.325816,-0.393507,0.217912,0.085732,0.803546,0.631783
2,youve changed tune times weve met ill recogniz...,Rock,775,0,0,0.1375,0.462121,14,106,0.445194,...,-0.150421,-0.704043,0.435571,0.133274,0.570402,-0.13881,0.194687,-0.795902,-0.058599,-0.51565
3,got js rolled got drinks poured bought bottles...,Hip Hop,1189,2,2,0.002126,0.478049,10,142,0.42876,...,0.204813,-0.295936,0.591472,0.269355,0.139109,-1.660812,-0.344285,-0.867631,-0.044837,0.059782
4,look im standing naked dont want sex scream lo...,Rock,436,0,14,0.376923,0.5,10,78,-0.530487,...,0.226589,-1.219079,0.298557,0.031436,-0.121683,0.099747,0.525335,0.087373,0.575657,0.224653


In [25]:
Y = df_processed["Genre"]
X = df_processed.drop(columns=["Genre"])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
pipeline_model, f1, report, matrix, y_pred_rand = improved_pipeline(feats, X_train, X_test, y_train, y_test, RandomForestClassifier())
print(f"F1 Score: {f1}\n")
print(f"Classification Report:\n {report}\n")
print(f"Confusion Matrix:\n {matrix}\n")

F1 Score: 0.7578036309205398

Classification Report:
               precision    recall  f1-score   support

     Hip Hop       0.79      0.90      0.84      1572
         Pop       0.20      0.72      0.32       634
        Rock       0.97      0.66      0.78      5605

    accuracy                           0.71      7811
   macro avg       0.66      0.76      0.65      7811
weighted avg       0.87      0.71      0.76      7811


Confusion Matrix:
 [[1407  126   39]
 [ 110  454   70]
 [ 263 1652 3690]]


In [29]:
y_train_labeled = y_train.copy()
y_train_labeled = y_train_labeled.map({'Hip Hop': 0, 'Rock': 1, 'Pop': 2})
y_test_labeled = y_test.copy()
y_test_labeled = y_test_labeled.map({'Hip Hop': 0, 'Rock': 1, 'Pop': 2})
pipeline_model, f1, report, matrix, y_pred_xgb = improved_pipeline(feats, X_train, X_test, y_train_labeled, y_test_labeled, XGBClassifier(eval_metric='mlogloss'))
print(f"F1 Score: {f1}\n")
print(f"Classification Report:\n {report}\n")
print(f"Confusion Matrix:\n {matrix}\n")

F1 Score: 0.7608700366287986

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86      1609
           1       0.90      0.73      0.80      4688
           2       0.44      0.65      0.53      1514

    accuracy                           0.75      7811
   macro avg       0.72      0.76      0.73      7811
weighted avg       0.79      0.75      0.76      7811


Confusion Matrix:
 [[1452   35  122]
 [ 159 3408 1121]
 [ 169  356  989]]


In [27]:
y_train_labeled = y_train.copy()
y_train_labeled = y_train_labeled.map({'Hip Hop': 0, 'Rock': 1, 'Pop': 2})
y_test_labeled = y_test.copy()
y_test_labeled = y_test_labeled.map({'Hip Hop': 0, 'Rock': 1, 'Pop': 2})
pipeline_model, f1, report, matrix, y_pred_logi = improved_pipeline(feats, X_train, X_test, y_train_labeled, y_test_labeled, LogisticRegression())
print(f"F1 Score: {f1}\n")
print(f"Classification Report:\n {report}\n")
print(f"Confusion Matrix:\n {matrix}\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score: 0.7439284569886024

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.90      0.84      1543
           1       0.87      0.73      0.79      4547
           2       0.47      0.61      0.53      1721

    accuracy                           0.74      7811
   macro avg       0.71      0.75      0.72      7811
weighted avg       0.76      0.74      0.74      7811


Confusion Matrix:
 [[1396   45  102]
 [ 154 3308 1085]
 [ 230  446 1045]]


In [36]:
pipe, y_pred = predict(X_train=df_processed.drop(columns="Genre") ,
                       y_train=df_processed["Genre"].map({'Hip Hop': 0, 'Rock': 1, 'Pop': 2}),
                       X_test=df_test_processed,
                       feats=feats, 
                       clf=XGBClassifier(eval_metric='mlogloss'))

In [43]:
y_pred_mapped = pd.DataFrame(y_pred, columns=["Genre"])["Genre"].map({0: 'Hip Hop', 1: 'Rock', 2: 'Pop'})

In [44]:
y_pred_mapped

0       Rock
1       Rock
2       Rock
3        Pop
4       Rock
        ... 
3349    Rock
3350    Rock
3351    Rock
3352    Rock
3353    Rock
Name: Genre, Length: 3354, dtype: object

In [45]:
df_sub = pd.DataFrame(y_pred_mapped, columns=["Genre"])#.to_csv("team_4_13_14.csv")


In [46]:
df_sub.to_csv("team_4_16_36.csv", index=False)

In [28]:
df_processed.groupby('Genre')['adj_count'].mean()
# unique_word_count

Genre
Hip Hop    24.670038
Pop        13.236069
Rock       10.541357
Name: adj_count, dtype: float64