In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import nltk


In [2]:
#loading data set

# Song lyrics table; read from a path relative to the notebook's working directory.
dataset_lyrics = pd.read_csv(filepath_or_buffer="../raw_data/labeled_lyrics_cleaned.csv")
# Emotion-labelled text table that the preprocessing cells below operate on.
dataset = pd.read_csv(filepath_or_buffer="../raw_data/combined_happysadangry.csv")



In [3]:
# Preview the first five rows of the emotion-labelled text table.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,word_label,source
0,0,i didnt feel humiliated,sad,HuggingFace
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
3,4,i am feeling grouchy,angry,HuggingFace
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace


In [4]:
# Preview the first five rows of the lyrics table.
dataset_lyrics.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371


In [5]:
# Dealing With Missing Values

# A notebook cell renders only its final expression, so each per-column NaN
# count is displayed explicitly; otherwise the summary for `dataset` would be
# computed and then silently discarded.
display(dataset.isna().sum())
display(dataset_lyrics.isna().sum())


Unnamed: 0    0
artist        0
seq           0
song          0
label         0
dtype: int64

# Data Cleaning And Text Preprocessing.

I. Removing URL.

II. Removing all irrelevant characters (Numbers and Punctuation).

III. Convert all characters into lowercase.

IV. Tokenization // NLTK

V. Removing Stopwords

VI. Stemming and Lemmatization

VII. Remove words with length <= 2

VIII. Convert the list of tokens back into a string


In [6]:
# Removing URL

import re

def clean_url(review_text):
    """Strip URLs from a piece of text.

    Matches both ``http:`` and ``https:`` links; each match runs up to the
    next whitespace character and is replaced by the empty string.

    Parameters
    ----------
    review_text : str
        Raw text that may contain URLs.

    Returns
    -------
    str
        The text with every matched URL removed. Whitespace that surrounded a
        URL is left untouched.
    """
    # The optional "s" makes the pattern cover https links as well as http.
    return re.sub(r'https?:\S+', '', review_text)

# Seed the working column `review_text` from the raw `text` column, URL-stripped.
dataset['review_text'] = dataset['text'].map(clean_url)
#dataset_lyrics['review_seq'] = dataset_lyrics['seq'].apply(clean_url)

In [7]:
# Removing all irrelevant characters

def clean_non_alphanumeric(review_text):
    """Replace every character that is not an ASCII letter with a space.

    Digits, punctuation and any other non-letter characters are substituted by
    a single space rather than removed, so that word boundaries survive and
    the later tokenization step still sees separate words.

    Parameters
    ----------
    review_text : str
        Text to filter.

    Returns
    -------
    str
        Text containing only the characters a-z, A-Z and spaces.
    """
    return re.sub('[^a-zA-Z]', ' ', review_text)

# Apply the character filter defined in this cell to the working column.
dataset['review_text'] = dataset['review_text'].apply(clean_non_alphanumeric)
#dataset_lyrics['review_seq'] = dataset_lyrics['review_seq'].apply(clean_non_alphanumeric)

In [8]:
# Convert all characters into lowercase

def clean_non_lowercase(review_text):
    """Return the lowercase string form of ``review_text``.

    The input is passed through ``str()`` first, so non-string values are
    converted to their string representation before being lowercased.
    """
    as_text = str(review_text)
    return as_text.lower()

# Lowercase the working column in place.
dataset['review_text'] = dataset['review_text'].map(clean_non_lowercase)
#dataset_lyrics['review_seq'] = dataset_lyrics['review_seq'].apply(clean_non_lowercase)

In [9]:
#Using nltk // Tokenize

from nltk.tokenize import word_tokenize

def clean_token(review_text):
    """Tokenize ``review_text`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(review_text)
    return tokens

# From this point on `review_text` holds the tokens returned by clean_token, not strings.
dataset['review_text'] = dataset['review_text'].map(clean_token)
#dataset_lyrics['review_seq'] = dataset_lyrics['review_seq'].apply(clean_token)




In [10]:
# Removing Stopwords  ¿Customize?
from nltk.corpus import stopwords

# Load the English stopword list once and keep it as a set, which is what the
# membership test in clean_stopwords checks against.
stop_words = set(stopwords.words('english'))

def clean_stopwords(token):
    """Return the items of ``token`` that are not in the module-level ``stop_words``.

    The relative order of the surviving items is preserved.
    """
    kept = []
    for word in token:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Keep the stopword-free tokens in their own column; `review_text` still has the full token list.
dataset['review_text_sw'] = dataset['review_text'].map(clean_stopwords)



In [11]:
# reducing the inflectional forms of each word

#Stemming 
from nltk.stem import PorterStemmer

# One stemmer instance shared by every call to clean_stem.
stemmer = PorterStemmer()

def clean_stem(token):
    """Return a list with the Porter stem of each item in ``token``."""
    return list(map(stemmer.stem, token))

# Stem both token columns: the stopword-filtered one and the full one.
dataset['review_text_sw'] = dataset['review_text_sw'].map(clean_stem)
dataset['review_text'] = dataset['review_text'].map(clean_stem)

In [12]:
#Lemmatization

from nltk.stem import WordNetLemmatizer
# One lemmatizer instance shared by every call to clean_lemma.
lemma = WordNetLemmatizer()

def clean_lemma(token):
    """Return a list with each item of ``token`` lemmatized using ``pos='v'``."""
    lemmatized = []
    for w in token:
        lemmatized.append(lemma.lemmatize(word=w, pos='v'))
    return lemmatized

# Lemmatize both token columns (they were stemmed in the previous cell).
dataset['review_text_sw'] = dataset['review_text_sw'].map(clean_lemma)
dataset['review_text'] = dataset['review_text'].map(clean_lemma)

In [14]:
#Remove the words having length <= 2

def clean_lengh(token):
    """Return the items of ``token`` whose length is greater than 2, in order."""
    return list(filter(lambda word: len(word) > 2, token))

# Short-token filter is applied to the full token column and stored separately.
dataset['review_text_2'] = dataset['review_text'].map(clean_lengh)



In [15]:
# Convert the list of tokens back into a string

def convert_to_string(listReview):
    """Join the strings in ``listReview`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(listReview)

# Turn all three token columns back into space-separated strings.
dataset['review_text_sw'] = dataset['review_text_sw'].map(convert_to_string)
dataset['review_text_2'] = dataset['review_text_2'].map(convert_to_string)
dataset['review_text'] = dataset['review_text'].map(convert_to_string)

In [16]:
# Inspect the first five rows, including the derived text columns.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,word_label,source,review_text,review_text_sw,review_text_sw_2
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humili,didnt feel humili,didnt feel humili
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feel so hopeless to so damn hope...,go feel hopeless damn hope around someon care ...,feel hopeless damn hope around someon care awak
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grab a minut to post i feel greedi wrong,im grab minut post feel greedi wrong,grab minut post feel greedi wrong
3,4,i am feeling grouchy,angry,HuggingFace,i be feel grouchi,feel grouchi,feel grouchi
4,5,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive be feel a littl burden late wasnt sure whi...,ive feel littl burden late wasnt sure,ive feel littl burden late wasnt sure
