In [1]:
#Basic Python Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import time
#Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#Scikit-Learn (Machine Learning Library for Python)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#Evaluation Metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from scikitplot.metrics import plot_confusion_matrix
import spacy
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
dev = pd.read_csv('../input/meld-dataset/MELD-RAW/MELD.Raw/dev_sent_emo.csv')
train = pd.read_csv('../input/meld-dataset/MELD-RAW/MELD.Raw/train/train_sent_emo.csv')
test = pd.read_csv('../input/meld-dataset/MELD-RAW/MELD.Raw/test_sent_emo.csv')

In [3]:
#dev.head()
#train.head()
test.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,Why do all youre coffee mugs have numbers on ...,Mark,surprise,positive,0,0,3,19,"00:14:38,127","00:14:40,378"
1,2,Oh. Thats so Monica can keep track. That way ...,Rachel,anger,negative,0,1,3,19,"00:14:40,629","00:14:47,385"
2,3,Y'know what?,Rachel,neutral,neutral,0,2,3,19,"00:14:56,353","00:14:57,520"
3,19,"Come on, Lydia, you can do it.",Joey,neutral,neutral,1,0,1,23,"0:10:44,769","0:10:46,146"
4,20,Push!,Joey,joy,positive,1,1,1,23,"0:10:46,146","0:10:46,833"


In [4]:
train_dev = pd.concat([train,dev])
train_dev

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So lets talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
...,...,...,...,...,...,...,...,...,...,...,...
1104,1174,No.,Monica,sadness,negative,113,9,6,2,"00:19:28,792","00:19:29,876"
1105,1175,What? Oh my God! Im gonna miss you so much!,Rachel,sadness,negative,113,10,6,2,"00:19:33,213","00:19:35,965"
1106,1176,Im gonna miss you!,Monica,sadness,negative,113,11,6,2,"00:19:36,175","00:19:37,967"
1107,1177,I mean its the end of an era!,Rachel,sadness,negative,113,12,6,2,"00:19:39,094","00:19:40,928"


In [5]:
train_dev.reset_index(inplace=True,drop=True)

In [6]:
sent = train_dev[['Utterance','Sentiment']]

In [7]:
def custom_encoder(df):
    df.replace(to_replace ="positive", value = 1, inplace=True)
    df.replace(to_replace ="neutral", value = 0, inplace=True)
    df.replace(to_replace ="negative", value = -1, inplace=True)

In [8]:
custom_encoder(sent['Sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [9]:
#Creating an object of WordNetLemmatizer
lm = WordNetLemmatizer()

In [10]:
def data_preprocessing(text_col):
    corpus = []
    for row in text_col:
        new_row = re.sub('[^a-zA-Z]',' ',str(row)).lower().split()
        new_row = [lm.lemmatize(word) for word in new_row if word not in set(stopwords.words('english'))]
        corpus.append(' '.join(str(x) for x in new_row))
    return corpus

In [11]:
transformed = data_preprocessing(sent['Utterance'])

In [12]:
#transformed

In [13]:
tr_df = pd.DataFrame(zip(transformed,sent['Sentiment']), columns= ['Utterance','Sentiment'])

In [14]:
def get_idx(df):   
    indexes = []
    for i,dialogue in enumerate(df):
        if len(dialogue) == 0:
            empty = df.index
            #print(empty)
            indexes.append(i)
    return indexes

In [15]:
empty_idx = get_idx(tr_df['Utterance'])

In [16]:
clean_df = tr_df.drop(empty_idx)
clean_df.reset_index()

Unnamed: 0,index,Utterance,Sentiment
0,0,also point person company transition kl gr system,0
1,1,must hand full,0
2,3,let talk little bit duty,0
3,4,duty right,1
4,5,heading whole division lot duty,0
...,...,...,...
10535,11092,mean gonna live together anymore,-1
10536,11094,oh god gonna miss much,-1
10537,11095,gonna miss,-1
10538,11096,mean end era,-1


In [17]:
text = clean_df['Utterance'].to_list()

In [18]:
text

['also point person company transition kl gr system',
 'must hand full',
 'let talk little bit duty',
 'duty right',
 'heading whole division lot duty',
 'see',
 'perhaps people dump certain amount',
 'good know',
 'go detail',
 'beg',
 'right definite answer monday think say confidence fit well',
 'really',
 'absolutely relax',
 'waitress went last month',
 'know forget',
 'talking',
 'actually know',
 'ok',
 'right well',
 'yeah sure',
 'hey mon',
 'hey hey hey wanna hear something suck',
 'ever',
 'chris say closing bar',
 'way',
 'yeah apparently turning kinda coffee place',
 'coffee gonna hang',
 'got',
 'get beer',
 'hey pick roommate',
 'betcha',
 'italian guy',
 'um mm yeah right',
 'oh god oh god poor monica',
 'wrote poem',
 'look vessel empty nothing inside',
 'touched seem emptier still',
 'think monica empty empty vase',
 'oh totally oh god oh seemed happy',
 'done',
 'hey',
 'hi',
 'ah know building paper route',
 'oh',
 'hi',
 'hi',
 'go',
 'oh well woman interviewed pre

In [19]:
#clean_df['Sentiment'].value_counts()

## Topic Modeling with LDA in Sklearn

In [20]:
tfidf = TfidfVectorizer(use_idf=True, norm= 'l1')
traindata = tfidf.fit_transform(text)

In [21]:
#len(tfidf.get_feature_names())
type(traindata)

scipy.sparse.csr.csr_matrix

### Creating the vocab to represent all the corpus

In [22]:
vocab_tfidf = tfidf.get_feature_names()
len(vocab_tfidf)

4928

### Implementing LDA

In [23]:
# Instantiating LDA
lda_model = LatentDirichletAllocation(n_components=6, max_iter = 20, random_state = 42, batch_size= 500)

# Fitting and transforming model on our tfidf vectorizer to get topics
x_topics = lda_model.fit_transform(traindata)

# Checking topic distribution
topic_words = lda_model.components_

In [24]:
topic_words.shape

(6, 4928)

In [25]:
topic_words

array([[0.16672491, 0.58265694, 0.16667513, ..., 0.34705817, 0.83804244,
        0.27151967],
       [0.16671725, 0.16667686, 0.1666736 , ..., 0.16668495, 0.16675655,
        0.16670535],
       [0.1667268 , 0.16667668, 0.64975365, ..., 0.16669075, 0.16669473,
        0.16669798],
       [0.17397972, 0.16667415, 0.16667527, ..., 0.16701279, 0.62054375,
        0.16683166],
       [0.29279431, 0.16667604, 0.16667465, ..., 0.1666878 , 0.16851304,
        0.16696818],
       [0.16698611, 0.16667531, 0.16667475, ..., 0.16668589, 0.16669356,
        0.166696  ]])

In [26]:
# defining the number of words to print in every topic
n_top_words = 6

for i, topic_dist in enumerate(topic_words):
    #sorting the indices so in the topic_words array
    sorted_topic_dist = np.argsort(topic_dist)
    
    #Fetching the actual words for the sorted indices above
    actual_topic_words = np.array(vocab_tfidf)[sorted_topic_dist]
    
    #Showing top n_top_words per topic
    n_top_topic_words = actual_topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), n_top_topic_words)

Topic 1 ['huh' 'mean' 'sure' 'chandler' 'thanks']
Topic 2 ['yeah' 'okay' 'oh' 'know' 'god']
Topic 3 ['great' 'thank' 'wait' 'love' 'ohh']
Topic 4 ['sorry' 'come' 'yes' 'tell' 'think']
Topic 5 ['joey' 'good' 'going' 'phoebe' 'ok']
Topic 6 ['hey' 'really' 'right' 'hi' 'see']


In [27]:
actual_topic_words

array(['laugh', 'dock', 'awesome', ..., 'right', 'really', 'hey'],
      dtype='<U25')

In [28]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(traindata)  

# iterating over ever value till the end value
for n in range(20):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 4
Document 2  -- Topic: 2
Document 3  -- Topic: 3
Document 4  -- Topic: 3
Document 5  -- Topic: 3
Document 6  -- Topic: 5
Document 7  -- Topic: 4
Document 8  -- Topic: 4
Document 9  -- Topic: 2
Document 10  -- Topic: 4
Document 11  -- Topic: 0
Document 12  -- Topic: 5
Document 13  -- Topic: 3
Document 14  -- Topic: 2
Document 15  -- Topic: 2
Document 16  -- Topic: 0
Document 17  -- Topic: 3
Document 18  -- Topic: 4
Document 19  -- Topic: 5
Document 20  -- Topic: 0


In [29]:
for i in range(20):
    print("Document", i+1,text[i])

Document 1 also point person company transition kl gr system
Document 2 must hand full
Document 3 let talk little bit duty
Document 4 duty right
Document 5 heading whole division lot duty
Document 6 see
Document 7 perhaps people dump certain amount
Document 8 good know
Document 9 go detail
Document 10 beg
Document 11 right definite answer monday think say confidence fit well
Document 12 really
Document 13 absolutely relax
Document 14 waitress went last month
Document 15 know forget
Document 16 talking
Document 17 actually know
Document 18 ok
Document 19 right well
Document 20 yeah sure
