In [320]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [321]:
import nltk

In [322]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')
"""
punkt: This NLTK resource is a pre-trained model used for tokenizing text 
into sentences or words. It's useful for breaking down paragraphs or documents into meaningful linguistic units.

stopwords: NLTK provides lists of stopwords for various languages. 
Stopwords are common words (like "and", "the", "in") that are often filtered out from text data because 
they typically do not contribute much to the meaning of the text.

wordnet: WordNet is a lexical database for the English language. 
It groups English words into sets of synonyms (synsets), provides short definitions, and records the various 
semantic relationships between these sets.

averaged_perceptron_tagger: This NLTK resource is a part-of-speech (POS) 
tagger based on the averaged perceptron algorithm. It assigns grammatical 
categories (like noun, verb, adjective) to words in a sentence, which is useful for syntax and semantic analysis.

maxent_ne_chunker: This is a named entity chunker trained on the CoNLL 2002 Dutch 
and Spanish data sets. It identifies named entities (like names of people, organizations, locations) in text.

words: This NLTK corpus contains a list of words in various languages. 
It's useful for tasks that require a large vocabulary set or dictionary.

treebank: The Penn Treebank Project provides a large annotated corpus of English, 
including syntactic and semantic annotations. It's used for training and testing various
natural language processing tools and algorithms.
"""



[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/user/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/user/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/user/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


'\npunkt: This NLTK resource is a pre-trained model used for tokenizing text \ninto sentences or words. It\'s useful for breaking down paragraphs or documents into meaningful linguistic units.\n\nstopwords: NLTK provides lists of stopwords for various languages. \nStopwords are common words (like "and", "the", "in") that are often filtered out from text data because \nthey typically do not contribute much to the meaning of the text.\n\nwordnet: WordNet is a lexical database for the English language. \nIt groups English words into sets of synonyms (synsets), provides short definitions, and records the various \nsemantic relationships between these sets.\n\naveraged_perceptron_tagger: This NLTK resource is a part-of-speech (POS) \ntagger based on the averaged perceptron algorithm. It assigns grammatical \ncategories (like noun, verb, adjective) to words in a sentence, which is useful for syntax and semantic analysis.\n\nmaxent_ne_chunker: This is a named entity chunker trained on the CoN

In [323]:
df=pd.read_csv("train_sent_emo.csv")

In [324]:
df.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"


In [325]:
list(df.columns)

['Sr No.',
 'Utterance',
 'Speaker',
 'Emotion',
 'Sentiment',
 'Dialogue_ID',
 'Utterance_ID',
 'Season',
 'Episode',
 'StartTime',
 'EndTime']

In [326]:
df.shape

(9989, 11)

In [327]:
df.dtypes

Sr No.           int64
Utterance       object
Speaker         object
Emotion         object
Sentiment       object
Dialogue_ID      int64
Utterance_ID     int64
Season           int64
Episode          int64
StartTime       object
EndTime         object
dtype: object

In [328]:
df.memory_usage()

Index              80
Sr No.          79912
Utterance       79912
Speaker         79912
Emotion         79912
Sentiment       79912
Dialogue_ID     79912
Utterance_ID    79912
Season          79912
Episode         79912
StartTime       79912
EndTime         79912
dtype: int64

In [329]:
df.isnull().sum().sum() 


0

In [330]:
"""

Data Description:

Sr No.: This is the unique index number for each row in the dataset.

Utterance: This refers to the actual spoken words of a character in a dialogue.

Speaker: This is the name of the person who spoke the utterance.

Emotion: This describes the emotional state or expression conveyed in the utterance.

Sentiment: This categorizes the overall opinion expressed in the utterance as positive, negative, or neutral.

Dialogue_ID: This is a unique identifier for each conversation or dialogue session.

Utterance_ID: This is a unique identifier for each individual utterance within a conversation.

Season: If applicable, this indicates the season number of the show or series.

Episode: If applicable, this indicates the episode number within a season.

StartTime: This is the timestamp or start time of the utterance within the episode.

EndTime: This is the timestamp or end time of the utterance within the episode.

Using Columns we can perform some operations to achecve the followning task 

We can identify which characters or speakers are the most active or have the most lines in the dialogue.
It helps us understand the emotions and opinions expressed by different characters.
Emotion and Sentiment Analysis:

We can explore how emotions and sentiments change across different episodes or seasons of the show.
It allows us to detect patterns in how sentiments or emotions evolve throughout the series.
Dialogue Structure:

Analyzing the structure and flow of dialogues based on Dialogue_ID and Utterance_ID helps us understand how conversations progress.
We can observe how dialogues develop within episodes and across multiple seasons.
Temporal Analysis:

We can investigate if there are any time-based patterns in the emotions or sentiments expressed during dialogues.
Comparing emotions or sentiments at different start and end times of episodes gives insights into temporal dynamics.
Season and Episode Patterns:

By analyzing emotions or sentiments across seasons and episodes, we can identify significant changes or trends.
It helps us track the development of characters or themes over the course of the serie
"""

'\n\nData Description:\n\nSr No.: This is the unique index number for each row in the dataset.\n\nUtterance: This refers to the actual spoken words of a character in a dialogue.\n\nSpeaker: This is the name of the person who spoke the utterance.\n\nEmotion: This describes the emotional state or expression conveyed in the utterance.\n\nSentiment: This categorizes the overall opinion expressed in the utterance as positive, negative, or neutral.\n\nDialogue_ID: This is a unique identifier for each conversation or dialogue session.\n\nUtterance_ID: This is a unique identifier for each individual utterance within a conversation.\n\nSeason: If applicable, this indicates the season number of the show or series.\n\nEpisode: If applicable, this indicates the episode number within a season.\n\nStartTime: This is the timestamp or start time of the utterance within the episode.\n\nEndTime: This is the timestamp or end time of the utterance within the episode.\n\nUsing Columns we can perform some o

In [331]:
df.duplicated().sum()

0

In [332]:
df.describe()

Unnamed: 0,Sr No.,Dialogue_ID,Utterance_ID,Season,Episode
count,9989.0,9989.0,9989.0,9989.0,9989.0
mean,5262.373511,526.549304,6.14516,4.853739,12.697167
std,3032.169169,302.464741,4.96027,2.389599,7.220392
min,1.0,0.0,0.0,1.0,1.0
25%,2641.0,261.0,2.0,3.0,6.0
50%,5267.0,531.0,5.0,5.0,12.0
75%,7891.0,795.0,9.0,7.0,19.0
max,10478.0,1038.0,23.0,9.0,25.0


In [333]:
columns_info = df.dtypes.to_dict()
print(columns_info)      

{'Sr No.': dtype('int64'), 'Utterance': dtype('O'), 'Speaker': dtype('O'), 'Emotion': dtype('O'), 'Sentiment': dtype('O'), 'Dialogue_ID': dtype('int64'), 'Utterance_ID': dtype('int64'), 'Season': dtype('int64'), 'Episode': dtype('int64'), 'StartTime': dtype('O'), 'EndTime': dtype('O')}


In [334]:
null_values = df.isnull().sum().to_dict()
print(null_values)

{'Sr No.': 0, 'Utterance': 0, 'Speaker': 0, 'Emotion': 0, 'Sentiment': 0, 'Dialogue_ID': 0, 'Utterance_ID': 0, 'Season': 0, 'Episode': 0, 'StartTime': 0, 'EndTime': 0}


In [335]:
# Describe all columns including categorical and numeric
description = df.describe(include='all').to_dict()
print(description)

{'Sr No.': {'count': 9989.0, 'unique': nan, 'top': nan, 'freq': nan, 'mean': 5262.3735108619485, 'std': 3032.1691693883513, 'min': 1.0, '25%': 2641.0, '50%': 5267.0, '75%': 7891.0, 'max': 10478.0}, 'Utterance': {'count': 9989, 'unique': 8931, 'top': 'Hey!', 'freq': 79, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}, 'Speaker': {'count': 9989, 'unique': 260, 'top': 'Joey', 'freq': 1510, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}, 'Emotion': {'count': 9989, 'unique': 7, 'top': 'neutral', 'freq': 4710, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}, 'Sentiment': {'count': 9989, 'unique': 3, 'top': 'neutral', 'freq': 4710, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}, 'Dialogue_ID': {'count': 9989.0, 'unique': nan, 'top': nan, 'freq': nan, 'mean': 526.5493042346582, 'std': 302.46474108610954, 'min': 0.0, '25%': 261.0, '50%': 531.0, '7

In [336]:
duplicate_rows=df[df.duplicated()]
print(duplicate_rows)

Empty DataFrame
Columns: [Sr No., Utterance, Speaker, Emotion, Sentiment, Dialogue_ID, Utterance_ID, Season, Episode, StartTime, EndTime]
Index: []


In [337]:
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
5,6,"Now you’ll be heading a whole division, so you...",The Interviewer,neutral,neutral,0,5,8,21,"00:16:41,126","00:16:44,337"
6,7,I see.,Chandler,neutral,neutral,0,6,8,21,"00:16:48,800","00:16:51,886"
7,8,But there’ll be perhaps 30 people under you so...,The Interviewer,neutral,neutral,0,7,8,21,"00:16:48,800","00:16:54,514"
8,9,Good to know.,Chandler,neutral,neutral,0,8,8,21,"00:16:59,477","00:17:00,478"
9,10,We can go into detail,The Interviewer,neutral,neutral,0,9,8,21,"00:17:00,478","00:17:02,719"


In [338]:
data=df.copy()

In [339]:
data[['StartTime','EndTime']].head()

Unnamed: 0,StartTime,EndTime
0,"00:16:16,059","00:16:21,731"
1,"00:16:21,940","00:16:23,442"
2,"00:16:23,442","00:16:26,389"
3,"00:16:26,820","00:16:29,572"
4,"00:16:34,452","00:16:40,917"


In [340]:
data['StartTime']=df['StartTime'].str.replace(',','.')
data['EndTime']=df['EndTime'].str.replace(',','.')

In [341]:
data

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,00:16:16.059,00:16:21.731
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,00:16:21.940,00:16:23.442
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,00:16:23.442,00:16:26.389
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,00:16:26.820,00:16:29.572
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,00:16:34.452,00:16:40.917
5,6,"Now you’ll be heading a whole division, so you...",The Interviewer,neutral,neutral,0,5,8,21,00:16:41.126,00:16:44.337
6,7,I see.,Chandler,neutral,neutral,0,6,8,21,00:16:48.800,00:16:51.886
7,8,But there’ll be perhaps 30 people under you so...,The Interviewer,neutral,neutral,0,7,8,21,00:16:48.800,00:16:54.514
8,9,Good to know.,Chandler,neutral,neutral,0,8,8,21,00:16:59.477,00:17:00.478
9,10,We can go into detail,The Interviewer,neutral,neutral,0,9,8,21,00:17:00.478,00:17:02.719


In [342]:
data['StartTime']=pd.to_datetime(data['StartTime'],format='%H:%M:%S.%f')
data['EndTime']=pd.to_datetime(data['EndTime'],format="%H:%M:%S.%f")

In [343]:


#Calculate duration and add as new column
data['Duration'] = data['EndTime'] - data['StartTime']



In [344]:
data.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Duration
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,1900-01-01 00:16:16.059,1900-01-01 00:16:21.731,00:00:05.672000
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,1900-01-01 00:16:21.940,1900-01-01 00:16:23.442,00:00:01.502000
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,1900-01-01 00:16:23.442,1900-01-01 00:16:26.389,00:00:02.947000
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,1900-01-01 00:16:26.820,1900-01-01 00:16:29.572,00:00:02.752000
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,1900-01-01 00:16:34.452,1900-01-01 00:16:40.917,00:00:06.465000


In [345]:
data['Duration_Minutes'] = data['Duration'].dt.total_seconds() / 60

In [346]:
data.drop(['Sr No.', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'Duration'],axis=1,inplace=True)

In [347]:
data.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Duration_Minutes
0,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0.094533
1,You must’ve had your hands full.,The Interviewer,neutral,neutral,0.025033
2,That I did. That I did.,Chandler,neutral,neutral,0.049117
3,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0.045867
4,My duties? All right.,Chandler,surprise,positive,0.10775


In [348]:
dfa=data.copy()
dfa=dfa.drop_duplicates()

In [349]:
print(dfa)

                                              Utterance          Speaker  \
0     also I was the point person on my company’s tr...         Chandler   
1                      You must’ve had your hands full.  The Interviewer   
2                               That I did. That I did.         Chandler   
3         So let’s talk a little bit about your duties.  The Interviewer   
4                                My duties?  All right.         Chandler   
5     Now you’ll be heading a whole division, so you...  The Interviewer   
6                                                I see.         Chandler   
7     But there’ll be perhaps 30 people under you so...  The Interviewer   
8                                         Good to know.         Chandler   
9                                 We can go into detail  The Interviewer   
10                               No don’t I beg of you!         Chandler   
11    All right then, we’ll have a definite answer f...  The Interviewer   
12          

In [350]:
dfa.isnull()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Duration_Minutes
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [351]:
dfa.isnull().sum()

Utterance           0
Speaker             0
Emotion             0
Sentiment           0
Duration_Minutes    0
dtype: int64

In [352]:
dfa.isnull().sum().sum()

0

In [353]:
dfa.columns

Index(['Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Duration_Minutes'], dtype='object')

In [354]:
dfa.shape

(9979, 5)

In [355]:
unique_speakers = dfa['Speaker'].unique()

In [356]:
print(unique_speakers)

['Chandler' 'The Interviewer' 'Joey' 'Rachel' 'Monica' 'Phoebe' 'Ross'
 'Sergei' 'Customer' 'Jade' 'Mona' 'Charlie' 'Paleontologist'
 'Professore Clerk' 'Caitlin' 'Nurse' 'Mr. Treeger' 'Carol'
 'The Casting Director' 'Emily' 'Elizabeth' 'Paul' 'The Dry Cleaner'
 'Joey and Chandler' 'Kate' 'The Director' 'Mr. Tribbiani' 'Guru Saj'
 'Wayne' 'Richard' 'Dina' 'Bobby' 'Danny' 'Krista' 'Jill' 'Doug' 'Stevens'
 'Bob' 'Mr. Franklin' 'Director' 'Janice' 'Tony' 'Peter'
 'Ticket Counter Attendant' 'Dr. Long' 'Charlton Heston' 'Joshua' 'Nancy'
 'Kim' 'Joanna' 'Cassie' 'Dr. Rhodes' 'Dr. Johnson' 'Kristen' 'Jester'
 'Sarah' 'Pete' 'The Singing Man' 'Commercial' 'Mark' 'A Female Student'
 'All' 'Cliff' 'Tag' 'Eric' 'Dr. Green' 'Mr. Heckles' 'Mr. Geller'
 'Sophie' 'Singer' 'David' 'Hitchhiker' '1st Customer' '2nd Customer'
 '3rd Customer' 'The Presenter' 'Policeman' 'Duncan' 'Jane' 'Message'
 'Gary' 'Bonnie' 'Woman' 'Leslie' 'Isabella' "Joey's Hand Twin" 'Kiki'
 'Joanne' 'Fireman No. 3' 'Susan' 'Misch

In [357]:
print(dfa['Speaker'].nunique())

260


In [358]:
dfa['Speaker'].duplicated()

0       False
1       False
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14      False
15      False
16       True
17       True
18       True
19       True
20       True
21       True
22      False
23       True
24       True
25       True
26       True
27       True
28       True
29       True
        ...  
9959     True
9960     True
9961     True
9962     True
9963     True
9964    False
9965     True
9966     True
9967     True
9968     True
9969     True
9970     True
9971     True
9972     True
9973     True
9974     True
9975     True
9976     True
9977     True
9978     True
9979     True
9980     True
9981     True
9982     True
9983     True
9984     True
9985     True
9986     True
9987     True
9988     True
Name: Speaker, Length: 9979, dtype: bool

In [359]:
dfa.shape

(9979, 5)

In [360]:
speaker=dfa.groupby('Speaker')

In [361]:
speaker.first().head()

Unnamed: 0_level_0,Utterance,Emotion,Sentiment,Duration_Minutes
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1st Customer,Everything was delicious!,joy,positive,0.028533
2nd Customer,It was. The duck in particular was superb.,joy,positive,0.0424
3rd Customer,Actually I do have one small complaint.,neutral,neutral,0.0417
A Female Student,"Yeah, what's up with that girl Monica?",neutral,neutral,0.03265
A Student,What’s happening to your accent?,surprise,negative,0.043083


In [362]:
speaker.get_group('A Student')

Unnamed: 0,Utterance,Emotion,Sentiment,Duration_Minutes
4994,What’s happening to your accent?,surprise,negative,0.043083


In [363]:
speaker.get_group('Chandler')

Unnamed: 0,Utterance,Emotion,Sentiment,Duration_Minutes
0,also I was the point person on my company’s tr...,neutral,neutral,0.094533
2,That I did. That I did.,neutral,neutral,0.049117
4,My duties? All right.,surprise,positive,0.107750
6,I see.,neutral,neutral,0.051433
8,Good to know.,neutral,neutral,0.016683
10,No don’t I beg of you!,fear,negative,0.033367
12,Really?!,surprise,positive,0.050750
21,"Hey, Mon.",neutral,neutral,0.018050
23,Do I ever.,joy,positive,0.030767
25,No way!,surprise,negative,0.017333


In [380]:
 dfa['Speaker'].drop_duplicates(inplace=True)

In [385]:
dfa.shape

(9979, 5)

In [383]:
speaker=dfa.groupby('Speaker')

In [384]:
speaker.get_group('Chandler')

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Duration_Minutes
0,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0.094533


In [366]:
speaker=dfa['Speaker'].value_counts()
print(speaker.head())


Fireman No. 1     1
Mrs. Waltham      1
Issac             1
Kate              1
Rachel/actress    1
Name: Speaker, dtype: int64


In [367]:
dfa.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Duration_Minutes
0,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0.094533
1,You must’ve had your hands full.,The Interviewer,neutral,neutral,0.025033
2,That I did. That I did.,Chandler,neutral,neutral,0.049117
3,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0.045867
4,My duties? All right.,Chandler,surprise,positive,0.10775


In [368]:
dfa['Speaker'].nunique()

260

In [369]:
dfa['Speaker']

0                   Chandler
1            The Interviewer
14                      Joey
15                    Rachel
22                    Monica
34                    Phoebe
44                      Ross
61                    Sergei
89                  Customer
94                      Jade
135                     Mona
146                  Charlie
157           Paleontologist
160         Professore Clerk
181                  Caitlin
230                    Nurse
251              Mr. Treeger
267                    Carol
269     The Casting Director
285                    Emily
302                Elizabeth
304                     Paul
319          The Dry Cleaner
322        Joey and Chandler
394                     Kate
401             The Director
428            Mr. Tribbiani
445                 Guru Saj
517                    Wayne
556                  Richard
                ...         
8675      The Acting Teacher
8708           Dr. Franzblau
8796            Mrs. Tedlock
8832          

In [370]:
Emotion=dfa.groupby('Emotion')

In [371]:
Emotion.first()

Unnamed: 0_level_0,Utterance,Speaker,Sentiment,Duration_Minutes
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,"Oh no-no-no, give me some specifics.",Joey,negative,0.039633
disgust,Just coffee! Where are we gonna hang out now?,Chandler,negative,0.079917
fear,No don’t I beg of you!,Chandler,negative,0.033367
joy,Do I ever.,Chandler,positive,0.030767
neutral,also I was the point person on my company’s tr...,Chandler,neutral,0.094533
sadness,You know? Forget it!,Rachel,negative,0.036833
surprise,My duties? All right.,Chandler,positive,0.10775


In [372]:
Emotion.size()

Emotion
anger       1108
disgust      271
fear         268
joy         1738
neutral     4708
sadness      683
surprise    1203
dtype: int64

In [373]:
Emotion.agg('max')

Unnamed: 0_level_0,Utterance,Speaker,Sentiment,Duration_Minutes
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,if you don’t get that...,Woman,negative,0.2996
disgust,so we finally get to the top of the mountain a...,Woman,negative,0.382317
fear,"‘Cause I kinda all ready told her uh, it was, ...",The Smoking Woman,negative,0.241883
joy,‘Kay!,an,positive,0.684
neutral,‘Sup? ‘Sup dude?,Young Ethan,neutral,0.40455
sadness,is for me not to see you anymore.,Young Ethan,negative,0.31905
surprise,oooooooooooooohhhhhhhhhhh,Tom,positive,0.292633


In [374]:
Emotion.agg('count')

Unnamed: 0_level_0,Utterance,Speaker,Sentiment,Duration_Minutes
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,1108,1108,1108,1108
disgust,271,271,271,271
fear,268,268,268,268
joy,1738,1738,1738,1738
neutral,4708,4708,4708,4708
sadness,683,683,683,683
surprise,1203,1203,1203,1203


In [375]:
Emotion.nunique()

Unnamed: 0_level_0,Utterance,Speaker,Sentiment,Duration_Minutes
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,1065,82,1,536
disgust,270,32,1,194
fear,264,32,1,207
joy,1589,121,1,689
neutral,4211,209,1,1135
sadness,668,55,1,382
surprise,1006,87,2,517


In [376]:
Emotion.groups

{'anger': Int64Index([  67,   71,   72,   73,   80,   81,   82,   83,   85,  106,
             ...
             9864, 9878, 9880, 9882, 9888, 9940, 9942, 9945, 9957, 9962],
            dtype='int64', length=1108),
 'disgust': Int64Index([  27,  105,  125,  127,  133,  170,  223,  226,  229,  365,
             ...
             9734, 9805, 9807, 9849, 9850, 9889, 9976, 9978, 9982, 9983],
            dtype='int64', length=271),
 'fear': Int64Index([  10,   17,  129,  132,  178,  180,  191,  195,  250,  285,
             ...
             9452, 9456, 9458, 9459, 9515, 9552, 9736, 9762, 9768, 9860],
            dtype='int64', length=268),
 'joy': Int64Index([  23,   31,   33,   44,   51,   52,   53,   62,   64,   68,
             ...
             9910, 9919, 9920, 9928, 9935, 9938, 9946, 9947, 9972, 9988],
            dtype='int64', length=1738),
 'neutral': Int64Index([   0,    1,    2,    3,    5,    6,    7,    8,    9,   11,
             ...
             9970, 9971, 9973, 9974, 9977, 998

In [378]:
dfa.groupby(['Emotion','Sentiment'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6541d9cd68>

In [None]:
sentiment=dfa.groupby('Sentiment')