# 전처리 과정 설명

[MSRVTT 데이터 설명]
데이터 구성: 각 video는 대략 10~20초 사이의 영상이며, 20개의 captions(sentences)로 구성됨.

1. Sentiment Analysis
* Sentiment Analysis를 통해 한 개의 video에서 sentiment 정보가 존재하는지 판단. $\rightarrow$ [nltk.sentiment.SentimentIntensityAnalyzer](https://www.nltk.org/api/nltk.sentiment.SentimentIntensityAnalyzer.html?highlight=sentimentintensity) 활용.
* 판단 방식은 compound sentiment score가 0.6 이상이면 긍정(positive) 정보라고 판단하고, -0.6 이하면 부정(negative) 정보라고 판단.
* 결과물: 7,010 개의 video에서 4,802 개의 데이터로 정제.

2. Assigning Emotional Information
* Video 정제 과정을 마친 후(1번 과정), 각 video에서 각 caption이 어떤 emotional 정보를 가지고 있는지 판단. $\rightarrow$ [nrclex.NRCLex](https://pypi.org/project/NRCLex/) 활용.
* NRCLex는 사전에 정의된 dictionary 기반 scoring 방식으로 문장에 존재하는 8개의 emotion 정보를 counting. 추가로 positive와 negative 정보도 counting.
* 각 caption에 대해 
* 해당 과정을 통해 총 11개의 추가 columns이 생성됨.




In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from tqdm import tqdm

# Data Load

In [3]:
# Load the data
mrsvtt_train9k_dir = "msrvtt_ret_train9k.json"
mrsvtt_test1k_dir = "msrvtt_ret_test1k.json"

with open(mrsvtt_train9k_dir, 'r') as f:
    mrsvtt_train9k_json = json.load(f)

with open(mrsvtt_test1k_dir, 'r') as f:
    mrsvtt_test1k_json = json.load(f)

categories = [
    "music",
    "people",
    "gaming",
    "sports/actions",
    "news/events/politics",
    "education",
    "tv shows",
    "movie/comedy",
    "animation",
    "vehicles/autos",
    "howto",
    "travel",
    "science/technology",
    "animals/pets",
    "kids/family",
    "documentary",
    "food/drink",
    "cooking",
    "beauty/fashion",
    "advertisement"
]


In [4]:
# Annotation data
train9k = pd.DataFrame(mrsvtt_train9k_json)
test1k = pd.DataFrame(mrsvtt_test1k_json)
train9k['video_id'] = train9k['video'].apply(lambda x: x.split('.')[0])
test1k['video_id'] = test1k['video'].apply(lambda x: x.split('.')[0])
print(train9k.shape, test1k.shape)

(180000, 4) (1000, 4)


In [10]:
test1k.head()

Unnamed: 0,video,caption,duration,video_id
0,video9770.mp4,a person is connecting something to system,10.8,video9770
1,video9771.mp4,a little girl does gymnastics,13.78,video9771
2,video7020.mp4,a woman creating a fondant baby and flower,11.31,video7020
3,video9773.mp4,a boy plays grand theft auto 5,14.52,video9773
4,video7026.mp4,a man is giving a review on a vehicle,13.62,video7026


In [5]:
merge_caption = lambda x: '. '.join(x)

#  Group by each video_id
train_video_text = train9k.groupby('video_id').aggregate('caption').apply(merge_caption)
test_video_text = test1k.groupby('video_id').aggregate('caption').apply(merge_caption)
train_video_text = pd.DataFrame(train_video_text).reset_index()
test_video_text = pd.DataFrame(test_video_text).reset_index()
print(train_video_text.shape, test_video_text.shape)

(9000, 2) (1000, 2)


In [6]:
train_video_text.head()

Unnamed: 0,video_id,caption
0,video0,a car is shown. a group is dancing. a man driv...
1,video1,in a kitchen a woman adds different ingredient...
2,video10,a man holds two dogs. a man introducing he two...
3,video100,a basset hound sits outside a door. a breed do...
4,video1000,a woman is wearing a costume. a woman talking ...


In [19]:
test_video_text.head()

Unnamed: 0,video_id,caption
0,video7020,a woman creating a fondant baby and flower
1,video7021,baseball player hits ball
2,video7024,little pet shop cat getting a bath and washed ...
3,video7025,a naked child runs through a field
4,video7026,a man is giving a review on a vehicle


In [7]:
test_video_text.shape

(1000, 2)

# Data Preprocessing

## 1. Extract Emotional Data using Sentiment Analysis

In [8]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ones/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Functions

In [9]:
def extract_emotion_manually(df, text=None):
    # assign the default text
    if text is None:
        text = "happy|sad|afraid|fear|surprise|joy|disgust|annoy| anger|angry|" \
                "excite|excited|exciting|scare|scared|scary|fright|frighten|frightened|frightening" \
                "|fearful|fearless|fearfully"
    # extract the data that contains the text
    manual_df = df[df['caption'].str.contains(text)]
    print("Number of manually selecting data:", len(manual_df))
    return manual_df


def extract_emotional_data(df, sent_bound=0.6, manual=False):
    # Initialize SentimentIntensityAnalyzer
    sia = SentimentIntensityAnalyzer()
    emotion_df = pd.DataFrame()

    # extract the emotional data using the sentiment analysis
    for idx in tqdm(range(len(df))):
        sentence = df.caption[idx]
        # calculate the sentiment score
        sentiment_score = sia.polarity_scores(sentence)
        # determine the emotional data: compound score > 0.6 (positive) or < -0.6 (negative)
        if sentiment_score['compound'] > sent_bound or sentiment_score['compound'] < -sent_bound:
            emotion_df = pd.concat([emotion_df, df.iloc[idx:idx+1, :]])

    print("Number of emotional data:", len(emotion_df))

    # extract the data that contains predefined emotion words
    if manual:
        manual_df  = extract_emotion_manually(df)
        # extract the data that is not in the emotion_df
        if "sen_id" in manual_df.columns:
            only_manual_df = pd.merge(manual_df, emotion_df, on='sen_id', how='outer', indicator=True).query('_merge=="left_only"')
            only_manual_df = only_manual_df.drop(columns=["caption_y", "video_id_y", "_merge"]).rename(columns={"caption_x": "caption", "video_id_x": "video_id"})
        else:
            only_manual_df = pd.merge(manual_df, emotion_df, on='video_id', how='outer', indicator=True).query('_merge=="left_only"')
            only_manual_df = only_manual_df.drop(columns=["caption_y", "_merge"]).rename(columns={"caption_x": "caption"})
        print("Number of data only in manual data:", len(only_manual_df))
        emotion_df = pd.concat([emotion_df, only_manual_df])
        print("Toal number of emotional data:", len(emotion_df))
        return emotion_df

    return emotion_df

In [12]:
# video
train_df = extract_emotional_data(train_video_text, sent_bound=0.6, manual=True)
emotion_test_df = extract_emotional_data(test_video_text, sent_bound=0.6, manual=True)
print(train_df.shape, emotion_test_df.shape)

  0%|          | 0/9000 [00:00<?, ?it/s]

100%|██████████| 9000/9000 [00:13<00:00, 654.20it/s]


Number of emotional data: 5984
Number of manually selecting data: 1019
Number of data only in manual data: 153
Toal number of emotional data: 6137


100%|██████████| 1000/1000 [00:00<00:00, 7163.53it/s]

Number of emotional data: 29
Number of manually selecting data: 7
Number of data only in manual data: 5
Toal number of emotional data: 34
(6137, 2) (34, 2)





In [27]:
num_emo_test = emotion_test_df.shape[0]
no_emotion_test_df = test_video_text[~test_video_text.video_id.isin(emotion_test_df.video_id)].sample(num_emo_test)
test_df = pd.concat([emotion_test_df, no_emotion_test_df])
test_df

Unnamed: 0,video_id,caption
15,video7112,while other friends too try and hitting the ba...
21,video7118,a young girl in a horror movie is haunted
66,video7200,a female soccer player accepts a reward while ...
80,video7216,a woman giving skin care tips
137,video7410,vladmir putin talks on the news about the figh...
...,...,...
791,video9323,it is the video of military men
802,video9334,in game footage of a mine craft character walk...
839,video9450,a foreign military themed show
160,video7500,a soccer team walking out on the field


In [31]:
emotion_test = test1k[test1k.video_id.isin(test_df.video_id)].loc[:, ['video', 'caption', 'duration']]
emotion_test.sort_values(by='video', inplace=True)
emotion_test

Unnamed: 0,video,caption,duration
37,video7112.mp4,while other friends too try and hitting the ba...,18.35
31,video7118.mp4,a young girl in a horror movie is haunted,13.32
680,video7144.mp4,a group of women are rubbing oil and milk all ...,11.60
226,video7168.mp4,he is playing with ball,11.18
841,video7200.mp4,a female soccer player accepts a reward while ...,28.93
...,...,...,...
579,video9811.mp4,a group of actors sit in a control room and th...,20.44
584,video9814.mp4,a scene from spongebob squarepants where the t...,11.80
586,video9816.mp4,a little girl talking to her and is scared,13.35
349,video9825.mp4,an intelligent man with glasses talk about cer...,18.81


* save the data

In [34]:
# save data to json and csv
emotion_test.to_json('emotion_test_df.json', orient='records', lines=False)
emotion_test.to_csv('emotion_test_df.csv', index=False)

* check the score

In [35]:
# sentiment score
sia = SentimentIntensityAnalyzer()

count = 0
for text in emotion_test.caption:
    if count > 10:
        break
    print(text)
    print(sia.polarity_scores(text))
    print()
    count += 1

while other friends too try and hitting the basket another is eager to achieve his fourth successful basket in basketball
{'neg': 0.0, 'neu': 0.644, 'pos': 0.356, 'compound': 0.8555}

a young girl in a horror movie is haunted
{'neg': 0.576, 'neu': 0.424, 'pos': 0.0, 'compound': -0.7783}

a group of women are rubbing oil and milk all over a woman
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

he is playing with ball
{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compound': 0.2023}

a female soccer player accepts a reward while being cheered on by the crowd
{'neg': 0.0, 'neu': 0.492, 'pos': 0.508, 'compound': 0.8519}

a woman giving skin care tips
{'neg': 0.0, 'neu': 0.349, 'pos': 0.651, 'compound': 0.6808}

a tv shows review program hosts discuss about the performance and staying on air of star trek
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

vladmir putin talks on the news about the fight against terrorism
{'neg': 0.444, 'neu': 0.556, 'pos': 0.0, 'compound': -0.802}

jolly 

# 2. Assign Specific Emotion

In [32]:
from nrclex import NRCLex


emostion_list = [
    "joy",
    "trust",
    "fear",
    "surprise",
    "sadness",
    "disgust",
    "anger",
    "anticipation"
]


### Functions

In [33]:
def create_sentiment_columns(df):
    """ Create sentiment columns using NRC lexicon
        : positive, negative, neutral
    """
    df = df.reset_index(drop=True)
    for idx in tqdm(range(df.shape[0])):
        emotion_counts = NRCLex(df.caption.iloc[idx]).raw_emotion_scores
        # positive case
        if 'positive' in emotion_counts.keys():
            df.loc[idx, 'positive'] = emotion_counts['positive']
        # negative case
        if 'negative' in emotion_counts.keys():
            df.loc[idx, 'negative'] = emotion_counts['negative']
        # neutral case
        if 'positive' not in emotion_counts.keys() and 'negative' not in emotion_counts.keys():
            df.loc[idx, 'neutral'] = 1

    df.fillna({'positive': 0, 'negative': 0, 'neutral': 0}, inplace=True)
    return df

def create_emotion_columns(df):
    """ Create emotion columns using NRC lexicon
        : anger, anticipation, disgust, fear, joy, sadness, surprise, trust
    """
    df = df.reset_index(drop=True)
    for idx in tqdm(range(df.shape[0])):
        emotion_counts = NRCLex(df.caption.iloc[idx]).raw_emotion_scores
        # create the emotion columns
        for emo in emostion_list:
            if emo in emotion_counts.keys():
                df.loc[idx, emo] = emotion_counts[emo]

    df.fillna({emo: 0 for emo in emostion_list}, inplace=True)
    return df

In [39]:
emotion_test_df = pd.read_csv("emotion_test_df.csv")
emotion_test_df['video_id'] = emotion_test_df.video.apply(lambda x: x.split('.')[0])
emotion_test_df.head()

Unnamed: 0,video,caption,duration,video_id
0,video7112.mp4,while other friends too try and hitting the ba...,18.35,video7112
1,video7118.mp4,a young girl in a horror movie is haunted,13.32,video7118
2,video7144.mp4,a group of women are rubbing oil and milk all ...,11.6,video7144
3,video7168.mp4,he is playing with ball,11.18,video7168
4,video7200.mp4,a female soccer player accepts a reward while ...,28.93,video7200


In [40]:
# create the sentiment columns
emotion_test_df = create_sentiment_columns(emotion_test_df)

  0%|          | 0/68 [00:00<?, ?it/s]

100%|██████████| 68/68 [00:00<00:00, 627.98it/s]


In [41]:
# create the emotion columns
emotion_test_df = create_emotion_columns(emotion_test_df)

100%|██████████| 68/68 [00:00<00:00, 705.94it/s]


In [43]:
emotion_test_df.head()

Unnamed: 0,video,caption,duration,video_id,positive,negative,neutral,joy,trust,surprise,anticipation,fear,sadness,disgust,anger
0,video7112.mp4,while other friends too try and hitting the ba...,18.35,video7112,4.0,0.0,0.0,4.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0
1,video7118.mp4,a young girl in a horror movie is haunted,13.32,video7118,1.0,2.0,0.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,1.0
2,video7144.mp4,a group of women are rubbing oil and milk all ...,11.6,video7144,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,video7168.mp4,he is playing with ball,11.18,video7168,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,video7200.mp4,a female soccer player accepts a reward while ...,28.93,video7200,2.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [44]:
emotion_test_df.to_csv('emotion_test_df.csv', index=False)
emotion_test_df.to_json('emotion_test_df.json', orient='records', lines=False)

In [45]:
with open('emotion_test_df.json', 'r') as f:
    emotion_test_json = json.load(f)