In [1]:
from utils import *
from transform import *

In [1]:
# --- Stage switches for the preprocessing cell ---
is_preprocess = True      # run preprocess() on the raw synopsis text
is_coref = True           # run resolve_coreferences() on each synopsis
is_char_replace = True    # run character_replacement() on each synopsis

# Step size used when grouping synopsis sentences into segments.
basic_units = 3

# Zipped raw IMDB movie-details dump.
DATA_PATH = 'data/IMDB_movie_details.json.zip'

# Output of the event-centric segmentation stage.
OUTPUT1_PATH = "data/preprocessed/movie_synopsis_segments_n{}.pickle".format(basic_units)

# Written after action extraction and written again after context extraction.
OUTPUT2_PATH = "data/preprocessed/movie_synopsis_segments_after_n{}.csv".format(basic_units)

# Dataset after MAD prediction.
OUTPUT3_PATH = "data/movie_dataset/movie_dataset_all.csv"

# Final CharMoral dataset with label and split columns.
FINAL_PATH = "data/movie_dataset/movie_dataset_total.csv"

## 1. Dataset preprocessing

1. Basic preprocessing
2. Coreference resolution
3. Character name replacement

- If you want to skip this process, download [movie_synopsis_preprocess.csv](https://drive.google.com/drive/folders/1vKi83rjJWo9-Bp2kigcY-GLi6nLAN-8-) and put it into `data/preprocessed/` folder.

In [None]:
# Unzip the raw dataset into data/.
with ZipFile(DATA_PATH, 'r') as zipObj:
    zipObj.extractall('data/')


movie_dataset=pd.read_json("data/IMDB_movie_details.json", lines=True)

# Length of every synopsis as reported by the project helper token_len().
movie_dataset["plot_synopsis_len"]=token_len(list(movie_dataset["plot_synopsis"]))

# Drop rows whose 'plot_synopsis_len' is lower than 10.
too_short=movie_dataset["plot_synopsis_len"]<10
print("number of drops: ", int(too_short.sum()))
movie_dataset=movie_dataset[~too_short].reset_index(drop=True)

# plot_synopsis preprocessing
if is_preprocess:
    movie_dataset["plot_synopsis"]=preprocess(movie_dataset["plot_synopsis"])

# Each optional stage reads the newest text column produced so far, so that
# turning one switch off does not make a later stage fail on a missing column.
text_column="plot_synopsis"

# coreference
if is_coref:
    movie_dataset["plot_synopsis_coref"]=movie_dataset[text_column].apply(resolve_coreferences)
    text_column="plot_synopsis_coref"

# character replacement
if is_char_replace:
    movie_dataset['plot_synopsis_cvt']=movie_dataset[text_column].apply(character_replacement)
else:
    # The segmentation cell always reads 'plot_synopsis_cvt'; provide it even
    # when character replacement is switched off.
    movie_dataset['plot_synopsis_cvt']=movie_dataset[text_column]

## 2. Event-Centric Story Segmentation

- If you want to skip this process, download [movie_synopsis_segments_n3.pickle](https://drive.google.com/drive/folders/1vKi83rjJWo9-Bp2kigcY-GLi6nLAN-8-) and put it into `data/preprocessed/` folder.

In [None]:
# Split every synopsis into groups of `basic_units` sentences, record the
# characters found in each group, merge groups with
# merge_segments_with_characters(), and pickle one record per movie.
final_dict=[]
for movie_idx in range(len(movie_dataset)):
    movie=movie_dataset.iloc[movie_idx]

    # Group consecutive sentences; the '.' separators are re-inserted by join.
    sentences=movie["plot_synopsis_cvt"].split(".")
    segment_texts=[
        ".".join(map(str, sentences[start:start + basic_units]))
        for start in range(0, len(sentences), basic_units)
    ]
    # Discard fragments of 10 characters or fewer.
    segment_texts=[text for text in segment_texts if len(text)>10]

    mentions_per_segment=[find_pattern_in_text(text) for text in segment_texts]
    assert len(segment_texts)==len(mentions_per_segment)

    # Main characters: the five most frequent mentions over the whole synopsis,
    # counted before per-segment de-duplication.
    character_occurrence=[name for mentions in mentions_per_segment for name in mentions]
    main_character=Counter(character_occurrence).most_common(5)
    segment_char=[list(set(mentions)) for mentions in mentions_per_segment]

    merged_segments, merged_characters = merge_segments_with_characters(segment_texts, segment_char)

    final_dict.append({
        "movie_id": movie["movie_id"],
        "genre": movie["genre"],
        # The action-parsing cell reads these two fields from every record.
        "rating": movie.get("rating"),
        "plot_synopsis_len": movie.get("plot_synopsis_len"),
        "segments": merged_segments,
        "segment_char": merged_characters,
        "main_char": main_character
    })

save_pickle(OUTPUT1_PATH, final_dict)

## 3. Action Extraction

- If you want to skip this process, download [movie_synopsis_segments_after_n3.pickle](https://drive.google.com/drive/folders/1vKi83rjJWo9-Bp2kigcY-GLi6nLAN-8-) and put it into `data/preprocessed/` folder.

In [None]:
# Run action_extraction() on every (segment, characters) pair and store the
# outputs on each movie record under 'segment_transform'.
loaded_data=load_pickle(OUTPUT1_PATH)
for record in tqdm(loaded_data, desc='transform',mininterval=0.01):
    # loaded_data is a list of per-movie dicts, so the result has to be attached
    # to each record; assigning loaded_data['segment_transform'] on the list
    # itself raises TypeError.
    record['segment_transform']=[
        action_extraction(seg, seg_char)
        for seg, seg_char in zip(record['segments'], record['segment_char'])
    ]

# Same list-of-lists the cell used to build, kept for anything that reads it.
output_dict=[record['segment_transform'] for record in loaded_data]

In [None]:
# Flatten the per-segment extraction outputs into one row per
# "<character>: <action>" line and write the table to OUTPUT2_PATH.
OUTPUT_COLUMNS=[
    'mid', 'sid', 'movie_id', 'genre', 'rating', 'plot_synopsis_len',
    'plot_synopsis_cvt', 'segment_token_len', 'segment', 'segment_char',
    'segment_action',
]

output=[]
for i, record in enumerate(loaded_data):
    for j, raw_output in enumerate(record['segment_transform']):
        # Remove the wrapper markers as whole tokens. str.strip('<start>')
        # treats its argument as a set of characters and can also eat leading
        # or trailing letters of the real text.
        cleaned=raw_output.replace('<start>','').replace('<end>','').replace('"','')

        for line in cleaned.split('\n'):
            if len(line)<=2 or ":" not in line:
                continue
            parts=line.replace(',','').split(':')

            character_name=parts[0].strip('[').strip(']')
            action=" ".join(parts[1:])

            output.append({
                'mid': i,
                'sid': j,
                'movie_id': record['movie_id'],
                'genre': record['genre'],
                # .get(): records pickled without these fields give empty
                # cells instead of a KeyError.
                'rating': record.get('rating'),
                'plot_synopsis_len': record.get('plot_synopsis_len'),
                # NOTE(review): this column carries the synopsis length, not
                # the converted text - looks like a copy-paste slip; confirm
                # the intended value before relying on it.
                'plot_synopsis_cvt': record.get('plot_synopsis_len'),
                'segment_token_len': len(record['segments'][j].split(' ')),
                'segment': record['segments'][j],
                'segment_char': character_name,
                'segment_action': action,
                })

# Explicit columns keep the schema stable even when nothing was parsed.
output_df=pd.DataFrame(output, columns=OUTPUT_COLUMNS)
# regex=False: '[' and ']' are literal brackets, not a character class.
output_df['segment_action']=(
    output_df['segment_action']
    .str.replace('[','',regex=False)
    .str.replace(']','',regex=False)
)
output_df=output_df.reset_index(drop=True)

output_df.to_csv(OUTPUT2_PATH, index=False)

## 4. Context Extraction

- If you want to skip this process:
  1. Download [movie_synopsis_segments_after_n3.pickle](https://drive.google.com/drive/folders/1vKi83rjJWo9-Bp2kigcY-GLi6nLAN-8-) and put it into the `data/preprocessed/` folder.
  2. Download [inference_n3.tsv](https://drive.google.com/drive/folders/1uu_QIRIc4snbwdyl5w3H8DEPCKSG-LZT) and put it into the `data/moral_stories_dataset/` folder.

In [None]:
loaded_data=pd.read_csv(OUTPUT2_PATH)

# One context_extraction() call per row; rows whose action text contains
# "no action" are not sent and get the placeholder 'not exist'.
segment_transform=[]
skip=0
row_fields=zip(
    loaded_data['segment_char'],
    loaded_data['segment_action'],
    loaded_data['segment'],
)
for char_name, action_text, segment_text in tqdm(
        row_fields, total=len(loaded_data), desc='context transform', mininterval=0.01):
    if "no action" not in str(action_text):
        segment_transform.append(context_extraction(char_name, action_text, segment_text))
        continue

    print(f'skip: {action_text}')
    skip+=1
    segment_transform.append('not exist')

# Exactly one entry per input row.
assert len(loaded_data)==len(segment_transform)

print(f"skip: {skip}")
loaded_data['transform']=segment_transform

In [None]:
# Post-processing: parse each 'transform' output into situation / intention /
# consequence columns.

CONTEXT_TAGS=('situation', 'intention', 'consequence')
parsed={tag: [] for tag in CONTEXT_TAGS}

for raw_output in tqdm(loaded_data['transform']):
    # Remove the wrapper markers as whole tokens; str.strip('<start>') works on
    # a character set and can also eat letters of the real text.
    cleaned=str(raw_output).replace('<start>','').replace('<end>','').replace('"','')

    # Collect the fields of this row only, so a repeated or missing tag in one
    # output cannot shift the values of other rows.
    row_fields={}
    for line in cleaned.split('\n'):
        if len(line)<=2 or ":" not in line:
            continue
        # Split on the first ':' only, so values containing ':' stay intact.
        tag, _, value=line.partition(':')
        tag=tag.strip(' []').lower()
        if tag in CONTEXT_TAGS:
            # Keep the first occurrence if a tag is repeated.
            row_fields.setdefault(tag, value)

    for tag in CONTEXT_TAGS:
        parsed[tag].append(row_fields.get(tag, 'not exist'))

situation=parsed['situation']
intention=parsed['intention']
consequence=parsed['consequence']
assert len(situation)==len(intention)==len(consequence)==len(loaded_data)

loaded_data['intention'], loaded_data['situation'], loaded_data['consequence'] = intention, situation, consequence

# Drop the 'transform' column and rename 'segment_action' to 'action'
loaded_data = loaded_data.drop(columns=['transform'])
loaded_data = loaded_data.rename(columns={'segment_action': 'action'})
# regex=False: '[' and ']' are literal brackets, not a character class.
for context_column in ('intention', 'situation', 'consequence'):
    loaded_data[context_column]=(
        loaded_data[context_column]
        .str.replace('[','',regex=False)
        .str.replace(']','',regex=False)
    )

def mask_characters(row, column, char_to_mask):
    """Replace every case-insensitive match of a pattern in one field with '[mask]'.

    Args:
        row: Mapping-like object (DataFrame row, dict, ...) indexed by `column`.
        column: Key of the field to mask.
        char_to_mask: Regular-expression pattern to replace; callers pass an
            already escaped character name.

    Returns:
        The masked text as a string, or the original value unchanged when it is
        null or when the pattern is empty.
    """
    value = row[column]
    # An empty pattern matches at every position, so re.sub would insert the
    # mask between all characters; treat it as "nothing to mask".
    if pd.isnull(value) or not char_to_mask:
        return value
    return re.sub(char_to_mask, '[mask]', str(value), flags=re.IGNORECASE)

# Replace each row's character name with '[mask]' in its four text fields.
MASKED_COLUMNS=('action', 'intention', 'situation', 'consequence')

for i in tqdm(range(len(loaded_data))):
    char_name=loaded_data.at[i, 'segment_char']
    # A missing name would be stringified to 'nan' and masked inside ordinary
    # words; a blank name would match everywhere. Leave such rows untouched.
    if pd.isnull(char_name) or not str(char_name).strip():
        continue

    char_to_mask = re.escape(str(char_name))
    row=loaded_data.iloc[i]  # built once; the four columns are independent
    for column in MASKED_COLUMNS:
        loaded_data.at[i, column] = mask_characters(row, column, char_to_mask)


In [None]:
# Write the table twice, both with the index column: a tab-separated copy used
# for morality labeling with the fine-tuned MAD model, and a comma-separated
# copy at OUTPUT2_PATH.
for export_path, csv_options in (
        ("data/preprocessed/inference_n3.tsv", {"sep": '\t'}),
        (OUTPUT2_PATH, {}),
):
    loaded_data.to_csv(export_path, index=True, **csv_options)

## 5. Morality Prediction Using MAD

- MAD fine-tuning

    ```
    bash train_cls.sh
    ```

- Inference

    ```
    bash test_cls.sh
    ```


In [None]:
# Load the MAD-scored dataset. FINAL_PATH is the file this notebook writes at
# the very end, so the scored input is expected at OUTPUT3_PATH; fall back to
# FINAL_PATH when only the final file is available.
try:
    loaded_data=pd.read_csv(OUTPUT3_PATH)
except FileNotFoundError:
    loaded_data=pd.read_csv(FINAL_PATH)

# Morality labeling:
#   -1 -> the action text contains "no action"
#    1 -> SICA >= 0.5
#    0 -> otherwise
# astype(str) keeps missing action values from raising TypeError in the
# substring test.
no_action=loaded_data['action'].astype(str).str.contains("no action", regex=False)
label=(loaded_data['SICA']>=0.5).astype(int).where(~no_action, -1)

loaded_data['label']=label

In [None]:
# Train, Valid, Test split by row position.
TRAIN_END=101686   # rows [0, TRAIN_END) -> train
VALID_END=115046   # rows [TRAIN_END, VALID_END) -> valid, the rest -> test

train=loaded_data[:TRAIN_END].reset_index(drop=True)
valid=loaded_data[TRAIN_END:VALID_END].reset_index(drop=True)
test=loaded_data[VALID_END:].reset_index(drop=True)

print(len(train), len(valid), len(test)) # 101686 13360 12255
print(len(train) + len(valid) + len(test)) # 127301
assert len(train) + len(valid) + len(test) == len(loaded_data)

train['split']='train'
valid['split']='valid'
test['split']='test'

# ignore_index=True gives the combined frame a fresh 0..n-1 index; the result
# of a bare data.reset_index(drop=True) call is thrown away.
data=pd.concat([train, valid, test], ignore_index=True)
data.head()

In [None]:
# Combined dataset: written without the index column.
data.to_csv(FINAL_PATH, index=False)

# Per-split files: written with the index column.
for split_frame, split_path in (
        (train, 'data/movie_dataset/movie_dataset_train.csv'),
        (valid, 'data/movie_dataset/movie_dataset_valid.csv'),
        (test, 'data/movie_dataset/movie_dataset_test.csv'),
):
    split_frame.to_csv(split_path, index=True)