# scene_clustering
This notebook contains initial code for clustering frames into shots, identifying the A/B/A/B shot pattern, and using the image classifier model to check whether those frames are MCUs (medium close-ups).

In [1]:
import sys
import os
sys.path.append('site-packages') # manually put all packages/libraries into this folder
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras import models
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering

Using TensorFlow backend.


## Clustering
### For POC, designating a specific scene's worth of frames

In [47]:
# Scene selection: `film` is the sub-folder / file-name prefix of the extracted frames and
# `frame_choice` is the list of frame numbers that make up the scene to analyse.
# Only one (film, frame_choice) pair is active; the commented pairs are scenes tried earlier.

# film = 'booksmart'
# frame_choice = list(range(1001, 1163)) # good example for Booksmart!!! 6 clusters, 2500 distance_threshold

# film = 'hobbs_shaw'
# frame_choice = list(range(701, 858)) # Hobbs and Shaw, Jason Statham vs. Helen Mirren, 2,500

# film = 'parasite'
# frame_choice = list(range(1666, 1836)) # Parasite, tough because Mrs. Park and Jessica are both right-oc

film = 'parasite'
frame_choice = list(range(6687, 6777)) # frames 6687-6776 (range end is exclusive). NOTE(review): the previous note here was a copy of the 1666-1836 scene's description - confirm what this scene shows

In [48]:
# Each film's extracted frames live in their own sub-folder of dialogue_frames/.
dialogue_folder = os.path.join('dialogue_frames', film)

# Report how many frame images are on disk versus how many were picked for this scene.
available_frames = os.listdir(dialogue_folder)
print('There are', len(available_frames), 'images in the folder')
print('Selected', len(frame_choice), 'of those frames')

There are 7782 images in the folder
Selected 90 of those frames


In [49]:
# Load VGG16 with ImageNet weights and include_top=False, so predict() returns
# convolutional feature maps instead of class scores.
model = VGG16(weights='imagenet', include_top=False)
model.summary()

# One flattened feature vector per selected frame, in frame_choice order.
vgg16_feature_list = []

for frame_number in frame_choice:
    # Frame files are named <film>_frame<number>.jpg inside the film's folder.
    img_path = os.path.join(dialogue_folder, film + '_frame' + str(frame_number) + '.jpg')

    # Resize to 224x224 on load, add a leading batch axis, then apply VGG16's input preprocessing.
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)

    # Flatten the feature maps into a 1-D vector so the frames can be clustered later.
    vgg16_feature = model.predict(img_data)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_feature_list.append(vgg16_feature_np.flatten())

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [50]:
# Stack the per-frame feature vectors into one array (one row per frame) and
# display its shape as a sanity check.
vgg16_feature_list_np = np.array(vgg16_feature_list)
vgg16_feature_list_np.shape

(90, 25088)

In [105]:
# Hierarchical (agglomerative) clustering of the frame features. n_clusters=None means the
# number of clusters is not fixed up front; it follows from distance_threshold instead.
hac = AgglomerativeClustering(n_clusters=None, distance_threshold=2900)
hac.fit(vgg16_feature_list_np)

# Show how many clusters were found, then the cluster label assigned to each frame.
print('Number of clusters:', hac.n_clusters_)
print(hac.labels_)

Number of clusters: 5
[1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 0 0 0 0 4 4 4 4
 4 4 4 4 4 4 4 4 0 0 0 0 0 0 4 4 1 1 1 0 0 0 0 4 4 0 0 0 0 3 3 3 3 3 3 3 3
 3 3 3 3 0 0 3 3 3 3 3 3 0 0 0 0]


## Load Saved Model and Identify MCUs

In [13]:
# Load the previously saved classifier from disk (path is relative to the working directory).
tuned_model = models.load_model('saved_models/tuned_model')

In [94]:
# Load every selected frame as a 128x128 grayscale array, keeping frame_choice order.
image_list = [
    img_to_array(
        load_img(
            dialogue_folder + '/' + film + '_frame' + str(frame_number) + '.jpg',
            target_size=(128, 128),
            color_mode='grayscale',
        )
    )
    for frame_number in frame_choice
]

In [95]:
# Stack the loaded frames into a single batch array and ask the tuned model for a
# class label per frame.
# NOTE(review): predict_classes is only available in older Keras releases (it was removed
# from TensorFlow's Keras in 2.6) - confirm the pinned version before upgrading.
image_array = np.array(image_list)
y_pred = tuned_model.predict_classes(image_array)

In [96]:
# Side-by-side listing: frame number, cluster label, predicted class (first element of each prediction row).
for frame, cluster, prediction in zip(frame_choice, hac.labels_, y_pred):
    print(frame, cluster, prediction[0])

6687 1 0
6688 1 0
6689 1 0
6690 1 0
6691 1 1
6692 1 0
6693 2 0
6694 2 0
6695 2 0
6696 2 0
6697 2 0
6698 2 0
6699 2 1
6700 2 1
6701 2 1
6702 2 0
6703 2 1
6704 2 0
6705 2 0
6706 2 1
6707 2 0
6708 2 1
6709 2 1
6710 2 1
6711 2 1
6712 4 1
6713 4 1
6714 4 1
6715 4 1
6716 0 1
6717 0 1
6718 0 1
6719 0 1
6720 4 1
6721 4 1
6722 4 1
6723 4 1
6724 4 1
6725 4 1
6726 4 0
6727 4 1
6728 4 1
6729 4 1
6730 4 0
6731 4 0
6732 0 1
6733 0 1
6734 0 1
6735 0 1
6736 0 1
6737 0 1
6738 4 1
6739 4 1
6740 1 1
6741 1 1
6742 1 1
6743 0 0
6744 0 1
6745 0 1
6746 0 1
6747 4 1
6748 4 1
6749 0 1
6750 0 1
6751 0 1
6752 0 1
6753 3 1
6754 3 1
6755 3 1
6756 3 1
6757 3 1
6758 3 1
6759 3 1
6760 3 1
6761 3 1
6762 3 1
6763 3 1
6764 3 1
6765 0 1
6766 0 1
6767 3 1
6768 3 1
6769 3 1
6770 3 1
6771 3 1
6772 3 1
6773 0 1
6774 0 1
6775 0 1
6776 0 1


In [97]:
# Unwrap each prediction row into its first (only used) element, giving one label per frame.
y_pred_values = [row[0] for row in y_pred]

In [98]:
# One row per selected frame: its frame number, the cluster it was assigned to,
# and the classifier's label for it (column 'mcu').
scene_df = pd.DataFrame(zip(frame_choice, hac.labels_, y_pred_values), columns=['frame_file', 'cluster', 'mcu'])
scene_df

Unnamed: 0,frame_file,cluster,mcu
0,6687,1,0
1,6688,1,0
2,6689,1,0
3,6690,1,0
4,6691,1,1
...,...,...,...
85,6772,3,1
86,6773,0,1
87,6774,0,1
88,6775,0,1


In [101]:
# Mean of the 'mcu' column per cluster, i.e. the share of frames labelled 1 in each cluster.
# Iterate over the cluster ids actually present instead of a hard-coded range(0, 5):
# the number of clusters depends on the scene and on distance_threshold, so a fixed
# range would skip clusters (or print NaN for missing ones) on other runs.
for cluster_id in sorted(scene_df['cluster'].unique()):
    print(cluster_id, scene_df.loc[scene_df['cluster'] == cluster_id, 'mcu'].mean())

0 0.9583333333333334
1 0.4444444444444444
2 0.47368421052631576
3 1.0
4 0.85


In [102]:
# Look up a single frame by its frame number.
# NOTE(review): 750 is outside the currently selected frames (6687-6776), so this returns
# no rows; it looks like a leftover from the hobbs_shaw scene (701-857) - update or remove.
scene_df.loc[scene_df['frame_file'] == 750]

Unnamed: 0,frame_file,cluster,mcu


In [103]:
# Show every frame that was assigned to cluster 2.
scene_df.loc[scene_df['cluster'] == 2]

Unnamed: 0,frame_file,cluster,mcu
6,6693,2,0
7,6694,2,0
8,6695,2,0
9,6696,2,0
10,6697,2,0
11,6698,2,0
12,6699,2,1
13,6700,2,1
14,6701,2,1
15,6702,2,0


In [26]:
# Let pandas display up to 200 rows before truncating a DataFrame's output.
pd.options.display.max_rows=200

In [27]:
# Show only the frames the classifier labelled 1 in the 'mcu' column.
# NOTE(review): the saved output below lists frames 701+, i.e. it comes from an earlier run
# on a different scene - re-run this cell to refresh it.
scene_df.loc[scene_df['mcu'] == 1]

Unnamed: 0,frame_file,cluster,mcu
0,701,1,1
3,704,1,1
4,705,0,1
7,708,4,1
8,709,4,1
9,710,4,1
18,719,0,1
19,720,0,1
20,721,0,1
21,722,0,1


### Initial pass at scene pattern algorithm

In [104]:

# Alternation tracker state:
#   prev_cluster      - cluster of the previous frame; refreshed on every frame
#   stored_cluster    - cluster that was on screen before the most recent cluster change
#   alternate_counter - incremented when a change returns to stored_cluster, reset to 1 otherwise
# 1000 is a placeholder meaning "nothing seen yet".
prev_cluster = stored_cluster = 1000
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if stored_cluster == 1000:
        # Warm-up: nothing stored yet, so just remember what preceded this frame.
        stored_cluster = prev_cluster
    elif cluster != prev_cluster:
        # Cluster changed: returning to the stored cluster extends the A/B/A/B run,
        # any other cluster starts a new run.
        alternate_counter = alternate_counter + 1 if cluster == stored_cluster else 1
        stored_cluster = prev_cluster
    # Same layout as print(a, '\t', b, ...): fields separated by ' \t '.
    row = (frame_file, mcu_flag, cluster, prev_cluster, stored_cluster, alternate_counter)
    print(' \t '.join(str(field) for field in row))
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
6687 	 0 	 1 	 1000 	 1000 	 1
6688 	 0 	 1 	 1 	 1 	 1
6689 	 0 	 1 	 1 	 1 	 1
6690 	 0 	 1 	 1 	 1 	 1
6691 	 1 	 1 	 1 	 1 	 1
6692 	 0 	 1 	 1 	 1 	 1
6693 	 0 	 2 	 1 	 1 	 1
6694 	 0 	 2 	 2 	 1 	 1
6695 	 0 	 2 	 2 	 1 	 1
6696 	 0 	 2 	 2 	 1 	 1
6697 	 0 	 2 	 2 	 1 	 1
6698 	 0 	 2 	 2 	 1 	 1
6699 	 1 	 2 	 2 	 1 	 1
6700 	 1 	 2 	 2 	 1 	 1
6701 	 1 	 2 	 2 	 1 	 1
6702 	 0 	 2 	 2 	 1 	 1
6703 	 1 	 2 	 2 	 1 	 1
6704 	 0 	 2 	 2 	 1 	 1
6705 	 0 	 2 	 2 	 1 	 1
6706 	 1 	 2 	 2 	 1 	 1
6707 	 0 	 2 	 2 	 1 	 1
6708 	 1 	 2 	 2 	 1 	 1
6709 	 1 	 2 	 2 	 1 	 1
6710 	 1 	 2 	 2 	 1 	 1
6711 	 1 	 2 	 2 	 1 	 1
6712 	 1 	 4 	 2 	 2 	 1
6713 	 1 	 4 	 4 	 2 	 1
6714 	 1 	 4 	 4 	 2 	 1
6715 	 1 	 4 	 4 	 2 	 1
6716 	 1 	 0 	 4 	 4 	 1
6717 	 1 	 0 	 0 	 4 	 1
6718 	 1 	 0 	 0 	 4 	 1
6719 	 1 	 0 	 0 	 4 	 1
6720 	 1 	 4 	 0 	 0 	 2
6721 	 1 	 4 	 4 	 0 	 2
6722 	 1 	 4 	 4 	 0 	 2
6723 	 1 	 4 	 4 	 0 	 2
6724 	 1 	 4 	 4 	 0 	 2
6

### Same algorithm, but only displaying frames where alternate_counter >= 3

In [30]:
# Same alternation tracker as the unfiltered pass, but a frame is printed only once
# the back-and-forth counter has reached 3.
# 1000 is a placeholder meaning "nothing seen yet".
prev_cluster = stored_cluster = 1000
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if stored_cluster == 1000:
        # Warm-up: nothing stored yet, so just remember what preceded this frame.
        stored_cluster = prev_cluster
    elif cluster != prev_cluster:
        # Cluster changed: returning to the stored cluster extends the run, anything else restarts it.
        alternate_counter = alternate_counter + 1 if cluster == stored_cluster else 1
        stored_cluster = prev_cluster
    if not alternate_counter < 3:
        # Same layout as print(a, '\t', b, ...): fields separated by ' \t '.
        row = (frame_file, mcu_flag, cluster, prev_cluster, stored_cluster, alternate_counter)
        print(' \t '.join(str(field) for field in row))
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
1732 	 1 	 2 	 1 	 1 	 3
1733 	 1 	 2 	 2 	 1 	 3
1734 	 1 	 2 	 2 	 1 	 3
1735 	 1 	 2 	 2 	 1 	 3
1736 	 1 	 2 	 2 	 1 	 3
1737 	 1 	 2 	 2 	 1 	 3
1738 	 1 	 2 	 2 	 1 	 3
1739 	 0 	 2 	 2 	 1 	 3
1740 	 0 	 2 	 2 	 1 	 3
1741 	 0 	 2 	 2 	 1 	 3
1742 	 1 	 2 	 2 	 1 	 3
1743 	 1 	 2 	 2 	 1 	 3
1744 	 1 	 2 	 2 	 1 	 3
1745 	 1 	 2 	 2 	 1 	 3
1746 	 1 	 1 	 2 	 2 	 4
1747 	 0 	 1 	 1 	 2 	 4
1748 	 0 	 1 	 1 	 2 	 4
1752 	 1 	 8 	 1 	 1 	 3
1753 	 1 	 8 	 8 	 1 	 3
1754 	 1 	 8 	 8 	 1 	 3
1755 	 1 	 8 	 8 	 1 	 3
1756 	 1 	 8 	 8 	 1 	 3
1775 	 1 	 9 	 8 	 8 	 3
1776 	 1 	 9 	 9 	 8 	 3
1777 	 1 	 9 	 9 	 8 	 3
1778 	 1 	 9 	 9 	 8 	 3
1779 	 1 	 8 	 9 	 9 	 4
1780 	 1 	 8 	 8 	 9 	 4
1781 	 1 	 8 	 8 	 9 	 4
1782 	 1 	 8 	 8 	 9 	 4
1783 	 1 	 8 	 8 	 9 	 4
1784 	 1 	 8 	 8 	 9 	 4
1785 	 1 	 8 	 8 	 9 	 4
1786 	 1 	 8 	 8 	 9 	 4
1787 	 1 	 8 	 8 	 9 	 4
1788 	 0 	 8 	 8 	 9 	 4
1789 	 1 	 8 	 8 	 9 	 4
1790 	 0 	 8 	 8 	 9 	 4
1791 	 

### Allow for One Extra Storage (for inserts)

In [114]:
# Variant of the alternation tracker that remembers the last two stored clusters instead
# of one, so a cut to a cluster held in either slot counts as an alternation.
# 1000 and 1001 are placeholder values filling the two slots before any real cluster is stored.
prev_cluster = 1000
stored_cluster = [1000, 1001]
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    # Warm-up: while the 1000 placeholder is still in the list, push prev_cluster on every
    # frame (whether or not the cluster changed) and drop the oldest slot.
    # NOTE(review): this can leave both slots holding the same cluster (e.g. [1, 1]) and skips
    # the counter logic for changes during warm-up - confirm this is intended.
    if 1000 in stored_cluster:
        stored_cluster.append(prev_cluster)
        del stored_cluster[0]
    elif cluster != prev_cluster:
        # Cluster changed: it counts as an alternation if the new cluster is in either slot;
        # otherwise the run restarts at 1.
        if cluster in stored_cluster:
            alternate_counter += 1
        else:
            alternate_counter = 1
        # Slide the two-slot window: newest cluster appended, oldest removed.
        stored_cluster.append(prev_cluster)
        del stored_cluster[0]
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    # prev_cluster is refreshed on every frame, after the row has been printed.
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
6687 	 0 	 1 	 1000 	 [1001, 1000] 	 1
6688 	 0 	 1 	 1 	 [1000, 1] 	 1
6689 	 0 	 1 	 1 	 [1, 1] 	 1
6690 	 0 	 1 	 1 	 [1, 1] 	 1
6691 	 1 	 1 	 1 	 [1, 1] 	 1
6692 	 0 	 1 	 1 	 [1, 1] 	 1
6693 	 0 	 2 	 1 	 [1, 1] 	 1
6694 	 0 	 2 	 2 	 [1, 1] 	 1
6695 	 0 	 2 	 2 	 [1, 1] 	 1
6696 	 0 	 2 	 2 	 [1, 1] 	 1
6697 	 0 	 2 	 2 	 [1, 1] 	 1
6698 	 0 	 2 	 2 	 [1, 1] 	 1
6699 	 1 	 2 	 2 	 [1, 1] 	 1
6700 	 1 	 2 	 2 	 [1, 1] 	 1
6701 	 1 	 2 	 2 	 [1, 1] 	 1
6702 	 0 	 2 	 2 	 [1, 1] 	 1
6703 	 1 	 2 	 2 	 [1, 1] 	 1
6704 	 0 	 2 	 2 	 [1, 1] 	 1
6705 	 0 	 2 	 2 	 [1, 1] 	 1
6706 	 1 	 2 	 2 	 [1, 1] 	 1
6707 	 0 	 2 	 2 	 [1, 1] 	 1
6708 	 1 	 2 	 2 	 [1, 1] 	 1
6709 	 1 	 2 	 2 	 [1, 1] 	 1
6710 	 1 	 2 	 2 	 [1, 1] 	 1
6711 	 1 	 2 	 2 	 [1, 1] 	 1
6712 	 1 	 4 	 2 	 [1, 2] 	 1
6713 	 1 	 4 	 4 	 [1, 2] 	 1
6714 	 1 	 4 	 4 	 [1, 2] 	 1
6715 	 1 	 4 	 4 	 [1, 2] 	 1
6716 	 1 	 0 	 4 	 [2, 4] 	 1
6717 	 1 	 0 	 0 	 [2, 4] 	 1
6718 	 1 	 0 	 