In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [32]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
from tqdm import tqdm
import pickle
import sys
sys.path.append('..')
import library

In [3]:
import subprocess
import shlex
import json

def findVideoMetada(pathToInputVideo):
    """Return the ffprobe stream description of a video file.

    Runs ``ffprobe -show_streams`` on the file and parses its JSON output.

    Args:
        pathToInputVideo: path of the media file to inspect.

    Returns:
        dict: the ffprobe properties (e.g. 'width', 'height', 'bit_rate') of
        the first stream whose 'codec_type' is 'video'. If the file reports no
        video stream, the first stream of any type is returned instead.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        KeyError: the ffprobe output has no 'streams' entry.
        IndexError: the file has no streams at all.
    """
    # Passing argv as a list hands the path to ffprobe verbatim, whatever
    # spaces or quotes it contains.
    args = ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams',
            pathToInputVideo]
    streams = json.loads(subprocess.check_output(args).decode('utf-8'))['streams']

    # Stream 0 is not necessarily the video track (it can be audio), so pick
    # the video stream explicitly.
    for stream in streams:
        if stream.get('codec_type') == 'video':
            return stream
    return streams[0]

In [4]:
# Smoke-test the ffprobe helper on one sample file; the cell displays what it returns.
findVideoMetada('../data/original/000.mp4')

[{'index': 0,
  'codec_name': 'h264',
  'codec_long_name': 'H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10',
  'profile': 'High 4:4:4 Predictive',
  'codec_type': 'video',
  'codec_time_base': '1/50',
  'codec_tag_string': 'avc1',
  'codec_tag': '0x31637661',
  'width': 640,
  'height': 480,
  'coded_width': 640,
  'coded_height': 480,
  'has_b_frames': 2,
  'pix_fmt': 'yuv444p',
  'level': 30,
  'chroma_location': 'left',
  'refs': 1,
  'is_avc': 'true',
  'nal_length_size': '4',
  'r_frame_rate': '25/1',
  'avg_frame_rate': '25/1',
  'time_base': '1/12800',
  'start_pts': 0,
  'start_time': '0.000000',
  'duration_ts': 202752,
  'duration': '15.840000',
  'bit_rate': '402660',
  'bits_per_raw_sample': '8',
  'nb_frames': '396',
  'disposition': {'default': 1,
   'dub': 0,
   'original': 0,
   'comment': 0,
   'lyrics': 0,
   'karaoke': 0,
   'forced': 0,
   'hearing_impaired': 0,
   'visual_impaired': 0,
   'clean_effects': 0,
   'attached_pic': 0,
   'timed_thumbnails': 0},
  'tags': {'l

In [4]:
def create_compressed_dataset(source_path, target_path):
    """Re-encode every file in `source_path` with libx264 at several CRF levels.

    For each file, one re-encode per entry of `crfs` is written to
    ``<target_path>/<label>/<file name>``, where the label is the matching
    entry of `bitrate_percentages`. The labels only name the output
    directories; compression strength is controlled by the CRF alone.

    Args:
        source_path: directory holding the source videos.
        target_path: directory that receives one sub-directory per label
            (created on demand, including missing parents).

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    bitrate_percentages = [0.77, 0.6]
    crfs = [23, 40]

    for i in tqdm(os.scandir(source_path)):
        # Sub-directories cannot be transcoded; skip them instead of failing in ffmpeg.
        if not i.is_file():
            continue
        path = i.path.replace('\\', '/')

        # The earlier ffprobe step is gone on purpose: the bitrates derived
        # from it were never passed to ffmpeg, and its length assertion could
        # not hold for the per-stream dict that findVideoMetada returns.
        for perc, crf in zip(bitrate_percentages, crfs):
            out_dir = f'{target_path}/{perc}'
            os.makedirs(out_dir, exist_ok=True)
            # argv as a list keeps file names with spaces or quotes intact.
            args = ['ffmpeg', '-y', '-i', path, '-c:v', 'libx264',
                    '-crf', str(crf), f'{out_dir}/{i.name}']
            subprocess.run(args, check=True, stdout=subprocess.DEVNULL)

In [None]:
# Build the compressed variants of everything under ../data/original (one ffmpeg run per file and level).
create_compressed_dataset('../data/original', '../data/output')

In [14]:
from sklearn.model_selection import train_test_split
# Collect one ffprobe record per video; the name of the containing
# sub-directory of `target_path` is used as the class label.
target_path = '../data/'
metadata_agg = []
classes = []
# Sorted so the rows (and therefore `classes`) come out in the same order on every run.
for p in sorted(os.scandir(target_path), key=lambda entry: entry.name):
    # is_dir is a method: the bare attribute `p.is_dir` is always truthy, so
    # plain files used to reach os.scandir() below.
    if not p.is_dir():
        continue
    videos = sorted(os.scandir(p.path), key=lambda entry: entry.name)
    # A materialised list lets tqdm derive the real total instead of a hard-coded 1000.
    for o in tqdm(videos, desc=p.name):
        metadata_agg.append(findVideoMetada(o.path))
        classes.append(p.name)
data = pd.DataFrame(metadata_agg)

1001it [05:45,  2.69it/s]                                                                                              
1001it [05:54,  3.03it/s]                                                                                              
1001it [06:41,  3.03it/s]                                                                                              


In [15]:
# Persist the metadata together with its labels; `assign` works on a copy, so `data` stays label-free.
u = data.assign(**{'class': classes})
u.to_csv('video_metadata.csv', index=False)

In [97]:
# Inspect which ffprobe fields ended up as columns.
data.columns

Index(['avg_frame_rate', 'bit_rate', 'bits_per_raw_sample', 'chroma_location',
       'codec_long_name', 'codec_name', 'codec_tag', 'codec_tag_string',
       'codec_time_base', 'codec_type', 'coded_height', 'coded_width',
       'color_primaries', 'color_range', 'color_space', 'color_transfer',
       'display_aspect_ratio', 'disposition', 'duration', 'duration_ts',
       'has_b_frames', 'height', 'index', 'is_avc', 'level', 'nal_length_size',
       'nb_frames', 'pix_fmt', 'profile', 'r_frame_rate', 'refs',
       'sample_aspect_ratio', 'start_pts', 'start_time', 'tags', 'time_base',
       'width'],
      dtype='object')

In [100]:
# Distinct values per column. 'disposition' and 'tags' hold dicts, which are
# unhashable, so they are excluded. They are listed inline because the `drop`
# variable is only defined by a later cell.
data.drop(['disposition', 'tags'], axis=1).nunique()

avg_frame_rate             9
bit_rate                2996
bits_per_raw_sample        1
chroma_location            1
codec_long_name            1
codec_name                 1
codec_tag                  1
codec_tag_string           1
codec_time_base            9
codec_type                 1
coded_height               3
coded_width               18
color_primaries            1
color_range                1
color_space                1
color_transfer             1
display_aspect_ratio       1
duration                 621
duration_ts              464
has_b_frames               2
height                     4
index                      1
is_avc                     1
level                      7
nal_length_size            1
nb_frames                458
pix_fmt                    2
profile                    3
r_frame_rate               9
refs                       1
sample_aspect_ratio        1
start_pts                  1
start_time                 1
time_base                  6
width         

In [103]:
# Turn the raw ffprobe records into a purely numeric feature frame.
copy = data.copy()

# Dict-valued columns cannot be encoded as features.
drop = ['disposition', 'tags']
copy = copy.drop(drop, axis=1)

# Columns that ffprobe may report as strings but that are really numbers.
# 'width' used to be listed twice, which produced duplicate labels in the selection.
num_cols = ['duration', 'duration_ts', 'nb_frames', 'bit_rate', 'width', 'height',
            'coded_width', 'coded_height', 'start_time']
# Plain column assignment replaces the columns (and their dtype). On recent
# pandas, `copy.loc[:, num_cols] = ...` writes into the existing object
# columns, leaving them to be one-hot encoded below.
copy[num_cols] = copy[num_cols].astype('float')

# Everything still of object dtype is categorical: one-hot encode it.
cat_cols = copy.select_dtypes(['object']).columns
dummies = pd.get_dummies(copy[cat_cols])
print(dummies.columns.tolist())
copy = pd.concat([copy.drop(cat_cols, axis=1), dummies], axis=1, sort=False)
copy.head(10)

['avg_frame_rate_15/1', 'avg_frame_rate_18/1', 'avg_frame_rate_24/1', 'avg_frame_rate_24000/1001', 'avg_frame_rate_25/1', 'avg_frame_rate_29/1', 'avg_frame_rate_30/1', 'avg_frame_rate_50/1', 'avg_frame_rate_60/1', 'bits_per_raw_sample_8', 'chroma_location_left', 'codec_long_name_H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10', 'codec_name_h264', 'codec_tag_0x31637661', 'codec_tag_string_avc1', 'codec_time_base_1/100', 'codec_time_base_1/120', 'codec_time_base_1/30', 'codec_time_base_1/36', 'codec_time_base_1/48', 'codec_time_base_1/50', 'codec_time_base_1/58', 'codec_time_base_1/60', 'codec_time_base_1001/48000', 'codec_type_video', 'color_primaries_bt709', 'color_range_tv', 'color_space_bt709', 'color_transfer_bt709', 'display_aspect_ratio_16:9', 'is_avc_true', 'nal_length_size_4', 'pix_fmt_yuv420p', 'pix_fmt_yuv444p', 'profile_High', 'profile_High 4:4:4 Predictive', 'profile_Main', 'r_frame_rate_15/1', 'r_frame_rate_18/1', 'r_frame_rate_24/1', 'r_frame_rate_24000/1001', 'r_frame_rate_25/1

Unnamed: 0,bit_rate,coded_height,coded_width,duration,duration_ts,has_b_frames,height,index,level,nb_frames,...,r_frame_rate_30/1,r_frame_rate_50/1,r_frame_rate_60/1,sample_aspect_ratio_1:1,time_base_1/12288,time_base_1/12800,time_base_1/14848,time_base_1/15360,time_base_1/18432,time_base_1/24000
0,57563.0,480.0,640.0,15.840000,202752.0,2,480.0,0,30,396.0,...,0,0,0,0,0,1,0,0,0,0
1,180891.0,720.0,1280.0,18.400000,235520.0,2,720.0,0,31,460.0,...,0,0,0,0,0,1,0,0,0,0
2,87629.0,720.0,1280.0,23.100000,354816.0,2,720.0,0,31,693.0,...,1,0,0,0,0,0,0,1,0,0
3,41844.0,480.0,640.0,12.120000,155136.0,2,480.0,0,30,303.0,...,0,0,0,0,0,1,0,0,0,0
4,113535.0,720.0,1280.0,12.360000,158208.0,2,720.0,0,31,309.0,...,0,0,0,0,0,1,0,0,0,0
5,156000.0,480.0,720.0,15.400000,197120.0,2,480.0,0,30,385.0,...,0,0,0,0,0,1,0,0,0,0
6,118795.0,720.0,1280.0,10.333333,158720.0,2,720.0,0,31,310.0,...,1,0,0,0,0,0,0,1,0,0
7,50992.0,480.0,608.0,20.200000,258560.0,2,480.0,0,30,505.0,...,0,0,0,0,0,1,0,0,0,0
8,39529.0,480.0,640.0,25.480000,326144.0,2,480.0,0,30,637.0,...,0,0,0,0,0,1,0,0,0,0
9,42526.0,480.0,640.0,19.600000,301056.0,2,480.0,0,30,588.0,...,1,0,0,0,0,0,0,1,0,0


In [104]:
# Per-column medians of the feature frame, pickled to 'medians.pkl'.
medians = {col: copy[col].median() for col in copy.columns}
# The context manager closes (and flushes) the file; a bare open() left that to the garbage collector.
with open('medians.pkl', 'wb') as fh:
    pickle.dump(medians, fh)
medians

372935.0
480.0
864.0
16.666667
235008.0
2.0
480.0
0.0
31.0
459.0
1.0
0.0
0.0
854.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


{'bit_rate': 372935.0,
 'coded_height': 480.0,
 'coded_width': 864.0,
 'duration': 16.666667,
 'duration_ts': 235008.0,
 'has_b_frames': 2.0,
 'height': 480.0,
 'index': 0.0,
 'level': 31.0,
 'nb_frames': 459.0,
 'refs': 1.0,
 'start_pts': 0.0,
 'start_time': 0.0,
 'width': 854.0,
 'avg_frame_rate_15/1': 0.0,
 'avg_frame_rate_18/1': 0.0,
 'avg_frame_rate_24/1': 0.0,
 'avg_frame_rate_24000/1001': 0.0,
 'avg_frame_rate_25/1': 0.0,
 'avg_frame_rate_29/1': 0.0,
 'avg_frame_rate_30/1': 0.0,
 'avg_frame_rate_50/1': 0.0,
 'avg_frame_rate_60/1': 0.0,
 'bits_per_raw_sample_8': 1.0,
 'chroma_location_left': 1.0,
 'codec_long_name_H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10': 1.0,
 'codec_name_h264': 1.0,
 'codec_tag_0x31637661': 1.0,
 'codec_tag_string_avc1': 1.0,
 'codec_time_base_1/100': 0.0,
 'codec_time_base_1/120': 0.0,
 'codec_time_base_1/30': 0.0,
 'codec_time_base_1/36': 0.0,
 'codec_time_base_1/48': 0.0,
 'codec_time_base_1/50': 0.0,
 'codec_time_base_1/58': 0.0,
 'codec_time_base_1/60': 

In [105]:
# One fixed seed for both the split and the tree, so the score below can be
# reproduced on a re-run (both steps are randomised otherwise).
SEED = 42
model = DecisionTreeClassifier(random_state=SEED)
# Columns are sorted so the feature order is well-defined.
copy = copy.sort_index(axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    copy, classes, test_size=0.2, shuffle=True, random_state=SEED)
model.fit(x_train, y_train)
# Mean accuracy on the held-out 20 %.
model.score(x_test, y_test)

0.6405990016638935

In [106]:
import pickle
# Persist the fitted model and the exact feature order it was trained on.
# Context managers guarantee both files are flushed and closed.
with open('model.pkl', 'wb') as fh:
    pickle.dump(model, fh)
with open('columns.pkl', 'wb') as fh:
    pickle.dump(copy.columns.tolist(), fh)

In [82]:
# Scratch check: probe one video and add every saved feature column it lacks,
# filled with the stored median (0 when no median was stored).
path = '../data/original/000.mp4'
# NOTE: unpickling runs arbitrary code; only load files written by this notebook.
with open('columns.pkl', 'rb') as fh:
    columns = pickle.load(fh)
with open('medians.pkl', 'rb') as fh:
    medians = pickle.load(fh)
df = pd.DataFrame(columns=columns)
metadata_dict = findVideoMetada(path)
metadata = pd.DataFrame([metadata_dict])
for col in columns:
    if col not in metadata.columns:
        metadata[col] = medians.get(col, 0)
metadata


Empty DataFrame
Columns: [avg_frame_rate_15/1, avg_frame_rate_18/1, avg_frame_rate_24/1, avg_frame_rate_24000/1001, avg_frame_rate_25/1, avg_frame_rate_29/1, avg_frame_rate_30/1, avg_frame_rate_50/1, avg_frame_rate_60/1, bit_rate, bit_rate, bits_per_raw_sample_8, chroma_location_left, codec_long_name_H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10, codec_name_h264, codec_tag_0x31637661, codec_tag_string_avc1, codec_time_base_1/100, codec_time_base_1/120, codec_time_base_1/30, codec_time_base_1/36, codec_time_base_1/48, codec_time_base_1/50, codec_time_base_1/58, codec_time_base_1/60, codec_time_base_1001/48000, codec_type_video, coded_height, coded_height, coded_width, coded_width, color_primaries_bt709, color_range_tv, color_space_bt709, color_transfer_bt709, display_aspect_ratio_16:9, duration, duration, duration_ts, duration_ts, has_b_frames, has_b_frames, height, height, index, index, is_avc_true, level, level, nal_length_size_4, nb_frames, nb_frames, pix_fmt_yuv420p, pix_fmt_yuv444p, pr

Unnamed: 0,avg_frame_rate,bit_rate,bits_per_raw_sample,chroma_location,codec_long_name,codec_name,codec_tag,codec_tag_string,codec_time_base,codec_type,...,r_frame_rate_50/1,r_frame_rate_60/1,sample_aspect_ratio_1:1,start_time_0.000000,time_base_1/12288,time_base_1/12800,time_base_1/14848,time_base_1/15360,time_base_1/18432,time_base_1/24000
0,25/1,402660,8,left,H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10,h264,0x31637661,avc1,1/50,video,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
def classify_video(path):
    """Predict the compression class of the video at `path`.

    The video is probed with `findVideoMetada`, encoded like the frame whose
    column list was saved to 'columns.pkl' (dict columns dropped, numeric
    strings cast to float, remaining string columns one-hot encoded) and then
    aligned to exactly those columns, in that order, before it is handed to
    the model stored in 'model.pkl'.

    Args:
        path: path of the video file to classify.

    Returns:
        The label predicted by the model for this video.
    """
    drop = ['disposition', 'tags']
    # Must mirror the numeric columns of the feature-preparation cell;
    # otherwise a field like 'start_time' stays a string and turns into a
    # 'start_time_<value>' dummy the saved column list does not contain.
    num_cols = ['duration', 'duration_ts', 'nb_frames', 'bit_rate', 'width',
                'height', 'coded_width', 'coded_height', 'start_time']

    # NOTE: unpickling runs arbitrary code; only load files written by this notebook.
    with open('columns.pkl', 'rb') as fh:
        columns = pickle.load(fh)
    with open('medians.pkl', 'rb') as fh:
        medians = pickle.load(fh)
    with open('model.pkl', 'rb') as fh:
        model = pickle.load(fh)

    metadata = pd.DataFrame([findVideoMetada(path)])
    metadata = metadata.drop(drop, axis=1, errors='ignore')

    # convert columns like 'bit_rate' to float dtype (only those ffprobe reported)
    present_num = [col for col in num_cols if col in metadata.columns]
    metadata[present_num] = metadata[present_num].astype('float')

    cat_cols = metadata.select_dtypes(['object']).columns
    dummies = pd.get_dummies(metadata[cat_cols])
    features = pd.concat([metadata.drop(cat_cols, axis=1), dummies], axis=1)

    # Fill in the saved columns this video did not produce. A missing dummy of
    # a categorical field that WAS reported means "not this category" -> 0;
    # a field that is absent altogether falls back to its stored median.
    # (Prefix matching is approximate when one field name is a prefix of another.)
    cat_prefixes = tuple(f'{col}_' for col in cat_cols)
    missing = {
        col: 0 if col.startswith(cat_prefixes) else medians.get(col, 0)
        for col in columns
        if col not in features.columns
    }
    if missing:
        features = pd.concat(
            [features, pd.DataFrame(missing, index=features.index)], axis=1)

    # Keep exactly the saved columns in the saved order; this also discards
    # dummies for categories that are not in the saved column list.
    features = features[columns].astype('float')
    return model.predict(features)[0]

# Try the classifier on a file from the 'original' folder.
classify_video('../data/original/000.mp4')

'0.6'

In [126]:
import sys
sys.path.append('..')
from library import DatabaseIO


In [128]:
# Tiny 2x2 frame (columns 1 and 2) used as a throw-away payload.
a = pd.DataFrame({1: [1, 3], 2: [2, 4]})
a

Unnamed: 0,1,2
0,1,2
1,3,4


In [142]:
# Print whatever iterating library.config yields (the recorded run printed DEFAULT and RDS).
for i in library.config:
    print(i)

DEFAULT
RDS


In [141]:
import configparser

# Parse the project config file. read() returns the list of files it managed
# to load, which is what this cell displays.
config = configparser.ConfigParser()
config.read('../config.ini')


['../config.ini']

In [144]:
# Send the toy frame `a` through the project's DatabaseIO helper under the name 'test'.
# NOTE(review): what 'test' denotes (table, key, ...) is decided by DatabaseIO.write_data; confirm there.
# NOTE(review): the name `io` would shadow the stdlib `io` module if that is ever imported in this notebook.
io = DatabaseIO()
io.write_data(a, 'test')

In [9]:
# Reload the persisted classifier; the context manager closes the file handle.
# NOTE: unpickling runs arbitrary code; only load files written by this notebook.
with open('model.pkl', 'rb') as fh:
    model = pickle.load(fh)


In [4]:
# Load the metadata table from 'video_metadata.csv'. Note that this rebinds `data`.
data = pd.read_csv('video_metadata.csv')

In [29]:
# Class balance check: number of rows per label.
data['class'].value_counts()

original    1001
0.77        1001
0.6         1001
Name: class, dtype: int64

In [22]:
# Imported here so this cell works on a fresh kernel; the notebook's own
# import of this module only appears in a later cell.
import compression_detection

# Reuse the module's column lists so this notebook and the module agree on them.
DROP_COLS = compression_detection.DROP_COLS
NUM_COLS = compression_detection.NUM_COLS


def train_classifier(data=None, X=None, Y=None, path=None, save=False, random_state=None):
    """Fit a depth-5 decision tree on the numeric metadata columns.

    The training data comes from the first source that is provided:
    `path` (via `aggregate_metadata`), then `X` together with `Y`, then
    `data` (a frame that carries the labels in its 'class' column).

    Args:
        data: frame with feature columns plus a 'class' column.
        X: feature frame (used only together with `Y`).
        Y: labels matching `X`.
        path: location passed to `aggregate_metadata`.
        save: when true, write the model to 'model.pkl' and the feature
            column order to 'columns.pkl'.
        random_state: optional seed for the train/test split and the tree;
            the default None keeps both randomised.

    Returns:
        The first five rows of the feature frame the model was trained on.

    Raises:
        ValueError: none of `path`, `X`/`Y` or `data` was given.
    """
    if path is not None:
        # NOTE(review): aggregate_metadata is not defined in this notebook;
        # presumably it lives in compression_detection. Confirm before using `path`.
        copy, classes = aggregate_metadata(path)
    elif X is not None and Y is not None:
        copy = X
        classes = Y
    elif data is not None:
        copy = data.drop(['class'], axis=1)
        classes = data['class']
    else:
        raise ValueError('Provide `path`, both `X` and `Y`, or `data`.')

    copy = copy.drop(DROP_COLS, axis=1)

    # convert columns like 'bit_rate' to float dtype; plain column assignment
    # replaces the dtype, which `.loc[:, cols] = ...` does not do on recent pandas
    copy[NUM_COLS] = copy[NUM_COLS].astype('float')

    print(copy.columns)
    # Keeps numeric and boolean columns only (private pandas helper).
    copy = copy._get_numeric_data()
    print(copy.columns)

    copy = copy.sort_index(axis=1)
    # Column *labels* of the boolean columns. Indexing with the boolean frame
    # itself would apply it as a mask instead of selecting those columns.
    bool_cols = copy.select_dtypes(include=['bool']).columns
    if len(bool_cols):
        copy[bool_cols] = copy[bool_cols].astype('int')

    model = DecisionTreeClassifier(max_depth=5, random_state=random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        copy, classes, test_size=0.2, shuffle=True, random_state=random_state)
    model.fit(x_train, y_train)

    if save:
        with open('model.pkl', 'wb') as fh:
            pickle.dump(model, fh)
        with open('columns.pkl', 'wb') as fh:
            pickle.dump(copy.columns.tolist(), fh)

    score = model.score(x_test, y_test)
    # Only claim the model was saved when it actually was.
    status = 'Model saved.' if save else 'Model not saved.'
    print(f'{status} Model score: {score}')
    print(f'Nr of columns: {len(copy.columns)}')
    return copy.head()

In [23]:
# Dry run on the reloaded CSV: trains and scores, but writes no pickle files (save=False).
train_classifier(data=data, save=False)

Index(['avg_frame_rate', 'bit_rate', 'bits_per_raw_sample', 'chroma_location',
       'codec_long_name', 'codec_name', 'codec_tag', 'codec_tag_string',
       'codec_time_base', 'codec_type', 'coded_height', 'coded_width',
       'color_primaries', 'color_range', 'color_space', 'color_transfer',
       'display_aspect_ratio', 'duration', 'duration_ts', 'has_b_frames',
       'height', 'index', 'is_avc', 'level', 'nal_length_size', 'nb_frames',
       'pix_fmt', 'profile', 'r_frame_rate', 'refs', 'sample_aspect_ratio',
       'start_pts', 'start_time', 'time_base', 'width'],
      dtype='object')
Index(['bit_rate', 'bits_per_raw_sample', 'coded_height', 'coded_width',
       'duration', 'duration_ts', 'has_b_frames', 'height', 'index', 'is_avc',
       'level', 'nal_length_size', 'nb_frames', 'refs', 'start_pts',
       'start_time', 'width'],
      dtype='object')
Model saved. Model score: 0.6622296173044925
Nr of columns: Index(['bit_rate', 'bits_per_raw_sample', 'coded_height', 'code

Unnamed: 0,bit_rate,bits_per_raw_sample,coded_height,coded_width,duration,duration_ts,has_b_frames,height,index,is_avc,level,nal_length_size,nb_frames,refs,start_pts,start_time,width
0,57563.0,8.0,480.0,640.0,15.84,202752.0,2,480.0,0,1.0,30,4.0,396.0,1,0,0.0,640.0
1,180891.0,8.0,720.0,1280.0,18.4,235520.0,2,720.0,0,1.0,31,4.0,460.0,1,0,0.0,1280.0
2,87629.0,8.0,720.0,1280.0,23.1,354816.0,2,720.0,0,1.0,31,4.0,693.0,1,0,0.0,1280.0
3,41844.0,8.0,480.0,640.0,12.12,155136.0,2,480.0,0,1.0,30,4.0,303.0,1,0,0.0,640.0
4,113535.0,8.0,720.0,1280.0,12.36,158208.0,2,720.0,0,1.0,31,4.0,309.0,1,0,0.0,1280.0


In [26]:
# Show the boolean-typed columns of `data`, cast to 0/1 integers.
data.select_dtypes(include=['bool']).astype('int')

Unnamed: 0,is_avc
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [11]:
import compression_detection
# from compression_detection import classify_video

In [17]:
# Override the module's COMPRESSION_DIR with the current directory, then classify one sample video.
# NOTE(review): presumably COMPRESSION_DIR is where the module looks for its pickled files; confirm in compression_detection.
# NOTE(review): the recorded run of this cell ended in "ValueError: could not convert string to float: 'true'".
compression_detection.COMPRESSION_DIR = '.'
compression_detection.classify_video('../data/original/000.mp4')

{'index': 0, 'codec_name': 'h264', 'codec_long_name': 'H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10', 'profile': 'High 4:4:4 Predictive', 'codec_type': 'video', 'codec_time_base': '1/50', 'codec_tag_string': 'avc1', 'codec_tag': '0x31637661', 'width': 640, 'height': 480, 'coded_width': 640, 'coded_height': 480, 'has_b_frames': 2, 'pix_fmt': 'yuv444p', 'level': 30, 'chroma_location': 'left', 'refs': 1, 'is_avc': 'true', 'nal_length_size': '4', 'r_frame_rate': '25/1', 'avg_frame_rate': '25/1', 'time_base': '1/12800', 'start_pts': 0, 'start_time': '0.000000', 'duration_ts': 202752, 'duration': '15.840000', 'bit_rate': '402660', 'bits_per_raw_sample': '8', 'nb_frames': '396', 'disposition': {'default': 1, 'dub': 0, 'original': 0, 'comment': 0, 'lyrics': 0, 'karaoke': 0, 'forced': 0, 'hearing_impaired': 0, 'visual_impaired': 0, 'clean_effects': 0, 'attached_pic': 0, 'timed_thumbnails': 0}, 'tags': {'language': 'und', 'handler_name': 'VideoHandler'}}


ValueError: could not convert string to float: 'true'

In [27]:
# Empty frame with the three columns 'hash', 'link' and 'fake'.
history_columns = ['hash', 'link', 'fake']
df = pd.DataFrame(columns=history_columns)

In [34]:
# Handle on the project's database helper (constructed with its defaults).
dbio = library.DatabaseIO()

In [36]:
# Write the history frame through DatabaseIO under the name 'history'.
# NOTE(review): whether this creates, replaces or appends is decided by write_data; confirm there.
dbio.write_data(df, 'history')

In [39]:
# Add one test record. DataFrame.append was removed in pandas 2.0; pd.concat
# is the supported way to add rows and also works on older versions.
new_row = {'hash': 'testhash', 'link': 'https://www.youtube.com/watch?v=668nUCeBHyY', 'fake': False}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [41]:
# Is this link already present in the 'link' column?
url_to_check = 'https://www.youtube.com/watch?v=668nUCeBHyY'
if (df['link'] == url_to_check).any():
    print('yes')

yes


In [47]:
# Look up the stored verdict for a link.
youtube_url = 'https://www.youtube.com/watch?v=668nUCeBHyY'
history = df
# .iloc[0] takes the first match by position. A trailing [0] is a label
# lookup and only works when the matching row has index label 0.
fake = history.loc[history['link'] == youtube_url, 'fake'].iloc[0]

In [48]:
# Display the value looked up for the link.
fake

False