# Script to Combine Video Lists generated with the [YouTube Data Tools](https://tools.digitalmethods.net/netvizz/youtube/index.php) 
## Usage

1. go to the Variables Section below and Set/Check the desired Values. make sure the relative Paths are correct.
2. Activate / Deactivate the desiredi Features
3. Run the script, read output to find issues

## Features

- Combines Video Lists from  [YouTube Data Tools](https://tools.digitalmethods.net/netvizz/youtube/index.php) into one Master List
- Removes Duplicate Video Entrys
- Can Mark Downloaded Video Comments based on downloaded comment files (if video Id is included)
- Can Mark Downloaded Video Comments based on previously created and marked master Video List
- Can Split



Import Stuff

In [13]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

set Variables and options

### Variables:

In [14]:
min_Comments = 3.0

split_comment_count = 10000.0

path = os.path.abspath("../social_media_youtube_analysis_project")
data_raw_path = path + "/data_raw/videolists" # raw video lists
video_list_save_path = path + "/summery_vid_lists"
comment_saves_path = path + "/data_raw/comments"
comment_manual_saves_path = path + "/data_raw/datatool_manual"

print("Check Path in case of Error " + path)

Check Path in case of Error C:\Users\moritz\Downloads\social analizing\project\git\social_media_youtube_analysis_project


### Features

In [15]:
mark_videos_from_old_list = True
use_only_newest_video_master_list = False


mark_videos_from_comment_files = True


options:

In [16]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

Erstelle Liste der Videolisten und checken der Pfade

In [17]:
if not os.path.exists(path):
    raise ValueError("Error, Check Path in 'path' variable")
if not os.path.exists(data_raw_path):
    raise ValueError("Error, Check Path in 'data_raw_path' variable")
if not os.path.exists(video_list_save_path):
    raise ValueError("Error, Check Path in 'video_list_save_path' variable")
if not os.path.exists(video_list_save_path) and mark_videos_from_comment_files:
    raise ValueError("Error, Check Path in 'video_list_save_path' variable")

files = [f for f in os.listdir(data_raw_path) if os.path.isfile(os.path.join( data_raw_path , f)) and "videolist" in f]

if not files:
    raise ValueError('Error List is empty, no video Lists have been found')

combine files and insert columns for marking

In [18]:
# combined_csv = pd.concat([pd.read_csv(data_raw_path +"/" + f, sep='\t',header=(0)) for f in files ])
def searchtermFromFile(filename):
    name = filename.split("_")
    name = name[7:]
    name = " ".join(name)
    name = name[:-4]
    return name

firstfile = files.pop()
combined_csv = pd.read_csv(data_raw_path +"/" + firstfile, sep='\t',header=(0))
combined_csv['search_Term'] = searchtermFromFile(firstfile)
combined_csv['Comments_Downloaded'] = "False"
combined_csv['Comments_Downloaded_Stopped_At'] = 0

for filename in files:
    filepath = data_raw_path +"/" + filename
    add_csv = pd.read_csv(data_raw_path +"/" + filename, sep='\t',header=(0))
    add_csv['search_Term'] = searchtermFromFile(filename)
    add_csv['Comments_Downloaded'] = "False"
    add_csv['Comments_Downloaded_Stopped_At'] = 0
    combined_csv = pd.concat([combined_csv, add_csv])
    

drop duplicate videos

In [19]:
combined_csv.drop_duplicates(subset ="videoId", 
                     keep = False, inplace = True) 

drop videos below comment minimum

In [20]:
combined_csv.drop(combined_csv[combined_csv.commentCount < min_Comments].index, inplace=True)

funktion to Export as CSV with timestamp

In [21]:
def export(export_csf):
    # datetime object containing current date and time
    now = datetime.now()
 
    # time Format
    # YY-mm-dd-H-M-S
    dt_string = now.strftime("%Y-%m-%d-%H-%M-%S")

    # in case path does not exist
    if not os.path.exists(path + "/summery_vid_lists/"):
        os.makedirs(path + "/summery_vid_lists/")

    # export complete list
    export_csf.to_csv( video_list_save_path + "/" + dt_string + "_master_video_list_all.csv", index=False, encoding='utf-8-sig', sep='\t')
    print("Complete List Exported to " + video_list_save_path + "/" + dt_string + "_master_video_list_all.csv")
    
    # split by comment value
    export_csf.sort_values(by=['commentCount'],inplace=True, axis=0,ignore_index = True,  na_position = 'first')
    csv = export_csf

    index = csv[csv.commentCount >=split_comment_count].first_valid_index()
  
    df1 = csv[:index]
    df2 = csv[index:]
    
    
    df1.to_csv( video_list_save_path + "/" + dt_string + "_master_video_list_below_" + str(int(split_comment_count)) + ".csv", index=False, encoding='utf-8-sig', sep='\t')
    print("Complete List Exported to " + video_list_save_path + "/" + dt_string + "_master_video_list_below_" + str(int(split_comment_count)) + ".csv")
    
    df2.to_csv( video_list_save_path + "/" + dt_string + "_master_video_list_above_" + str(int(split_comment_count)) + ".csv", index=False, encoding='utf-8-sig', sep='\t')
    print("Complete List Exported to " + video_list_save_path + "/" + dt_string + "_master_video_list_above_" + str(int(split_comment_count)) + ".csv")

Funktion to mark videos from old list

In [22]:
def mark_from_videos():
    # function requires filles combined_csv variable
    files = [f for f in os.listdir(video_list_save_path) if os.path.isfile(os.path.join( video_list_save_path , f)) and "master_video_list_all" in f]
    files = sorted(files)
    print(files)
    if use_only_newest_video_master_list:
        files = [files[-1]]
    for filename in files:
        filepath =  video_list_save_path +"/" + filename
        csv = pd.read_csv(filepath,header=(0))

        if 'Comments_Downloaded' in csv.columns:
            csv = csv[csv.Comments_Downloaded == True]
            for index, row in csv.iterrows():
                combined_csv.loc[combined_csv.videoId == row.videoId, 'Comments_Downloaded'] = True
    print("mark_from_videos finished")
    print(" ")

Funktion to mark videos from comments

In [23]:
def mark_from_comments():
    files = [f for f in os.listdir(comment_saves_path) if os.path.isfile(os.path.join( comment_saves_path , f)) and "comment" in f]
    for filename in files:
        filepath =  comment_saves_path +"/" + filename
        try:
            csv = pd.read_csv(filepath,header=(0))
            if 'video_id' in csv.columns:
                csv = csv.drop_duplicates('video_id', keep='last')
                for index, row in csv.iterrows():
                    combined_csv.loc[combined_csv.videoId == row.video_id, 'Comments_Downloaded'] = True
            else:
                print("File " + filename + " has no column video_id, videos not marked in list, column names are:")
                print(" ")
                print(list(csv.columns) )
        except UnicodeDecodeError as e:
            print("Unicode Error in file " + filename + " Re-Save in UTF 8, could be an unprocessed manual download") 
    print("mark_from_comments finished")
    print(" ")
    
def mark_from_Manual_comments():
    # manuel video list from comments folder
    files = [f for f in os.listdir(comment_saves_path) if os.path.isfile(os.path.join( comment_saves_path , f)) and "comment" in f and "videoinfo" in f]
    
    # manuel video list from data tools folder
    files = files + [f for f in os.listdir(comment_manual_saves_path) if os.path.isfile(os.path.join( comment_manual_saves_path , f)) and "comment" in f and "videoinfo" in f]
    
    # remove double entrys
    files = list(dict.fromkeys(files))
       
    for filename in files:
        # extract video id
        file =  filename[10:]
        videoId = file[:file.find("_202")]
        
        # mark video
        combined_csv.loc[combined_csv.videoId == videoId, 'Comments_Downloaded'] = True
     
    print("mark_from_comments_manual finished")
    print(" ")

Apply features and export

In [24]:
if mark_videos_from_old_list:
    mark_from_videos()

if mark_videos_from_comment_files:
    mark_from_comments()
    mark_from_Manual_comments()
    
export(combined_csv)

ParserError: Error tokenizing data. C error: Expected 16 fields in line 3, saw 83
