In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw clip metadata (replace with the path to your own file)
file_path = "holoclips_data.csv"

# Load the CSV into a DataFrame
data = pd.read_csv(file_path)

# Preview the first 5 rows
data.head(n=5)


Unnamed: 0,title,duration,author,video_link,thumbnail
0,Are Roberu NightMea burned out? What's to come...,02:15,Oboretai Writing ch.,https://www.youtube.com/watch?v=4D8dQQrPV88,https://i.ytimg.com/vi_webp/4D8dQQrPV88/mqdefa...
1,[ENG SUB/Hololive] Biboo is planning to spread...,01:11,The Arrow🎥 草 Clipper,https://www.youtube.com/watch?v=KEyTcjOmGVE,https://i.ytimg.com/vi_webp/KEyTcjOmGVE/mqdefa...
2,Watame & Fauna Adorably Fight Their Language B...,02:40,Kiriku Translation,https://www.youtube.com/watch?v=laiKuMndc4A,https://i.ytimg.com/vi_webp/laiKuMndc4A/mqdefa...
3,Kiara Is Down Bad For Stellar Blade,06:04,VP Ch.,https://www.youtube.com/watch?v=TIjyU362SM8,https://i.ytimg.com/vi_webp/TIjyU362SM8/mqdefa...
4,Watame Keeps Saying FLAT and Made Fauna Misund...,01:07,Sashimi Clips,https://www.youtube.com/watch?v=WS4B7VWyD_A,https://i.ytimg.com/vi_webp/WS4B7VWyD_A/mqdefa...


In [2]:
# General dataset information: index range, column names, non-null counts,
# dtypes and memory usage.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97351 entries, 0 to 97350
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       97333 non-null  object
 1   duration    97350 non-null  object
 2   author      97350 non-null  object
 3   video_link  97351 non-null  object
 4   thumbnail   97351 non-null  object
dtypes: object(5)
memory usage: 3.7+ MB


In [3]:
# Descriptive statistics; include="all" also summarizes the non-numeric
# (object) columns, reporting count / unique / top / freq for each.
data.describe(include="all")


Unnamed: 0,title,duration,author,video_link,thumbnail
count,97333,97350,97350,97351,97351
unique,97198,2113,466,97327,97327
top,The BEST Hololive EN Moments From The Last Wee...,00:31,Vtube Tengoku,https://www.youtube.com/watch?v=aZTCF8-j3D4,https://i.ytimg.com/vi_webp/aZTCF8-j3D4/mqdefa...
freq,9,951,3829,2,2


In [4]:
# Helper that converts a duration string into a number of seconds
def duration_to_seconds(duration):
    """Convert a ``MM:SS`` or ``HH:MM:SS`` duration into total seconds.

    Parameters
    ----------
    duration : str or any scalar
        Duration text such as ``"02:15"`` or ``"01:02:03"``. Non-string
        scalars are converted with ``str()`` before parsing.

    Returns
    -------
    int or None
        Total number of seconds, or ``None`` when the value is missing
        (``pd.isna``), does not have exactly two or three ``:``-separated
        parts, contains a part that is not an integer, or contains a
        negative part.

    Notes
    -----
    Components are not range-checked, so ``"02:75"`` yields ``195``.
    """
    if pd.isna(duration):  # Missing values have no duration
        return None

    parts = str(duration).strip().split(":")
    if len(parts) not in (2, 3):  # Only MM:SS and HH:MM:SS are supported
        return None

    try:
        values = [int(part) for part in parts]
    except ValueError:  # A component is not an integer
        return None

    # A negative component (e.g. "01:-05") is malformed, not a shorter duration
    if any(value < 0 for value in values):
        return None

    # Fold left to right in base 60: ((hours * 60) + minutes) * 60 + seconds
    total = 0
    for value in values:
        total = total * 60 + value
    return total


In [5]:
# Convert every duration string to seconds and store it in a new column
data["duration_seconds"] = data["duration"].apply(duration_to_seconds)

# Longest and shortest durations, in seconds
max_duration_sec, min_duration_sec = (
    data["duration_seconds"].max(),
    data["duration_seconds"].min(),
)

# Report both extremes, one per line
print(
    f"Durasi maksimum dalam detik: {max_duration_sec} detik",
    f"Durasi minimum dalam detik: {min_duration_sec} detik",
    sep="\n",
)


Durasi maksimum dalam detik: 86399.0 detik
Durasi minimum dalam detik: 0.0 detik


In [6]:
# The "duration_seconds" column was already computed in the previous cell, so
# the conversion is not repeated here (it is a Python-level pass over every row).
# Preview the first 5 rows, now including the converted column.
data.head()

Unnamed: 0,title,duration,author,video_link,thumbnail,duration_seconds
0,Are Roberu NightMea burned out? What's to come...,02:15,Oboretai Writing ch.,https://www.youtube.com/watch?v=4D8dQQrPV88,https://i.ytimg.com/vi_webp/4D8dQQrPV88/mqdefa...,135.0
1,[ENG SUB/Hololive] Biboo is planning to spread...,01:11,The Arrow🎥 草 Clipper,https://www.youtube.com/watch?v=KEyTcjOmGVE,https://i.ytimg.com/vi_webp/KEyTcjOmGVE/mqdefa...,71.0
2,Watame & Fauna Adorably Fight Their Language B...,02:40,Kiriku Translation,https://www.youtube.com/watch?v=laiKuMndc4A,https://i.ytimg.com/vi_webp/laiKuMndc4A/mqdefa...,160.0
3,Kiara Is Down Bad For Stellar Blade,06:04,VP Ch.,https://www.youtube.com/watch?v=TIjyU362SM8,https://i.ytimg.com/vi_webp/TIjyU362SM8/mqdefa...,364.0
4,Watame Keeps Saying FLAT and Made Fauna Misund...,01:07,Sashimi Clips,https://www.youtube.com/watch?v=WS4B7VWyD_A,https://i.ytimg.com/vi_webp/WS4B7VWyD_A/mqdefa...,67.0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the duration
# expressed in seconds.
data["duration_seconds"].describe()


count    97350.000000
mean       201.332234
std       1356.908806
min          0.000000
25%         59.000000
50%        100.000000
75%        183.000000
max      86399.000000
Name: duration_seconds, dtype: float64

In [8]:
# Keep only clips whose duration lies between MIN_DURATION_SEC and
# MAX_DURATION_SEC seconds (both bounds inclusive).
# Rows whose duration could not be parsed (NaN) fail the range test and are
# therefore dropped as well.
MIN_DURATION_SEC = 10
MAX_DURATION_SEC = 140
duration_in_range = data["duration_seconds"].between(MIN_DURATION_SEC, MAX_DURATION_SEC)

# Drop the helper column after filtering so the export keeps the original columns
data_cleaned = data.loc[duration_in_range].drop(columns=["duration_seconds"])

# Save the cleaned data to a new file; the name embeds the upper bound so it
# always matches the filter that produced it
cleaned_file_path = f"data_cleaned_{MAX_DURATION_SEC}_holoclip.csv"
data_cleaned.to_csv(cleaned_file_path, index=False)

print(f"Data berhasil dibersihkan dan disimpan ke '{cleaned_file_path}'")


Data berhasil dibersihkan dan disimpan ke 'data_cleaned_140_holoclip.csv'
