1. Import Libraries
**bold text**

In [88]:
import pandas as pd

In [89]:
# Langkah 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**2. Baca Dataset Youtube Statistic yang kotor**

In [90]:
# Load dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/Global_YouTube_Statistics_Messy.csv"
df = pd.read_csv(file_path)


In [91]:
# Melihat 11 data Kotor
print("Messy Data:")
print(df.head())
print("\nInfo Messy Data:")
print(df.info())

Messy Data:
   rank                    youtuber  subscribers   video_views  \
0     1                    T-Series    245000000  228000000000   
1     2              YouTube Movies    170000000             0   
2     3                     MrBeast    166000000   28368841870   
3     4  Cocomelon - Nursery Rhymes    162000000  164000000000   
4     5                   SET India    159000000  148000000000   

           category                       title  uploads        country  \
0             Music                    T-Series    20082          India   
1  Film & Animation               youtubemovies        1  United States   
2     Entertainment                     MrBeast      741  United States   
3         Education  Cocomelon - Nursery Rhymes      966  United States   
4             Shows                   SET India   116536          India   

  abbreviation   channel_type  ...  subscribers_for_last_30_days  \
0           IN          Music  ...                       2000000   
1   

**3. Menghapus Column yang tidak digunakan**

In [92]:
print(df.columns.tolist())


['rank', 'youtuber', 'subscribers', 'video_views', 'category', 'title', 'uploads', 'country', 'abbreviation', 'channel_type', 'video_views_rank', 'country_rank', 'channel_type_rank', 'video_views_for_the_last_30_days', 'lowest_monthly_earnings', 'highest_monthly_earnings', 'lowest_yearly_earnings', 'highest_yearly_earnings', 'subscribers_for_last_30_days', 'created_year', 'created_month', 'created_date', 'gross_tertiary_education_enrollment_(%)', 'population', 'unemployment_rate', 'urban_population', 'latitude', 'longitude']


In [93]:
df = df.drop(columns=["latitude","urban_population","unemployment_rate","population","gross_tertiary_education_enrollment_(%)","created_date","created_month","created_year","subscribers_for_last_30_days","highest_yearly_earnings","lowest_yearly_earnings","highest_monthly_earnings","lowest_monthly_earnings","video_views_for_the_last_30_days","channel_type_rank","country_rank"])

# Simpan kembali ke CSV tanpa index
df.to_csv("data_baru.csv", index=False)

In [94]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rank              995 non-null    int64  
 1   youtuber          989 non-null    object 
 2   subscribers       995 non-null    int64  
 3   video_views       995 non-null    int64  
 4   category          995 non-null    object 
 5   title             990 non-null    object 
 6   uploads           995 non-null    int64  
 7   country           995 non-null    object 
 8   abbreviation      995 non-null    object 
 9   channel_type      995 non-null    object 
 10  video_views_rank  995 non-null    int64  
 11  longitude         995 non-null    float64
dtypes: float64(1), int64(5), object(6)
memory usage: 93.4+ KB
None


4. Menampilkan struktur variabel dataset






In [95]:
column_types = df.dtypes
print(column_types)

rank                  int64
youtuber             object
subscribers           int64
video_views           int64
category             object
title                object
uploads               int64
country              object
abbreviation         object
channel_type         object
video_views_rank      int64
longitude           float64
dtype: object


5. Data Cleansing Column Youtuber

Menyamaratakan format yang ada, jika ada karakter atau font yang tidak sesuai maka akan diubah, dan jika ada spasi berlebihan

In [96]:
import pandas as pd
import re
import unicodedata

# Baca file CSV
file_path = "/content/drive/MyDrive/Colab Notebooks/Global_YouTube_Statistics_Messy.csv"
df = pd.read_csv(file_path)

def clean_name(name):
    if pd.isnull(name):
        return ""

    # Normalisasi karakter (font aneh → standar)
    name = unicodedata.normalize("NFKD", str(name))

    # Rapikan spasi
    name = re.sub(r'\s+', ' ', name).strip()

    return name

# Bersihkan kolom youtuber
df['Nama Youtuber Bersih'] = df['youtuber'].apply(clean_name)

# Hapus baris yang NaN atau kosong ("")
df = df.dropna(subset=['youtuber', 'Nama Youtuber Bersih'])
df = df[df['Nama Youtuber Bersih'] != ""]

# Urutkan berdasarkan nama bersih (A-Z)
df_sorted = df.sort_values(by='Nama Youtuber Bersih', ascending=True).reset_index(drop=True)

# Lihat hasil
print(df_sorted[['youtuber', 'Nama Youtuber Bersih']].head(20))


                         youtuber           Nama Youtuber Bersih
0                            #Ref                           #Ref
1                 #Refugio Mental                #Refugio Mental
2                      - Al-Remas                     - Al-Remas
3         - Genevieve's Playhouse        - Genevieve's Playhouse
4                           /Atro                          /Atro
5                 123 GO! Spanish                123 GO! Spanish
6           1MILLION Dance Studio          1MILLION Dance Studio
7                         1theK (                        1theK (
8                               5                              5
9                 5-Minute Crafts                5-Minute Crafts
10            5-Minute Crafts DIY            5-Minute Crafts DIY
11         5-Minute Crafts FAMILY         5-Minute Crafts FAMILY
12           5-Minute Crafts PLAY           5-Minute Crafts PLAY
13        5-Minute Crafts Recycle        5-Minute Crafts Recycle
14                       

6. Data Cleansing Column Subscriber

Memberikan format ribu pada subsribers yang diatas ratusan, dengan memberi tanda koma pada kelipatan seribu

In [97]:
# Pastikan kolom subscriber berupa angka
df['subscribers'] = pd.to_numeric(df['subscribers'], errors='coerce')

# Format pakai koma pemisah ribuan
df['subscribers_formatted'] = df['subscribers'].apply(
    lambda x: "{:,}".format(int(x)) if pd.notnull(x) else ""
)

# Urutkan berdasarkan subscribers (descending: terbesar -> terkecil)
df_sorted = df.sort_values(by='subscribers', ascending=False).reset_index(drop=True)

# Cek hasil
print(df_sorted[['subscribers', 'subscribers_formatted']].head(20))


    subscribers subscribers_formatted
0     245000000           245,000,000
1     170000000           170,000,000
2     166000000           166,000,000
3     162000000           162,000,000
4     159000000           159,000,000
5     119000000           119,000,000
6     112000000           112,000,000
7     111000000           111,000,000
8     106000000           106,000,000
9      98900000            98,900,000
10     96700000            96,700,000
11     96000000            96,000,000
12     93600000            93,600,000
13     89800000            89,800,000
14     86900000            86,900,000
15     83000000            83,000,000
16     80100000            80,100,000
17     75600000            75,600,000
18     75000000            75,000,000
19     71600000            71,600,000


7. Data Cleansing Column Video Views

Memberikan format ribu pada subsribers yang diatas ratusan, dengan memberi tanda koma pada kelipatan seribu

In [98]:
# Pastikan kolom video_views berupa angka
df['video_views'] = pd.to_numeric(df['video_views'], errors='coerce')

# Buang baris dengan nilai 0 atau NaN
df = df[df['video_views'] > 0]

# Format pakai koma pemisah ribuan
df['video_views_formatted'] = df['video_views'].apply(
    lambda x: "{:,}".format(int(x)) if pd.notnull(x) else ""
)

# Urutkan berdasarkan video_views (descending: terbesar -> terkecil)
df_sorted = df.sort_values(by='video_views', ascending=False).reset_index(drop=True)

# Cek hasil
print(df_sorted[['video_views', 'video_views_formatted']].head(20))


     video_views video_views_formatted
0   228000000000       228,000,000,000
1   164000000000       164,000,000,000
2   148000000000       148,000,000,000
3   101000000000       101,000,000,000
4    93247040539        93,247,040,539
5    90479060027        90,479,060,027
6    77428473662        77,428,473,662
7    77180169894        77,180,169,894
8    73139054467        73,139,054,467
9    61510906457        61,510,906,457
10   59316472754        59,316,472,754
11   57856289381        57,856,289,381
12   57271630846        57,271,630,846
13   56106087508        56,106,087,508
14   55299840198        55,299,840,198
15   50292540392        50,292,540,392
16   47005053156        47,005,053,156
17   45757850229        45,757,850,229
18   44900897958        44,900,897,958
19   41139050371        41,139,050,371


8. Data Cleansing Column Category

Memperbaiki dan membersihkan jika ada karakter yang tidak kapital, jika ada spasi berlebih

In [99]:
# Simpan kolom asli
df['category_raw'] = df['category']

# Rapikan category: hapus spasi berlebih & ubah ke Title Case
df['category_clean'] = (
    df['category_raw']
    .astype(str)                                 # pastikan string
    .str.replace(r'\s+', ' ', regex=True)        # rapikan spasi
    .str.strip()                                 # buang spasi depan/belakang
    .str.title()                                 # kapital di awal tiap kata
)

# Hapus kategori 'Unknown' atau kosong
df = df[~df['category_clean'].str.lower().isin(['unknown', ''])]

# Urutkan berdasarkan category_clean (A-Z)
df_sorted = df.sort_values(by='category_clean', ascending=True).reset_index(drop=True)

# Lihat perbandingan hasil
print(df_sorted[['category_raw', 'category_clean']].head(20))


        category_raw    category_clean
0   Autos & Vehicles  Autos & Vehicles
1   Autos & Vehicles  Autos & Vehicles
2             Comedy            Comedy
3             Comedy            Comedy
4             Comedy            Comedy
5             Comedy            Comedy
6             Comedy            Comedy
7             Comedy            Comedy
8             Comedy            Comedy
9             Comedy            Comedy
10            Comedy            Comedy
11            Comedy            Comedy
12            Comedy            Comedy
13            Comedy            Comedy
14            Comedy            Comedy
15            Comedy            Comedy
16            Comedy            Comedy
17            Comedy            Comedy
18            Comedy            Comedy
19            Comedy            Comedy


9. Data Cleansing Column Title

Memperbaiki dan membersihkan jika ada karakter yang tidak kapital, jika ada spasi berlebih

In [100]:
# Simpan kolom asli
df['title_raw'] = df['title']

# Rapikan title: hapus spasi berlebih & ubah ke Title Case
df['title_clean'] = (
    df['title_raw']
    .astype(str)                                 # pastikan string
    .str.replace(r'\s+', ' ', regex=True)        # rapikan spasi
    .str.strip()                                 # buang spasi depan/belakang
    .str.title()                                 # kapital di awal tiap kata
)

# Hapus title 'Unknown' atau kosong
df = df[~df['title_clean'].str.lower().isin(['unknown', ''])]

# Urutkan berdasarkan title_clean (A-Z)
df_sorted = df.sort_values(by='title_clean', ascending=True).reset_index(drop=True)

# Lihat perbandingan hasil
print(df_sorted[['title_raw', 'title_clean']].head(20))


                   title_raw              title_clean
0            #Refugio Mental          #Refugio Mental
1            #Refugio Mental          #Refugio Mental
2                 - Al-Remas               - Al-Remas
3    - Genevieve's Playhouse  - Genevieve'S Playhouse
4                      /Atro                    /Atro
5            123 GO! Spanish          123 Go! Spanish
6      1MILLION Dance Studio    1Million Dance Studio
7                    1theK (                  1Thek (
8                  21 Savage                21 Savage
9                         24                       24
10                         5                        5
11       5-Minute Crafts 2.0      5-Minute Crafts 2.0
12       5-Minute Crafts DIY      5-Minute Crafts Diy
13    5-Minute Crafts FAMILY   5-Minute Crafts Family
14      5-Minute Crafts PLAY     5-Minute Crafts Play
15   5-Minute Crafts Recycle  5-Minute Crafts Recycle
16                   50 Cent                  50 Cent
17                 7 Minutoz

10. Data Cleansing Column Upload

Memberikan format ribu pada subsribers yang diatas ratusan, dengan memberi tanda koma pada kelipatan seribu

In [101]:
# Pastikan kolom uploads berupa angka
df['uploads'] = pd.to_numeric(df['uploads'], errors='coerce')

# Hapus baris dengan uploads 0 atau NaN
df = df[df['uploads'] > 0]

# Format pakai koma pemisah ribuan
df['uploads_formatted'] = df['uploads'].apply(lambda x: "{:,}".format(int(x)) if pd.notnull(x) else "")

# Urutkan berdasarkan uploads (descending)
df_sorted = df.sort_values(by='uploads', ascending=False).reset_index(drop=True)

# Cek hasil
print(df[['uploads', 'uploads_formatted']].head(20))



    uploads uploads_formatted
0     20082            20,082
2       741               741
3       966               966
4    116536           116,536
6      1111             1,111
7      4716             4,716
8       493               493
9       574               574
10     8548             8,548
11    70127            70,127
13      543               543
14        1                 1
15    71270            71,270
16        1                 1
17     2281             2,281
19      249               249
20     1337             1,337
21   129204           129,204
22     2865             2,865
23     2572             2,572


11. Data Cleansing Column Country

Memperbaiki dan membersihkan jika ada karakter yang tidak kapital, jika ada spasi berlebih

In [102]:
# Simpan kolom asli
df['country_raw'] = df['country']

# Rapikan country: hapus spasi berlebih & ubah ke Title Case
df['country_clean'] = (
    df['country_raw']
    .astype(str)                                 # pastikan string
    .str.replace(r'\s+', ' ', regex=True)        # rapikan spasi
    .str.strip()                                 # buang spasi depan/belakang
    .str.title()                                 # kapital di awal tiap kata
)

# Hapus country 'Unknown' atau kosong
df = df[~df['country_clean'].str.lower().isin(['unknown', ''])]

# Urutkan berdasarkan country_clean (A-Z)
df_sorted = df.sort_values(by='country_clean', ascending=True).reset_index(drop=True)

# Lihat perbandingan hasil
print(df_sorted[['country_raw', 'country_clean']].head(20))


   country_raw country_clean
0      Andorra       Andorra
1    Argentina     Argentina
2    Argentina     Argentina
3    Argentina     Argentina
4    Argentina     Argentina
5    Argentina     Argentina
6    Argentina     Argentina
7    Argentina     Argentina
8    Argentina     Argentina
9    Argentina     Argentina
10   Argentina     Argentina
11   Argentina     Argentina
12   Argentina     Argentina
13   Argentina     Argentina
14   Australia     Australia
15   Australia     Australia
16   Australia     Australia
17   Australia     Australia
18   Australia     Australia
19   Australia     Australia


12. Data Cleansing Column Channel Type

Memperbaiki dan membersihkan jika ada karakter yang tidak kapital, jika ada spasi berlebih

In [103]:
# Simpan kolom asli
df['channel_type_raw'] = df['channel_type']

# Rapikan channel_type: hapus spasi berlebih & ubah ke Title Case
df['channel_type_clean'] = (
    df['channel_type_raw']
    .astype(str)                                 # pastikan string
    .str.replace(r'\s+', ' ', regex=True)        # rapikan spasi
    .str.strip()                                 # buang spasi depan/belakang
    .str.title()                                 # kapital di awal tiap kata
)

# Hapus channel_type 'Unknown' atau kosong
df = df[~df['channel_type_clean'].str.lower().isin(['unknown', ''])]

# Urutkan berdasarkan channel_type_clean (A-Z)
df_sorted = df.sort_values(by='channel_type_clean', ascending=True).reset_index(drop=True)

# Lihat perbandingan hasil
print(df_sorted[['channel_type_raw', 'channel_type_clean']].head(20))


   channel_type_raw channel_type_clean
0           Animals            Animals
1           Animals            Animals
2           Animals            Animals
3             Autos              Autos
4             Autos              Autos
5            Comedy             Comedy
6            Comedy             Comedy
7            Comedy             Comedy
8            Comedy             Comedy
9            Comedy             Comedy
10           Comedy             Comedy
11           Comedy             Comedy
12           Comedy             Comedy
13           Comedy             Comedy
14           Comedy             Comedy
15           Comedy             Comedy
16           Comedy             Comedy
17           Comedy             Comedy
18           Comedy             Comedy
19           Comedy             Comedy


13. Data Cleansing Column Video Views Rank

In [104]:
# Pastikan kolom subscriber berupa angka
df['video_views_rank'] = pd.to_numeric(df['video_views_rank'], errors='coerce')

# Format pakai koma pemisah ribuan
df['video_views_rank_formatted'] = df['video_views_rank'].apply(lambda x: "{:,}".format(int(x)) if pd.notnull(x) else "")
# Urutkan berdasarkan video_views (descending: terbesar -> terkecil)
df_sorted = df.sort_values(by='video_views_rank', ascending=False).reset_index(drop=True)
# Cek hasil
print(df[['video_views_rank', 'video_views_rank_formatted']].head(20))

    video_views_rank video_views_rank_formatted
0                  1                          1
2                 48                         48
3                  2                          2
4                  3                          3
6                  5                          5
7                 44                         44
8                630                        630
9                  8                          8
10                12                         12
11                 7                          7
13                32                         32
15                 4                          4
16           4057901                  4,057,901
17               112                        112
19                38                         38
20                46                         46
21                 9                          9
22                23                         23
23                25                         25
24                18                    

14. Duplikasi Data

Menghapus jika terdapat data atau row yang sama

In [105]:
import pandas as pd
from IPython.display import display

# Pilih hanya kolom hasil cleansing
df_final = df_clean[[
    'rank',
    'youtuber',
    'subscribers_formatted',   # hasil cleansing subscribers
    'video_views_formatted',   # hasil cleansing views
    'category_clean',
    'title_clean',
    'uploads_formatted',
    'country_clean',
    'channel_type_clean'
]]

# Lihat hasilnya
print("Jumlah data setelah cleansing:", len(df_final))
display(df_final.head(20))


Jumlah data setelah cleansing: 812


Unnamed: 0,rank,youtuber,subscribers_formatted,video_views_formatted,category_clean,title_clean,uploads_formatted,country_clean,channel_type_clean
0,1,T-Series,245000000,228000000000,Music,T-Series,20082,India,Music
2,3,MrBeast,166000000,28368841870,Entertainment,Mrbeast,741,United States,Entertainment
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000,Education,Cocomelon - Nursery Rhymes,966,United States,Education
4,5,SET India,159000000,148000000000,Shows,Set India,116536,India,Entertainment
6,7,Kids Diana Show,112000000,93247040539,People & Blogs,Kids Diana Show,1111,United States,Entertainment
7,8,PewDiePie,111000000,29058044447,Gaming,Pewdiepie,4716,Japan,Entertainment
8,9,Like Nastya,106000000,90479060027,People & Blogs,Like Nastya Vlog,493,Russia,People
9,10,Vlad and Niki,98900000,77180169894,Entertainment,Vlad And Niki,574,United States,Entertainment
10,11,Zee Music Company,96700000,57856289381,Music,Zee Music Company,8548,India,Music
11,12,WWE,96000000,77428473662,Sports,Wwe,70127,United States,Sports


In [106]:
# Simpan ke CSV
df_final.to_csv("youtubers_clean.csv", index=False, encoding="utf-8-sig")
print("File CSV berhasil dibuat: youtubers_clean.csv")


File CSV berhasil dibuat: youtubers_clean.csv
