## DATA CLEANING

In [1]:
import pandas as pd

In [4]:
df = pd.read_csv('datasets/wonderfulyakubu3599.csv', parse_dates=['date_released_utc'])

In [5]:
df.head()

Unnamed: 0,video_title,video_duration,date_released_utc,view_count,like_count,comment_count,date_collected_utc
0,THIS EARLY 20s DEY TRY WHINE ME BUT I NO GO PA...,PT22M16S,2024-07-07 11:26:12+00:00,331,21,11,2024-08-14 08:35:30.644954+00:00
1,THEY DID'NT EVEN MAKE THIS MUCH FOOD AT THE LA...,PT30M58S,2024-06-26 23:26:58+00:00,482,18,15,2024-08-14 08:35:31.133887+00:00
2,WENT ON A TRIP AND FELL IN LOVE🤭,PT52M25S,2024-06-18 17:00:03+00:00,100,21,7,2024-08-14 08:35:31.655587+00:00
3,MY BEST FRIEND THINKS I'M AUTHORITATIVE💔,PT52M3S,2024-06-07 09:00:21+00:00,64,16,1,2024-08-14 08:35:32.107558+00:00
4,JOLLOF RICE RECIPE,PT4M5S,2024-05-27 15:54:17+00:00,629,34,8,2024-08-14 08:35:32.540728+00:00


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   video_title     28 non-null     object             
 1   video_duration  28 non-null     object             
 2   date_released   28 non-null     datetime64[ns, UTC]
 3   view_count      28 non-null     int64              
 4   like_count      28 non-null     int64              
 5   comment_count   28 non-null     int64              
dtypes: datetime64[ns, UTC](1), int64(3), object(2)
memory usage: 1.4+ KB


In [6]:
for column in df.columns:
    print(f'{"-"*20}{column}{"-"*20}')
    print(df[column].unique())
    print(f'{"-"*20}{"-"*20}')

--------------------video_title--------------------
['THIS EARLY 20s DEY TRY WHINE ME BUT I NO GO PANIC// LIFE IN MY 20s (Ep.9)'
 "THEY DID'NT EVEN MAKE THIS MUCH FOOD AT THE LAST SUPPER🤭 Amah Chill!"
 'WENT ON A TRIP AND FELL IN LOVE🤭'
 "MY BEST FRIEND THINKS I'M AUTHORITATIVE💔" 'JOLLOF RICE RECIPE'
 'THIS LOVE MAKES ME A HAPPY MU**🤭// UNBOXING MY BIRTHDAY GIFTS//LIFE IN MY 20s (EP.8)'
 "BirthDay VLOG// Prepping// LIFE IN MY 20's (Ep. 7)"
 "I ASKED SOME PEOPLE WHO JESUS CHRIST IS TO THEM AND HERE'S WHAT THEY HAD TO SAY🥺"
 "THE DEVIL TRIED BUT JESUS IS THE GREATEST😌// LIFE IN MY 20's (Ep. 6)"
 'The Perfect Base For You Stew and Rice Dishes'
 'LIFE IN MY 20s (Ep. 5)// My life as an amazing aunty to my beautiful babies'
 'Detailed Nigerian Zobo Recipe// Healthy But Delicious Nigerian Drink'
 "LIFE IN MY 20s// It definitely isn't a weekly vlog if it isn't up to a week 😌"
 'Daily Morning Skincare Routine// LIFE IN MY 20s (Ep. 3)🤭'
 'A Day (or two) In My Life// LIFE IN MY 20S✨ (Ep. 2)'
 'Da

There are 28 rows in the dataframe with 6 columns:
1. **video_title**: This is the title of the youtube video. There are no missing values.
2. **video_duration**: This is the length of the **video_title**. 
It is in the format 'PT(Time in minutes)M(Time in seconds)S'. Minutes and seconds will be extracted and placed in their different columns. There are no missing values. 
3. **date_released_utc**: This is the date the video was released on youtube in the UTC timezone. There are no missing values.
4. **view_count**: This is the number of views the **video_title** has. There are no missing values.
5. **like_count**: This is the number of likes the **video_title** has. There are no missing values.
6. **comment_count**:  This is the number of comments the **video_title** has. There are no missing values.
7. **date_collected_utc**: This is the date the video was extracted from the website in UTC timezone. There are no missing values.

In [9]:
def get_digits(string):
    char_list = [char for char in string if char.isdigit()]
    return ''.join(char_list)

In [10]:
func = lambda duration: duration.split('H')[0]
print(get_digits(func('PT1H30M58S')))

1


In [17]:
df['video_duration_H'] = df['video_duration'].apply(
        lambda duration: duration.split('H')[0] if (duration.find('H') > 0)  else '0' 
        ).apply(get_digits)

In [13]:
df['video_duration_M'] = df['video_duration'].apply(
        lambda duration: duration.split('M')[0] if (duration.find('M') > 0)  else '0'
        ).apply(get_digits)

In [14]:
df['video_duration_S'] = df['video_duration'].apply(
        lambda duration: duration.split('M')[1]
        ).apply(get_digits)

In [18]:
for column in ['video_duration_H', 'video_duration_M', 'video_duration_S']:
    print(f'{"-"*20}{column}{"-"*20}')
    print(df[column].unique())
    print(f'{"-"*20}{"-"*20}')

--------------------video_duration_H--------------------
['0']
----------------------------------------
--------------------video_duration_M--------------------
['22' '30' '52' '4' '27' '34' '9' '36' '11' '41' '49' '14' '12' '20' '33'
 '28' '24' '46' '23' '18' '32' '13']
----------------------------------------
--------------------video_duration_S--------------------
['16' '58' '25' '3' '5' '45' '28' '2' '26' '23' '21' '' '44' '6' '9' '10'
 '43' '7' '52' '8' '35']
----------------------------------------


An empty string can be seen in the **video_duration_s** column. That value will be changed to zero. This shall also be applied to the **video_duration_m**. 

In [11]:
# fg
df = df.replace({'video_duration_H': '', 
                 'video_duration_M': '', 
                 'video_duration_S': ''}, '0')

In [12]:
# Convert the string columns to int columns
df = df.astype({'video_duration_m': int, 'video_duration_s': int})


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   video_title       28 non-null     object             
 1   video_duration    28 non-null     object             
 2   date_released     28 non-null     datetime64[ns, UTC]
 3   view_count        28 non-null     int64              
 4   like_count        28 non-null     int64              
 5   comment_count     28 non-null     int64              
 6   video_duration_m  28 non-null     int32              
 7   video_duration_s  28 non-null     int32              
dtypes: datetime64[ns, UTC](1), int32(2), int64(3), object(2)
memory usage: 1.7+ KB


In [14]:
# Removed the video_duration column
df = df.drop(columns=['video_duration'])

In [19]:
print(help(df.to_csv))

Help on method to_csv in module pandas.core.generic:

to_csv(path_or_buf: 'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None' = None, sep: 'str' = ',', na_rep: 'str' = '', float_format: 'str | Callable | None' = None, columns: 'Sequence[Hashable] | None' = None, header: 'bool_t | list[str]' = True, index: 'bool_t' = True, index_label: 'IndexLabel | None' = None, mode: 'str' = 'w', encoding: 'str | None' = None, compression: 'CompressionOptions' = 'infer', quoting: 'int | None' = None, quotechar: 'str' = '"', lineterminator: 'str | None' = None, chunksize: 'int | None' = None, date_format: 'str | None' = None, doublequote: 'bool_t' = True, escapechar: 'str | None' = None, decimal: 'str' = '.', errors: 'str' = 'strict', storage_options: 'StorageOptions' = None) -> 'str | None' method of pandas.core.frame.DataFrame instance
    Write object to a comma-separated values (csv) file.
    
    Parameters
    ----------
    path_or_buf : str, path object, file-like object, or None, defaul

In [16]:
df.to_csv('wonderfulyakubu3599_cleaned.csv', index=True)
df.read_csv('')
# This file stores the data, as well as the data type
# df.to_parquet('wonderfulyakubu3599_cleaned.gzip')

In [None]:
def get_digits(string):
    char_list = [char for char in string if char.isdigit()]
    return ''.join(char_list)

def data_cleaning(csv_file_name):
    filename = csv_file_name.replace('.csv', '')
    df = pd.read_csv(csv_file_name)
    df['video_duration_m'] = df['video_duration'].apply(
        lambda duration: duration.split('M')[0]
        ).apply(get_digits)
    df['video_duration_s'] = df['video_duration'].apply(
        lambda duration: duration.split('M')[1]
        ).apply(get_digits)
    df['video_duration_m'] = df['video_duration_m'].replace('', '0')
    df['video_duration_s'] = df['video_duration_s'].replace('', '0')
    df = df.astype({'video_duration_m': int, 'video_duration_s': int})
    df = df.drop(columns=['video_duration'])
    df.to_csv(f'datasets/{filename}_cleaned.csv')
    # This file stores the data, as well as the data type
    df.to_parquet(f'datasets/{filename}_cleaned.gzip')

    

    
    
    