In [13]:
import pandas as pd
import re

In [14]:
# Load the raw dataset into `df`. The CSV is read through a relative path, so
# master_task1_datset.csv must already be in the working directory (e.g.
# uploaded to the Colab session) before this cell runs.
# NOTE: 'datset' is the spelling used in the path literal; it has to match the
# file name on disk exactly.
file_path = 'master_task1_datset.csv'
df = pd.read_csv(file_path)


In [15]:
# --- 1. Text Cleaning Function (for Title, Description, and Uniqueness Check) ---
# Patterns are compiled once when the cell runs instead of on every call.
_HTML_TAG_PATTERN = re.compile(r'<[^>]+>')

# NOTE: the last range (U+24C2..U+1F251) is very broad and also spans many
# non-emoji characters (CJK, for instance). That is harmless here because the
# later "special characters" step discards every non-ASCII character anyway.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F" "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF" "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FAFF" "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)


def clean_text(text):
    """Normalize free text to lowercase ASCII words separated by single spaces.

    Steps, in order: strip HTML tags, strip emoji, replace every character
    that is not an ASCII letter, digit or whitespace with a space, lowercase,
    and collapse runs of whitespace.

    Tags and emoji are replaced with a space rather than deleted so that the
    words on either side stay separate tokens: ``"Hello<br>World"`` becomes
    ``"hello world"``, not ``"helloworld"``.

    Parameters
    ----------
    text : Any
        Value to clean. Missing values (``None``, ``NaN``, ``pd.NA``) yield
        ``''``; any other value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing or contains no
        ASCII letters or digits.
    """
    # pd.isna(None) is already True, so no separate None check is needed.
    if pd.isna(text):
        return ''
    text = str(text)

    # 1. Remove HTML tags (<br>, <i>) - leave a space so neighbours don't fuse
    text = _HTML_TAG_PATTERN.sub(' ', text)

    # 2. Remove Emojis - same reasoning ("Python<emoji>Tutorial" stays two words)
    text = _EMOJI_PATTERN.sub(' ', text)

    # 3. Remove Special characters/Punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # 4. Convert to lowercase
    text = text.lower()

    # 5. Collapse whitespace runs (including the separators added above)
    return re.sub(r'\s+', ' ', text).strip()

In [16]:
# Derive a cleaned companion column for each free-text column; the raw
# 'title' and 'description' columns are left untouched.
for raw_col, cleaned_col in (('title', 'cleaned_title'),
                             ('description', 'cleaned_description')):
    df[cleaned_col] = df[raw_col].apply(clean_text)


In [17]:
# --- 2. Convert ISO 8601 Duration to Total Seconds (using regex) ---
def iso8601_to_seconds(duration_str):
    """Convert an ISO 8601 duration string such as 'PT1M17S' to whole seconds.

    Supports an optional day component followed by optional hour, minute and
    second components ('P#DT#H#M#S'). Durations of a day or more carry the
    day part before the 'T' (e.g. 'P1DT2H3M4S'); without the day group such
    values would match only the leading 'P' and be reported as 0 seconds.

    The 'T' separator is treated as optional, so any string that parsed
    before this day-aware pattern was introduced yields the same value.

    Parameters
    ----------
    duration_str : Any
        Duration text. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, ...) is treated as missing.

    Returns
    -------
    int
        Total number of seconds, or 0 when the value is missing or does not
        start with a recognisable duration.
    """
    # NaN/None are not str instances, so a single type check covers both the
    # missing-value case and unexpected types.
    if not isinstance(duration_str, str):
        return 0

    # Groups: days, hours, minutes, seconds - each is None when absent.
    regex = r'P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?'
    match = re.match(regex, duration_str)

    if not match:
        return 0

    days, hours, minutes, seconds = (int(g) if g else 0 for g in match.groups())

    return (days * 86400) + (hours * 3600) + (minutes * 60) + seconds

In [18]:
# Add the duration as an integer number of seconds; values that are missing
# or cannot be parsed come back from iso8601_to_seconds as 0.
df['duration_seconds'] = df['duration'].apply(iso8601_to_seconds)


In [19]:
# --- 3. Convert Count Columns to Numeric Integers ---
# Column names follow the camelCase headers of master_task1_datset.csv.
count_cols = ['viewCount', 'likeCount', 'commentCount', 'channel_subscriberCount', 'channel_videoCount']

# Per column: coerce unparseable entries to NaN, replace NaN with 0, then cast
# to int. Missing counts therefore become indistinguishable from true zeros.
for col in count_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

In [20]:
# --- 4. Process Tags Column ---
# Missing tags become '', everything is stringified, lowercased and trimmed
# of leading/trailing whitespace. The original 'tags' column is kept.
df['tags_processed'] = (
    df['tags']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.strip()
)


In [21]:
# --- 5. Uniqueness Check (cleaned titles shared by more than one video id) ---
# Count the distinct 'id' values behind each cleaned title ...
ids_per_title = df.groupby('cleaned_title')['id'].nunique()
title_uniqueness_check = ids_per_title.reset_index(name='unique_video_id_count')

# ... and keep only the titles that map to two or more ids.
shared_title_mask = title_uniqueness_check['unique_video_id_count'] > 1
duplicates_by_title = title_uniqueness_check[shared_title_mask]

In [22]:
# --- Display Results ---
# Print the dtypes of the converted count columns and the derived duration.
converted_cols = count_cols + ['duration_seconds']
print("--- Data Types After Cleaning and Conversion ---")
print(df[converted_cols].dtypes)

print("\n--- Head of Transformed Data ---")

--- Data Types After Cleaning and Conversion ---
viewCount                  int64
likeCount                  int64
commentCount               int64
channel_subscriberCount    int64
channel_videoCount         int64
duration_seconds           int64
dtype: object

--- Head of Transformed Data ---


In [23]:
# Show raw columns next to their derived counterparts for the first rows
preview_cols = ['title', 'cleaned_title', 'duration', 'duration_seconds', 'viewCount', 'likeCount']
print(df[preview_cols].head())

# Then list the cleaned titles that belong to more than one video id
print("\n--- Titles with Multiple Unique Video IDs (Duplicates Check) ---")
print(duplicates_by_title)

                                               title  \
0  Data Analyst Jobs are COOKED ð¤¯ This is how ...   
1   H1B is $100K now: How will Big Tech Survive ð¤¯   
2                    Data Engineer vs Data Scientist   
3  Is Big Tech a trap? Why she quit her $200K Sof...   
4                   5 Data Analyst Projects You NEED   

                                       cleaned_title  duration  \
0  data analyst jobs are cooked this is how you s...   PT1M17S   
1          h1b is 100k now how will big tech survive   PT1M34S   
2                    data engineer vs data scientist     PT57S   
3  is big tech a trap why she quit her 200k softw...  PT22M21S   
4                   5 data analyst projects you need     PT58S   

   duration_seconds  viewCount  likeCount  
0                77       7567        402  
1                94       8017        196  
2                57       2392        130  
3              1341       2385         69  
4                58      11224        625  

-

In [24]:
# Save the final cleaned DataFrame (original columns plus the derived ones)
# to the working directory. index=False keeps the row index out of the CSV.
df.to_csv('master_task1_fully_cleaned.csv', index=False)