# SVOD Data Analysis

### ***Dataset Cleaning***

---

**Paths & Libraries**

In [3]:
from pathlib import Path

import numpy as np
import pandas as pd

# Anchor every project path to the directory the notebook is run from.
BASE_DIR = Path.cwd().resolve()

# Raw inputs and processed outputs live in sibling folders under data/.
DATA_RAW_DIR = BASE_DIR.joinpath("data", "raw")
DATA_PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

# Source workbook read by the loading step.
RAW_FILE = DATA_RAW_DIR.joinpath("Dataxis_Test Data Analyst 2025.xlsx")

## **ETL Process**

---

**Dataset Loading**

---

In [4]:
# Read the "Data" worksheet of the raw workbook into a DataFrame.
df_raw = pd.read_excel(io=RAW_FILE, sheet_name="Data")

# Preview the first ten rows.
df_raw.head(n=10)

Unnamed: 0,Actor_label,Country_label,Kpi_label_corporate,Fact_date,Kpi_value
0,Viaplay,USA,SVOD subscribers,2021-12-31,6000
1,Viaplay,USA,SVOD subscribers,2022-03-31,11000
2,Viaplay,USA,SVOD subscribers,2022-06-30,21000
3,Viaplay,USA,SVOD subscribers,2022-09-30,40000
4,Viaplay,USA,SVOD subscribers,2022-12-31,66000
5,Hulu,USA,SVOD subscribers,2021-03-31,41600000
6,Hulu,USA,SVOD subscribers,2021-06-30,42800000
7,Hulu,USA,SVOD subscribers,2021-09-30,43700000
8,Hulu,USA,SVOD subscribers,2021-12-31,45200000
9,Hulu,USA,SVOD subscribers,2022-03-31,45500000


---

### Exploratory Data Analysis

---

In [5]:
# Structural overview of the raw frame: dimensions, dtypes, missing counts.
print("Shape (filas, columnas):", df_raw.shape)
print("\nData Types:", df_raw.dtypes, sep="\n")
print("\nMissing values per column:", df_raw.isna().sum(), sep="\n")

# Show up to ten distinct values per column to get a feel for the content.
print("\nUnique values per column:")
for column_name, column_values in df_raw.items():
    print(f"\nColumn: {column_name}", column_values.unique()[:10], sep="\n")


Shape (filas, columnas): (951, 5)

Data Types:
Actor_label                    object
Country_label                  object
Kpi_label_corporate            object
Fact_date              datetime64[ns]
Kpi_value                       int64
dtype: object

Missing values per column:
Actor_label            0
Country_label          0
Kpi_label_corporate    0
Fact_date              0
Kpi_value              0
dtype: int64

Unique values per column:

Column: Actor_label
['Viaplay' 'Hulu' 'iQiYi' 'Fandor' 'Sling TV' 'Showtime Streaming'
 'Boomerang' 'Univision Now' 'i-ON TV' 'Shudder']

Column: Country_label
['USA']

Column: Kpi_label_corporate
['SVOD subscribers']

Column: Fact_date
<DatetimeArray>
['2021-12-31 00:00:00', '2022-03-31 00:00:00', '2022-06-30 00:00:00',
 '2022-09-30 00:00:00', '2022-12-31 00:00:00', '2021-03-31 00:00:00',
 '2021-06-30 00:00:00', '2021-09-30 00:00:00']
Length: 8, dtype: datetime64[ns]

Column: Kpi_value
[    6000    11000    21000    40000    66000 41600000 42800000

---

**Column name normalization**

---

In [6]:
# Work on a copy so the raw frame stays available for comparison.
df = df_raw.copy()

# Snake-case every header: trim whitespace, lowercase, spaces -> underscores.
df.columns = [
    label.strip().lower().replace(" ", "_")
    for label in df.columns
]

df.columns

Index(['actor_label', 'country_label', 'kpi_label_corporate', 'fact_date',
       'kpi_value'],
      dtype='object')

---

**Type Conversion and Sorting**

---

In [7]:
# Enforce column types and ordering in a single chained expression.
df = (
    df.assign(
        # Parse "fact_date" as datetime; unparseable entries become NaT.
        fact_date=pd.to_datetime(df["fact_date"], errors="coerce"),
        # Force "kpi_value" to numeric; non-numeric entries become NaN.
        kpi_value=pd.to_numeric(df["kpi_value"], errors="coerce"),
    )
    # Order chronologically within each actor and rebuild a clean index.
    .sort_values(by=["actor_label", "fact_date"])
    .reset_index(drop=True)
)

df.dtypes

actor_label                    object
country_label                  object
kpi_label_corporate            object
fact_date              datetime64[ns]
kpi_value                       int64
dtype: object

---

**Duplicate and Null Handling**

---

In [8]:
# Flag rows that exactly repeat an earlier row (the first occurrence is kept).
duplicate_rows = df.duplicated()
duplicate_count = duplicate_rows.sum()
print("Number of duplicate rows:", duplicate_count)

# Only rebuild the frame when at least one row was flagged.
if duplicate_rows.any():
    df = df.loc[~duplicate_rows].reset_index(drop=True)

# Null check after type conversion: coerced values would show up here.
print("\nNulls after transformation:", df.isna().sum(), sep="\n")

Number of duplicate rows: 0

Nulls after transformation:
actor_label            0
country_label          0
kpi_label_corporate    0
fact_date              0
kpi_value              0
dtype: int64


---

**Save clean dataset**

---

In [9]:
# Create the output folder on first run: DataFrame.to_csv does not create
# missing parent directories, so a fresh checkout without data/processed
# would otherwise fail here.
DATA_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Persist the cleaned frame; index=False omits the positional index column.
clean_file = DATA_PROCESSED_DIR / "clean_data.csv"
df.to_csv(clean_file, index=False)

# Display the destination path as the cell output.
clean_file

WindowsPath('D:/GitHub/DataProjects/Dataxis/data/processed/clean_data.csv')