In [1]:
import pandas as pd
import os

In [2]:
os.chdir('..')

In [3]:
base_folder = 'data'

In [9]:
[f.path for f in os.scandir(base_folder) if f.is_dir()]

['data/Cities',
 'data/Subscription source',
 'data/Operating system',
 'data/New and returning viewers',
 'data/Viewer age',
 'data/Geography',
 'data/Subtitles and CC',
 'data/Subscription status',
 'data/Content type',
 'data/Viewer gender',
 'data/Viewership by Date',
 'data/Device type',
 'data/Sharing service',
 'data/Traffic source']

In [10]:
# Get a list of all subfolders in the base folder
subfolders = [f.path for f in os.scandir(base_folder) if f.is_dir()]

In [11]:
all_merged_datas = []  # List to store merged dataframes

In [17]:
# Iterate through each subfolder
for subfolder in subfolders:
    try:
        # Construct the file paths for each CSV within the subfolder
        totals_path = os.path.join(subfolder, 'Totals.csv')
        print(totals_path)
        chart_path = os.path.join(subfolder, 'Chart data.csv')
        print(chart_path)
        table_path = os.path.join(subfolder, 'Table data.csv')
        print(table_path)

        # Read CSV files with error handling
        try:
            totals_data = pd.read_csv(totals_path)
        except FileNotFoundError:
            totals_data = pd.DataFrame({'Date': [], 'Views': []})

        try:
            chart_data = pd.read_csv(chart_path)
        except FileNotFoundError:
            chart_data = pd.DataFrame({'Date': [], 'Cities': [], 'City name': [], 'Views': []})

        try:
            table_data = pd.read_csv(table_path)
        except FileNotFoundError:
            table_data = pd.DataFrame({'Cities': [], 'City name': [], 'Geography': [], 'Views': [],
                                       'Watch time (hours)': [], 'Average view duration': []})

        # Find common columns dynamically
        common_columns = set(chart_data.columns) & set(table_data.columns)
        print(f"common columns between chart_data and table_data: {common_columns}")

        # Check if there are common columns
        if common_columns:
            # Merge "chart_data" with "table_data" based on the common columns 
            merged_data = pd.merge(chart_data, table_data, on=list(common_columns), how='left')

            # Merge the result with the "Totals" data based on the "Date" column
            final_data = pd.merge(merged_data, totals_data, on='Date', how='left')
        
            # Append the merged dataframe to the list
            all_merged_datas.append(final_data)

            # Display the final merged data for the current subfolder
            print(f'\nMerged Data for {subfolder}:')
            print(final_data)

            # Save the merged data to a new CSV file within each subfolder
            output_path = os.path.join(subfolder, 'merged_data.csv')
            final_data.to_csv(output_path, index=False)
        else:
            print(f"Skipping {subfolder}: No common columns found between chart_data and table_data.")

    except Exception as e:
        print(f"Error processing {subfolder}: {str(e)}")

print("\nProcessing complete.")

data/Cities/Totals.csv
data/Cities/Chart data.csv
data/Cities/Table data.csv
common columns between chart_data and table_data: {'Views', 'Cities', 'City name'}

Merged Data for data/Cities:
            Date                                 Cities    City name  Views_x  \
0     2020-06-28  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa        0   
1     2020-06-29  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa        0   
2     2020-06-30  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa        0   
3     2020-07-01  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa        0   
4     2020-07-02  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa        0   
...          ...                                    ...          ...      ...   
6390  2023-12-24  0x487a4d4c5226f5db:0xd9be143804fe6baa   Manchester        0   
6391  2023-12-25  0x487a4d4c5226f5db:0xd9be143804fe6baa   Manchester        0   
6392  2023-12-26  0x487a4d4c5226f5db:0xd9be143804fe6baa   Manchester        0   


In [13]:
# Concatenate all merged datas into a single dataframe
if all_merged_datas:
    all_merged_data = pd.concat(all_merged_datas, ignore_index=True)

    # Display the final concatenated dataframe
    print("\nConcatenated Data:")
    print(all_merged_data)

    # Save the concatenated dataframe to a new CSV file
    all_output_path = os.path.join("data", 'all_merged_data.csv')
    all_merged_data.to_csv(all_output_path, index=False)
else:
    print("No common columns found in any subfolder.")


Concatenated Data:
              Date                                 Cities    City name  \
0       2020-06-28  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa   
1       2020-06-29  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa   
2       2020-06-30  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa   
3       2020-07-01  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa   
4       2020-07-02  0x164b85cef5ab402d:0x8467b6b037a24d49  Addis Ababa   
...            ...                                    ...          ...   
121500  2023-12-24                                    NaN          NaN   
121501  2023-12-25                                    NaN          NaN   
121502  2023-12-26                                    NaN          NaN   
121503  2023-12-27                                    NaN          NaN   
121504  2023-12-28                                    NaN          NaN   

        Views_x Geography Geography.1  Watch time (hours)  \
0           0.0       NaN     

In [19]:
all_merged_data

Unnamed: 0,Date,Cities,City name,Views_x,Geography,Geography.1,Watch time (hours),Average view duration,Views_y,Subscription source,...,Subtitles and CC,Subscription status,Content type,Device type,Sharing service,Shares_x,Shares_y,Traffic source,Impressions,Impressions click-through rate (%)
0,2020-06-28,0x164b85cef5ab402d:0x8467b6b037a24d49,Addis Ababa,0.0,,,,,1.0,,...,,,,,,,,,,
1,2020-06-29,0x164b85cef5ab402d:0x8467b6b037a24d49,Addis Ababa,0.0,,,,,72.0,,...,,,,,,,,,,
2,2020-06-30,0x164b85cef5ab402d:0x8467b6b037a24d49,Addis Ababa,0.0,,,,,76.0,,...,,,,,,,,,,
3,2020-07-01,0x164b85cef5ab402d:0x8467b6b037a24d49,Addis Ababa,0.0,,,,,70.0,,...,,,,,,,,,,
4,2020-07-02,0x164b85cef5ab402d:0x8467b6b037a24d49,Addis Ababa,0.0,,,,,57.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121500,2023-12-24,,,1.0,,,,,36.0,,...,,,,,,,,Notifications,,
121501,2023-12-25,,,2.0,,,,,24.0,,...,,,,,,,,Notifications,,
121502,2023-12-26,,,4.0,,,,,57.0,,...,,,,,,,,Notifications,,
121503,2023-12-27,,,0.0,,,,,8.0,,...,,,,,,,,Notifications,,


# Reflexion on our merge data

In [21]:
all_merged_data.isnull().sum()

Date                                       0
Cities                                115110
City name                             115110
Views_x                                21743
Geography                              85690
Geography.1                           121502
Watch time (hours)                    118930
Average view duration                 121488
Views_y                                21743
Subscription source                   111273
Subscribers_x                         111273
Subscribers gained                    121503
Subscribers lost                      121503
Subscribers_y                         111273
Operating system                      101041
New and returning viewers             117668
Subtitles and CC                      112552
Subscription status                   118947
Content type                          118947
Device type                           116389
Sharing service                       109994
Shares_x                              109994
Shares_y  

In [28]:
def missing_values_percentage(df):
    # Count the total number of missing values for each column
    total_missing = df.isnull().sum()

    # Calculate the total number of cells in the DataFrame
    total_cells = df.size

    # Calculate the percentage of missing values for each column
    missing_percentage = (total_missing / total_cells) * 100

    # Filter columns with non-zero missing percentage and sort in descending order
    missing_percentage = missing_percentage[missing_percentage > 0].sort_values(ascending=False)

    return missing_percentage

In [29]:
missing_values_percentage(all_merged_data)

Impressions click-through rate (%)    3.846154
Impressions                           3.846154
Subscribers gained                    3.846091
Subscribers lost                      3.846091
Geography.1                           3.846059
Average view duration                 3.845616
Content type                          3.765182
Subscription status                   3.765182
Watch time (hours)                    3.764644
New and returning viewers             3.724696
Device type                           3.684211
City name                             3.643725
Cities                                3.643725
Subtitles and CC                      3.562753
Subscribers_x                         3.522267
Subscription source                   3.522267
Subscribers_y                         3.522267
Sharing service                       3.481781
Shares_x                              3.481781
Shares_y                              3.481781
Traffic source                        3.481781
Operating sys