## Here we are going to merge the different data:
#### Prerequisites:
- The data must already be cleaned and preprocessed, i.e. reduced to the same number of rows.
- The preprocessed data can be found in each respective folder, under the `clean` folder.


In [20]:
import pandas as pd
from src.utils import fetch_data, preprocess_data, generate_dataframes

In [21]:
# Keys identify the breakdowns to merge (they are used to look up the
# dataframes and to order the merge); values name the metric column tied to
# each breakdown and are passed along to generate_dataframes.
value_columns = {
    'Content type': 'Views',
    'Device type': 'Views',
    'Geography': 'Views',
    'New and returning viewers': 'Views',
    'Operating system': 'Views',
    'Sharing service': 'Shares',
}


In [22]:
# Fetch the 'Chart data' table for the 'Cities' breakdown.
# NOTE(review): the third positional argument presumably selects the cleaned
# version of the data — confirm against src.utils.fetch_data.
# NOTE(review): cities_clean_chart_df is not referenced by any later cell
# visible in this notebook — confirm it is still needed.
cities_clean_chart_df = fetch_data('Cities', 'Chart data', True)

In [23]:
# Build the per-breakdown dataframes; the result is looked up by the keys of
# value_columns when merging.
# NOTE(review): clean_data=True presumably loads the already-cleaned files —
# confirm against src.utils.generate_dataframes.
reshaped_dataframes = generate_dataframes(value_columns, clean_data=True)

### Merge the data on date

In [24]:
def merge_dataframes(reshaped_dataframes, value_columns):
    """Outer-join several dataframes on their shared 'Date' column.

    Args:
        reshaped_dataframes: Mapping from name to dataframe; every dataframe
            that is merged must contain a 'Date' column.
        value_columns: Mapping whose keys select which dataframes to merge
            and in which order. Its values are not used here.

    Returns:
        The dataframe for the first key when only one key is given (the same
        object, not a copy); otherwise the successive outer merge of all
        selected dataframes on 'Date'.

    Raises:
        IndexError: If value_columns has no keys.
        KeyError: If a key of value_columns is missing from
            reshaped_dataframes.
    """
    names = list(value_columns.keys())

    # Seed the result with the first frame, then fold the rest into it.
    combined = reshaped_dataframes[names[0]]
    for frame_name in names[1:]:
        next_frame = reshaped_dataframes[frame_name]
        # how='outer' keeps dates that appear in only one of the two frames.
        combined = pd.merge(combined, next_frame, on='Date', how='outer')

    return combined

# Merge the reshaped frames on 'Date'. Column labels shared by several frames
# (e.g. 'Other') receive pandas' _x/_y suffixes in this result.
merged_df = merge_dataframes(reshaped_dataframes, value_columns)

In [25]:
# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,Date,Other_x,Videos,Computer,Mobile phone,TV,Tablet,Unnamed: 1,BJ,BR,...,iOS,Copy to Clipboard,Facebook,Gmail,LinkedIn,Other_y,Share to WhatsApp Business,Text Message,Twitter,WhatsApp
0,2020-06-28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-06-29,0,72,49,23,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2020-06-30,0,76,62,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-07-01,0,70,46,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-07-02,0,57,29,28,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
def rename_columns(dataframes, value_columns):
    """Return copies of the selected dataframes with name-prefixed columns.

    Every column except 'Date' is renamed to "<name>_<column>", where <name>
    is the key under which the dataframe is stored. The input dataframes are
    left untouched.

    Args:
        dataframes: Mapping from name to dataframe.
        value_columns: Mapping whose keys select which dataframes to process.
            Its values are not used here.

    Returns:
        A dict with one renamed copy per key of value_columns, in the same
        key order.
    """
    def _with_prefix(frame, prefix):
        # Work on a copy so the caller's dataframe keeps its original labels.
        prefixed = frame.copy()
        prefixed.columns = [
            col if col == 'Date' else f"{prefix}_{col}"
            for col in prefixed.columns
        ]
        return prefixed

    return {
        name: _with_prefix(dataframes[name], name)
        for name, _ in value_columns.items()
    }

# Prefix every non-'Date' column with the name of its source frame so that
# identical labels coming from different frames stay distinct after merging
renamed_dataframes = rename_columns(reshaped_dataframes, value_columns)

# Merge the prefixed frames; this rebinds merged_df to the new result
merged_df = merge_dataframes(renamed_dataframes, value_columns)

In [27]:
# Normalize the column labels: lower-case them and replace spaces with underscores.
merged_df.columns = [col.lower().replace(' ', '_') for col in merged_df.columns]

In [28]:
# List the column labels of the merged frame.
merged_df.columns

Index(['date', 'content_type_other', 'content_type_videos',
       'device_type_computer', 'device_type_mobile_phone', 'device_type_tv',
       'device_type_tablet', 'geography_unnamed:_1', 'geography_bj',
       'geography_br', 'geography_ch', 'geography_de', 'geography_es',
       'geography_et', 'geography_fr', 'geography_gb', 'geography_gh',
       'geography_hu', 'geography_id', 'geography_in', 'geography_it',
       'geography_jp', 'geography_ke', 'geography_kr', 'geography_lk',
       'geography_ng', 'geography_nl', 'geography_ph', 'geography_ro',
       'geography_rs', 'geography_rw', 'geography_sd', 'geography_sn',
       'geography_tg', 'geography_us', 'geography_za',
       'new_and_returning_viewers_new_viewers',
       'new_and_returning_viewers_returning_viewers',
       'new_and_returning_viewers_unknown', 'operating_system_amazon_fire_os',
       'operating_system_android', 'operating_system_apple_tvos',
       'operating_system_chrome_os', 'operating_system_chromecast'

In [29]:

# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,date,content_type_other,content_type_videos,device_type_computer,device_type_mobile_phone,device_type_tv,device_type_tablet,geography_unnamed:_1,geography_bj,geography_br,...,operating_system_ios,sharing_service_copy_to_clipboard,sharing_service_facebook,sharing_service_gmail,sharing_service_linkedin,sharing_service_other,sharing_service_share_to_whatsapp_business,sharing_service_text_message,sharing_service_twitter,sharing_service_whatsapp
0,2020-06-28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-06-29,0,72,49,23,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2020-06-30,0,76,62,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-07-01,0,70,46,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-07-02,0,57,29,28,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Save the merged data to CSV

In [33]:
# Write the merged frame to CSV, leaving out the index column.
# NOTE(review): the psycopg2 loader cell later in this notebook reads
# '../data/merged_chart_data.csv', which is a different path from the one
# written here — confirm which location is intended.
merged_df.to_csv('../../data/youtube_chart_data.csv', index=False)

In [32]:
# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,date,content_type_other,content_type_videos,device_type_computer,device_type_mobile_phone,device_type_tv,device_type_tablet,geography_unnamed:_1,geography_bj,geography_br,...,operating_system_ios,sharing_service_copy_to_clipboard,sharing_service_facebook,sharing_service_gmail,sharing_service_linkedin,sharing_service_other,sharing_service_share_to_whatsapp_business,sharing_service_text_message,sharing_service_twitter,sharing_service_whatsapp
0,2020-06-28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-06-29,0,72,49,23,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2020-06-30,0,76,62,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-07-01,0,70,46,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-07-02,0,57,29,28,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Inspect the last rows of the merged frame.
# NOTE(review): the recorded output of this cell (execution count 10) still
# shows the un-normalized column labels, so it predates the renaming step —
# re-run the notebook top to bottom to refresh it.
merged_df.tail()

Unnamed: 0,Date,Content type_Other,Content type_Videos,Device type_Computer,Device type_Mobile phone,Device type_TV,Device type_Tablet,Geography_Unnamed: 1,Geography_BJ,Geography_BR,...,Operating system_iOS,Sharing service_Copy to Clipboard,Sharing service_Facebook,Sharing service_Gmail,Sharing service_LinkedIn,Sharing service_Other,Sharing service_Share to WhatsApp Business,Sharing service_Text Message,Sharing service_Twitter,Sharing service_WhatsApp
1274,2023-12-24,0,36,17,19,0,0,0,0,0,...,11,0,0,0,0,0,0,0,0,0
1275,2023-12-25,0,33,17,16,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1276,2023-12-26,0,57,33,21,3,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
1277,2023-12-27,0,8,4,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1278,2023-12-28,0,25,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Save data to the database using SQLAlchemy

In [12]:
import os

from scripts.save_to_db import save_to_database

# Connection URL for the target database. It can be overridden through the
# YOUTUBE_DB_URL environment variable so that real credentials do not have to
# be committed with the notebook; the fallback is the original hard-coded URL.
db_string = os.environ.get(
    'YOUTUBE_DB_URL',
    'postgresql://postgres:postgres@postgres:5432/youtube_data',
)
table_name = 'youtube_data'

# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'Engine' object has no attribute 'cursor'". The cause is
# inside scripts.save_to_db (not visible here); this error commonly appears
# when the installed pandas and SQLAlchemy versions are incompatible —
# confirm and fix there before relying on this cell.
save_to_database(merged_df, table_name, db_string)

AttributeError: 'Engine' object has no attribute 'cursor'

In [None]:
import psycopg2
import csv

def create_pg_psycopg2_conn(user, password, host, port, db):
    """Open a psycopg2 connection to a PostgreSQL database.

    Args:
        user: Database user name.
        password: Password for that user.
        host: Server host name or address.
        port: Server port.
        db: Name of the database to connect to.

    Returns:
        The connection object returned by psycopg2.connect. The caller is
        responsible for closing it.
    """
    connection_settings = {
        'dbname': db,
        'user': user,
        'password': password,
        'host': host,
        'port': port,
    }
    return psycopg2.connect(**connection_settings)

def save_csv_to_db(csv_file_path, table_name):
    """Insert every data row of a CSV file into a PostgreSQL table.

    The first CSV row is treated as a header and is not inserted. Its length
    determines how many placeholders the INSERT statement uses, so the file
    may have any number of columns. Values are inserted positionally, so the
    CSV column order must match the table's column order. Blank rows are
    skipped, and an empty file (or empty header row) inserts nothing.

    All rows are inserted in a single transaction: it is committed when every
    row succeeds and rolled back if any insert fails. The connection is
    always closed.

    Args:
        csv_file_path: Path of the CSV file to load.
        table_name: Target table, optionally schema-qualified
            ("schema.table"). The name is quoted as an SQL identifier, so it
            is matched case-sensitively.

    Raises:
        OSError: If the CSV file cannot be opened.
        psycopg2.Error: If connecting or inserting fails.
    """
    # Local import: the notebook only imports the top-level psycopg2 package.
    from psycopg2 import sql

    # NOTE(review): connection settings (including the password) are
    # hard-coded here — consider reading them from the environment.
    conn = create_pg_psycopg2_conn('postgres', 'postgres', 'localhost', '15432', 'youtube_data')
    try:
        # newline='' lets the csv module handle embedded newlines itself.
        with open(csv_file_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if not header:
                # Nothing to insert: empty file or empty header row.
                return

            # Compose the statement safely: the table name is quoted as an
            # identifier and there is one placeholder per CSV column.
            insert_query = sql.SQL("INSERT INTO {} VALUES ({})").format(
                sql.Identifier(*table_name.split('.')),
                sql.SQL(', ').join(sql.Placeholder() * len(header)),
            )

            # `with conn` commits on success and rolls back on error;
            # the cursor context closes the cursor.
            with conn, conn.cursor() as cur:
                for row in reader:
                    if row:  # csv.reader yields [] for blank lines
                        cur.execute(insert_query, row)
    finally:
        conn.close()

# Load a CSV export into the database.
# NOTE(review): the path and table used here ('../data/merged_chart_data.csv',
# 'youtube_d') differ from the CSV written earlier in this notebook
# ('../../data/youtube_chart_data.csv') and from the 'youtube_data' table
# passed to save_to_database — confirm these are intended.
save_csv_to_db('../data/merged_chart_data.csv', 'youtube_d')

In [None]:
# Summarize the merged frame's columns, dtypes and non-null counts.
merged_df.info()

In [None]:
# List the column labels of the merged frame.
merged_df.columns