## Here we are going to merge the different data:
#### Prerequisites:
- The data has to have been cleaned and preprocessed. By reducing to the same number of rows
- The preprocessed data is to be found in each respective folder under clean folder


In [1]:
import pandas as pd
from src.utils import fetch_data, preprocess_data, generate_dataframes

In [2]:
# Define the column names and values 
value_columns = {'Content type': 'Views', 'Device type': 'Views', 'Geography': 'Views', 'New and returning viewers': 'Views', 'Operating system': 'Views', 'Sharing service': 'Shares'}


In [3]:
# Preprocess the data
cities_clean_chart_df = fetch_data('Cities', 'Chart data', True)

In [4]:
reshaped_dataframes = generate_dataframes(value_columns, clean_data=True)

### Merge the data on date

In [5]:
def merge_dataframes(reshaped_dataframes, value_columns):
    # Initialize merged_df with the first dataframe
    merged_df = reshaped_dataframes[list(value_columns.keys())[0]]

    # Merge all other dataframes
    for name in list(value_columns.keys())[1:]:
        merged_df = pd.merge(merged_df, reshaped_dataframes[name], on='Date', how='outer')

    return merged_df

merged_df = merge_dataframes(reshaped_dataframes, value_columns)

In [6]:
merged_df.head()

Unnamed: 0,Date,Other_x,Videos,Computer,Mobile phone,TV,Tablet,Unnamed: 1,BJ,BR,...,iOS,Copy to Clipboard,Facebook,Gmail,LinkedIn,Other_y,Share to WhatsApp Business,Text Message,Twitter,WhatsApp
0,2020-06-28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-06-29,0,72,49,23,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2020-06-30,0,76,62,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-07-01,0,70,46,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-07-02,0,57,29,28,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def rename_columns(dataframes, value_columns):
    renamed_dataframes = {}
    for name, value_column in value_columns.items():
        # Copy the dataframe to avoid modifying the original one
        df = dataframes[name].copy()

        # Rename the columns
        df.columns = [f"{name}_{col}" if col != 'Date' else col for col in df.columns]

        renamed_dataframes[name] = df

    return renamed_dataframes

# Rename the columns of the dataframes
renamed_dataframes = rename_columns(reshaped_dataframes, value_columns)

# Now you can merge the renamed dataframes
merged_df = merge_dataframes(renamed_dataframes, value_columns)

# Save the merged data to csv

In [17]:
merged_df.to_csv('../data/merged_chart_data.csv', index=False)

In [8]:
merged_df.head()

Unnamed: 0,Date,Content type_Other,Content type_Videos,Device type_Computer,Device type_Mobile phone,Device type_TV,Device type_Tablet,Geography_Unnamed: 1,Geography_BJ,Geography_BR,...,Operating system_iOS,Sharing service_Copy to Clipboard,Sharing service_Facebook,Sharing service_Gmail,Sharing service_LinkedIn,Sharing service_Other,Sharing service_Share to WhatsApp Business,Sharing service_Text Message,Sharing service_Twitter,Sharing service_WhatsApp
0,2020-06-28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-06-29,0,72,49,23,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2020-06-30,0,76,62,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-07-01,0,70,46,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-07-02,0,57,29,28,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
merged_df.tail()

Unnamed: 0,Date,Content type_Other,Content type_Videos,Device type_Computer,Device type_Mobile phone,Device type_TV,Device type_Tablet,Geography_Unnamed: 1,Geography_BJ,Geography_BR,...,Operating system_iOS,Sharing service_Copy to Clipboard,Sharing service_Facebook,Sharing service_Gmail,Sharing service_LinkedIn,Sharing service_Other,Sharing service_Share to WhatsApp Business,Sharing service_Text Message,Sharing service_Twitter,Sharing service_WhatsApp
1274,2023-12-24,0,36,17,19,0,0,0,0,0,...,11,0,0,0,0,0,0,0,0,0
1275,2023-12-25,0,33,17,16,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1276,2023-12-26,0,57,33,21,3,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
1277,2023-12-27,0,8,4,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1278,2023-12-28,0,25,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### save data to database using sqlalchemy

In [22]:
from scripts.save_to_db import save_to_database

# Use the function
db_string = 'postgresql://postgres:postgres@localhost:15432/youtube_data'
table_name = 'youtube_data'
save_to_database(merged_df, table_name, db_string)

AttributeError: 'Engine' object has no attribute 'cursor'

In [20]:
import psycopg2
import csv

def create_pg_psycopg2_conn(user, password, host, port, db):
    conn = psycopg2.connect(
        dbname=db,
        user=user,
        password=password,
        host=host,
        port=port
    )
    return conn

def save_csv_to_db(csv_file_path, table_name):
    # create a connection to the database using psycopg2
    conn = create_pg_psycopg2_conn('postgres', 'postgres', 'localhost', '15432', 'youtube_data')

    # Create a cursor object
    cur = conn.cursor()

    with open(csv_file_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # Skip the header row
        for row in reader:
            cur.execute(
                "INSERT INTO {} VALUES (%s, %s, %s)".format(table_name),
                row
            )

    # Commit the transaction
    conn.commit()

    # Close the cursor and connection
    cur.close()
    conn.close()

# Use the function
save_csv_to_db('../data/merged_chart_data.csv', 'youtube_d')

TypeError: not all arguments converted during string formatting

In [24]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1279 entries, 0 to 1278
Data columns (total 64 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   Date                                         1279 non-null   object
 1   Content type_Other                           1279 non-null   int64 
 2   Content type_Videos                          1279 non-null   int64 
 3   Device type_Computer                         1279 non-null   int64 
 4   Device type_Mobile phone                     1279 non-null   int64 
 5   Device type_TV                               1279 non-null   int64 
 6   Device type_Tablet                           1279 non-null   int64 
 7   Geography_Unnamed: 1                         1279 non-null   int64 
 8   Geography_BJ                                 1279 non-null   int64 
 9   Geography_BR                                 1279 non-null   int64 
 10  Geography_CH

In [25]:
merged_df.columns

Index(['Date', 'Content type_Other', 'Content type_Videos',
       'Device type_Computer', 'Device type_Mobile phone', 'Device type_TV',
       'Device type_Tablet', 'Geography_Unnamed: 1', 'Geography_BJ',
       'Geography_BR', 'Geography_CH', 'Geography_DE', 'Geography_ES',
       'Geography_ET', 'Geography_FR', 'Geography_GB', 'Geography_GH',
       'Geography_HU', 'Geography_ID', 'Geography_IN', 'Geography_IT',
       'Geography_JP', 'Geography_KE', 'Geography_KR', 'Geography_LK',
       'Geography_NG', 'Geography_NL', 'Geography_PH', 'Geography_RO',
       'Geography_RS', 'Geography_RW', 'Geography_SD', 'Geography_SN',
       'Geography_TG', 'Geography_US', 'Geography_ZA',
       'New and returning viewers_New viewers',
       'New and returning viewers_Returning viewers',
       'New and returning viewers_Unknown', 'Operating system_Amazon Fire OS',
       'Operating system_Android', 'Operating system_Apple tvOS',
       'Operating system_Chrome OS', 'Operating system_Chromecast'