In [473]:
import pandas as pd
from sqlalchemy import create_engine, text
import sys

## Configurando pandas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Função de conexão banco de dados

In [474]:
def connect_to_database(database):
    try:
        connection_string = f'mssql+pyodbc://localhost/{database}?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server'
        engine = create_engine(connection_string)
        return engine 
    except Exception as e:
        print("Erro ao conectar ao banco de dados:", e)
        return None

## TbAccount

In [475]:
file_path = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbCabecalho.csv'
dataset = pd.read_csv(file_path, sep='\t')
select_columns = [
    'id_account',
    'account_username',
    'account_biography',
    'profile_picture_url',
    'account_name'
]

dataset= dataset[select_columns]
dataset.head()

Unnamed: 0,id_account,account_username,account_biography,profile_picture_url,account_name
0,17841425444516188,bodemeier.digital,♟️Criação de conteúdo estratégico\n▪️+ de 5 an...,https://scontent.fcgh8-1.fna.fbcdn.net/v/t51.2...,Bodemeier Digital
1,17841417530400616,gabgalani,RJ 🔸 SP\ndata is the new oil 💻\n@carolinebodem...,https://scontent.fcgh8-1.fna.fbcdn.net/v/t51.2...,Gabriel Galani


In [476]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_account           2 non-null      int64 
 1   account_username     2 non-null      object
 2   account_biography    2 non-null      object
 3   profile_picture_url  2 non-null      object
 4   account_name         2 non-null      object
dtypes: int64(1), object(4)
memory usage: 212.0+ bytes


In [477]:
d_type = {
    'id_account': object
}
dataset = dataset.astype(d_type)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_account           2 non-null      object
 1   account_username     2 non-null      object
 2   account_biography    2 non-null      object
 3   profile_picture_url  2 non-null      object
 4   account_name         2 non-null      object
dtypes: object(5)
memory usage: 212.0+ bytes


Carregar para o banco

In [478]:
dataset.head()

Unnamed: 0,id_account,account_username,account_biography,profile_picture_url,account_name
0,17841425444516188,bodemeier.digital,♟️Criação de conteúdo estratégico\n▪️+ de 5 an...,https://scontent.fcgh8-1.fna.fbcdn.net/v/t51.2...,Bodemeier Digital
1,17841417530400616,gabgalani,RJ 🔸 SP\ndata is the new oil 💻\n@carolinebodem...,https://scontent.fcgh8-1.fna.fbcdn.net/v/t51.2...,Gabriel Galani


In [479]:
engine = connect_to_database('DW')
table_name = 'TbAccount'
key_column = 'id_account'

if not engine: 
    print('conexão fail')
    sys.exit()

with engine.connect() as connection:
    for i, row in dataset.iterrows():
        query = text(f"select count(*) from {table_name} WHERE {key_column} = :value")
        result = connection.execute(query, parameters=dict(value=row[key_column]))
        count = result.scalar()

        if count > 0:
            print(f'Registro com {key_column} = {row[key_column]} já existe.')
        else: 
            row.to_frame().T.to_sql(table_name, con=connection, if_exists='append', index=False)
            print(f"Registro com {key_column} = {row[key_column]} inserido com sucesso.")
    connection.commit()
    connection.close()

Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.


## DTbAccountInsights e FTbAccountsDayInsights e FTbAccountsLifetimeInsights

Coletando do arquivo de dia

In [480]:

file_path = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbAccontDayInsights.csv'
dataset = pd.read_csv(file_path, sep='\t')
select_columns = [
    'name',
    'period',
    'title',
    'description'
]
FTbAccountDayInsights = dataset
dataset_day_dimension = dataset[select_columns]
dataset_day_dimension.head()

Unnamed: 0,name,period,title,description
0,impressions,day,Impressões,Número total de vezes que os objetos de mídia ...
1,reach,day,Alcance,Número total de vezes que os objetos de mídia ...
2,follower_count,day,Número de seguidores,Número total de contas únicas que seguem este ...
3,email_contacts,day,Contatos de email,Número total de toques no link de email deste ...
4,phone_call_clicks,day,Cliques em ligação telefônica,Número total de toques no link de ligação dest...


Coletando do arquivo de lifetime

In [481]:
cidade_seguidores = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbAccountLifeCidadeDosSeguidores.csv'
Faixa_genero = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbAccountLifeFaixaGenero.csv'
local_pais = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbAccountLifeLocaPais.csv'
pais_seguidores = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbAccountLifePaisDosSeguidores.csv'
seguidores_on = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbAccountLifeSeguidoresOnline.csv'

list_df = [cidade_seguidores, Faixa_genero, local_pais, pais_seguidores, seguidores_on]
dfs = [] 

for df in list_df: 
    data = pd.read_csv(df, sep='\t')
    dfs.append(data)

dataset = pd.concat(dfs)
select_columns = [
    'name',
    'period',
    'title',
    'description'
]
FTbAccountLifetimeInsights = dataset
dataset_lifetime_dimension = dataset[select_columns]
dataset_lifetime_dimension.head()

Unnamed: 0,name,period,title,description
0,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil
1,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil
2,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil
3,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil
4,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil


Transformando os datasets em dados unicos

In [482]:
dataset_day_dimension.drop_duplicates(inplace=True)
dataset_lifetime_dimension.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_day_dimension.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_lifetime_dimension.drop_duplicates(inplace=True)


Unindo as dimensões

In [483]:
DTbAccountInsights = pd.concat([dataset_day_dimension,dataset_lifetime_dimension])
DTbAccountInsights.sort_values('name', inplace=True)
DTbAccountInsights.reset_index(inplace=True)
DTbAccountInsights['id_insight'] = DTbAccountInsights.index + 1
DTbAccountInsights = DTbAccountInsights[['id_insight'] + [col for col in DTbAccountInsights.columns if col != 'id_insight']]
DTbAccountInsights.drop(columns='index', inplace=True)
DTbAccountInsights.rename(columns={'period': 'frequencia'}, inplace=True)
DTbAccountInsights.head()

Unnamed: 0,id_insight,name,frequencia,title,description
0,1,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil
1,2,audience_country,lifetime,País do público,Países dos seguidores deste perfil
2,3,audience_gender_age,lifetime,Gênero e faixa etária,A distribuição por gênero e faixa etária dos s...
3,4,audience_locale,lifetime,Localização,Localidades por códigos de país dos seguidores...
4,5,email_contacts,day,Contatos de email,Número total de toques no link de email deste ...


Carregando no banco DTbAccountInsights

In [484]:
DTbAccountInsights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id_insight   14 non-null     int64 
 1   name         14 non-null     object
 2   frequencia   14 non-null     object
 3   title        14 non-null     object
 4   description  14 non-null     object
dtypes: int64(1), object(4)
memory usage: 692.0+ bytes


In [485]:
engine = connect_to_database('DW')
table_name = 'DTbAccountInsights'
key_column = 'id_insight'

if not engine: 
    print('conexão fail')
    sys.exit()

with engine.connect() as connection:
    for i, row in DTbAccountInsights.iterrows():
        query = text(f"select count(*) from {table_name} WHERE {key_column} = :value")
        result = connection.execute(query, parameters=dict(value=row[key_column]))
        count = result.scalar()

        if count > 0:
            print(f'Registro com {key_column} = {row[key_column]} já existe.')
        else: 
            row.to_frame().T.to_sql(table_name, con=connection, if_exists='append', index=False)
            print(f"Registro com {key_column} = {row[key_column]} inserido com sucesso.")
    connection.commit()
    connection.close()

Registro com id_insight = 1 já existe.
Registro com id_insight = 2 já existe.
Registro com id_insight = 3 já existe.
Registro com id_insight = 4 já existe.
Registro com id_insight = 5 já existe.
Registro com id_insight = 6 já existe.
Registro com id_insight = 7 já existe.
Registro com id_insight = 8 já existe.
Registro com id_insight = 9 já existe.
Registro com id_insight = 10 já existe.
Registro com id_insight = 11 já existe.
Registro com id_insight = 12 já existe.
Registro com id_insight = 13 já existe.
Registro com id_insight = 14 já existe.


Carregando a tabela FTbAccountDayInsights

In [486]:
FTbAccountDayInsights.head()

Unnamed: 0.1,Unnamed: 0,name,period,title,description,id,username,last_day,last_end_time,actual_day,actual_end_time,id_account,chave,extract_date,period_extraction,year,day,id_tb_midia
0,0,impressions,day,Impressões,Número total de vezes que os objetos de mídia ...,17841425444516188/insights/impressions/day,bodemeier.digital,2,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,17841425444516188,17841425444516188impressions,2024-05-22,5,2024,22,1784142544451618820240522impressionsday
1,1,reach,day,Alcance,Número total de vezes que os objetos de mídia ...,17841425444516188/insights/reach/day,bodemeier.digital,1,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,17841425444516188,17841425444516188reach,2024-05-22,5,2024,22,1784142544451618820240522reachday
2,2,follower_count,day,Número de seguidores,Número total de contas únicas que seguem este ...,17841425444516188/insights/follower_count/day,bodemeier.digital,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,17841425444516188,17841425444516188follower_count,2024-05-22,5,2024,22,1784142544451618820240522follower_countday
3,3,email_contacts,day,Contatos de email,Número total de toques no link de email deste ...,17841425444516188/insights/email_contacts/day,bodemeier.digital,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,17841425444516188,17841425444516188email_contacts,2024-05-22,5,2024,22,1784142544451618820240522email_contactsday
4,4,phone_call_clicks,day,Cliques em ligação telefônica,Número total de toques no link de ligação dest...,17841425444516188/insights/phone_call_clicks/day,bodemeier.digital,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,17841425444516188,17841425444516188phone_call_clicks,2024-05-22,5,2024,22,1784142544451618820240522phone_call_clicksday


In [487]:
engine = connect_to_database('DW')
table_name = 'TbAccount'
key_column = 'id_account'

if not engine: 
    print('conexão fail')
    sys.exit()

with engine.connect() as connection:
    for i, row in dataset.iterrows():
        query = text(f"select count(*) from {table_name} WHERE {key_column} = :value")
        result = connection.execute(query, parameters=dict(value=row[key_column]))
        count = result.scalar()

        if count > 0:
            print(f'Registro com {key_column} = {row[key_column]} já existe.')
        else: 
            row.to_frame().T.to_sql(table_name, con=connection, if_exists='append', index=False)
            print(f"Registro com {key_column} = {row[key_column]} inserido com sucesso.")
    connection.commit()
    connection.close()

Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro com id_account = 17841425444516188 já existe.
Registro com id_account = 17841417530400616 já existe.
Registro c

Tabela FTbAccountDayInsights

Unindo com a dimensao de DTbAccountInsights para trazer o id

In [488]:
FTbAccountDayInsights = pd.merge(FTbAccountDayInsights, DTbAccountInsights, on='name', how= 'inner')
FTbAccountDayInsights.head(1)

Unnamed: 0.1,Unnamed: 0,name,period,title_x,description_x,id,username,last_day,last_end_time,actual_day,actual_end_time,id_account,chave,extract_date,period_extraction,year,day,id_tb_midia,id_insight,frequencia,title_y,description_y
0,0,impressions,day,Impressões,Número total de vezes que os objetos de mídia ...,17841425444516188/insights/impressions/day,bodemeier.digital,2,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,17841425444516188,17841425444516188impressions,2024-05-22,5,2024,22,1784142544451618820240522impressionsday,8,day,Impressões,Número total de vezes que os objetos de mídia ...


In [489]:
select_columns = [
  'id_tb_midia',
  'id_insight',
  'id_account',
  'last_day',
  'last_end_time',
  'actual_day',
  'actual_end_time',
  'extract_date',
  'period_extraction',
  'year',
  'day'
 ]
FTbAccountDayInsights = FTbAccountDayInsights[select_columns]
FTbAccountDayInsights.head()

Unnamed: 0,id_tb_midia,id_insight,id_account,last_day,last_end_time,actual_day,actual_end_time,extract_date,period_extraction,year,day
0,1784142544451618820240522impressionsday,8,17841425444516188,2,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
1,1784142544451618820240522reachday,12,17841425444516188,1,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
2,1784142544451618820240522follower_countday,6,17841425444516188,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
3,1784142544451618820240522email_contactsday,5,17841425444516188,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
4,1784142544451618820240522phone_call_clicksday,10,17841425444516188,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22


Renomeando colunas

In [490]:
columns_rename = {
    'id_tb_midia': 'id',
    'last_day': 'value_last_day',
    'actual_day': 'value_actual_day'
}
FTbAccountDayInsights.rename(columns = columns_rename, inplace=True)
FTbAccountDayInsights.head()

Unnamed: 0,id,id_insight,id_account,value_last_day,last_end_time,value_actual_day,actual_end_time,extract_date,period_extraction,year,day
0,1784142544451618820240522impressionsday,8,17841425444516188,2,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
1,1784142544451618820240522reachday,12,17841425444516188,1,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
2,1784142544451618820240522follower_countday,6,17841425444516188,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
3,1784142544451618820240522email_contactsday,5,17841425444516188,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22
4,1784142544451618820240522phone_call_clicksday,10,17841425444516188,0,2024-05-21T07:00:00+0000,0,2024-05-22T07:00:00+0000,2024-05-22,5,2024,22


Tipagem de dados

In [491]:
FTbAccountDayInsights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 18 non-null     object
 1   id_insight         18 non-null     int64 
 2   id_account         18 non-null     int64 
 3   value_last_day     18 non-null     int64 
 4   last_end_time      18 non-null     object
 5   value_actual_day   18 non-null     int64 
 6   actual_end_time    18 non-null     object
 7   extract_date       18 non-null     object
 8   period_extraction  18 non-null     int64 
 9   year               18 non-null     int64 
 10  day                18 non-null     int64 
dtypes: int64(7), object(4)
memory usage: 1.7+ KB


In [492]:
# Primeiro, vamos converter as colunas de data para o tipo de dados datetime64
FTbAccountDayInsights['last_end_time'] = pd.to_datetime(FTbAccountDayInsights['last_end_time'])
FTbAccountDayInsights['actual_end_time'] = pd.to_datetime(FTbAccountDayInsights['actual_end_time'])
FTbAccountDayInsights['extract_date'] = pd.to_datetime(FTbAccountDayInsights['extract_date'])

# Agora, vamos extrair apenas a data (sem a hora) das colunas de data
FTbAccountDayInsights['last_end_time'] = FTbAccountDayInsights['last_end_time'].dt.date
FTbAccountDayInsights['actual_end_time'] = FTbAccountDayInsights['actual_end_time'].dt.date
FTbAccountDayInsights['extract_date'] = FTbAccountDayInsights['extract_date'].dt.date

type_columns = {
    'period_extraction': 'object',
    'year': 'object',
    'day': 'object'                
}

FTbAccountDayInsights = FTbAccountDayInsights.astype(type_columns)

In [493]:
FTbAccountDayInsights.head()

Unnamed: 0,id,id_insight,id_account,value_last_day,last_end_time,value_actual_day,actual_end_time,extract_date,period_extraction,year,day
0,1784142544451618820240522impressionsday,8,17841425444516188,2,2024-05-21,0,2024-05-22,2024-05-22,5,2024,22
1,1784142544451618820240522reachday,12,17841425444516188,1,2024-05-21,0,2024-05-22,2024-05-22,5,2024,22
2,1784142544451618820240522follower_countday,6,17841425444516188,0,2024-05-21,0,2024-05-22,2024-05-22,5,2024,22
3,1784142544451618820240522email_contactsday,5,17841425444516188,0,2024-05-21,0,2024-05-22,2024-05-22,5,2024,22
4,1784142544451618820240522phone_call_clicksday,10,17841425444516188,0,2024-05-21,0,2024-05-22,2024-05-22,5,2024,22


Carregando os dados no banco

In [494]:
engine = connect_to_database('DW')
table_name = 'FTbAccountDayInsights'
key_column = 'id'

if not engine: 
    print('conexão fail')
    sys.exit()

with engine.connect() as connection:
    for i, row in FTbAccountDayInsights.iterrows():
        query = text(f"select count(*) from {table_name} WHERE {key_column} = :value")
        result = connection.execute(query, parameters=dict(value=row[key_column]))
        count = result.scalar()

        if count > 0:
            print(f'Registro com {key_column} = {row[key_column]} já existe.')
        else: 
            row.to_frame().T.to_sql(table_name, con=connection, if_exists='append', index=False)
            print(f"Registro com {key_column} = {row[key_column]} inserido com sucesso.")
    connection.commit()
    connection.close()

Registro com id = 1784142544451618820240522impressionsday já existe.
Registro com id = 1784142544451618820240522reachday já existe.
Registro com id = 1784142544451618820240522follower_countday já existe.
Registro com id = 1784142544451618820240522email_contactsday já existe.
Registro com id = 1784142544451618820240522phone_call_clicksday já existe.
Registro com id = 1784142544451618820240522text_message_clicksday já existe.
Registro com id = 1784142544451618820240522get_directions_clicksday já existe.
Registro com id = 1784142544451618820240522website_clicksday já existe.
Registro com id = 1784142544451618820240522profile_viewsday já existe.
Registro com id = 1784141753040061620240522impressionsday já existe.
Registro com id = 1784141753040061620240522reachday já existe.
Registro com id = 1784141753040061620240522follower_countday já existe.
Registro com id = 1784141753040061620240522email_contactsday já existe.
Registro com id = 1784141753040061620240522phone_call_clicksday já existe.

Tabela FTbAccountLifetimeInsights

Unindo com a dimensao de DTbAccountInsights para trazer o id

In [495]:
FTbAccountLifetimeInsights.head()

Unnamed: 0.1,Unnamed: 0,name,period,title,description,username,id_account,extract_date,period_extraction,year,day,id_tb_account,age_gender,value
0,0,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil,bodemeier.digital,17841425444516188,2024-05-22,5,2024,22,1784142544451618820240522audience_citylifetime,"value.São Paulo, São Paulo (state)",180.0
1,1,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil,gabgalani,17841417530400616,2024-05-22,5,2024,22,1784141753040061620240522audience_citylifetime,"value.São Paulo, São Paulo (state)",107.0
2,2,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil,bodemeier.digital,17841425444516188,2024-05-22,5,2024,22,1784142544451618820240522audience_citylifetime,"value.Rio de Janeiro, Rio de Janeiro (state)",24.0
3,3,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil,gabgalani,17841417530400616,2024-05-22,5,2024,22,1784141753040061620240522audience_citylifetime,"value.Rio de Janeiro, Rio de Janeiro (state)",7.0
4,4,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil,bodemeier.digital,17841425444516188,2024-05-22,5,2024,22,1784142544451618820240522audience_citylifetime,"value.Yuma, Arizona",20.0


In [496]:
FTbAccountLifetimeInsights = pd.merge(FTbAccountLifetimeInsights, DTbAccountInsights, on='name', how= 'inner')
FTbAccountLifetimeInsights.head(1)

Unnamed: 0.1,Unnamed: 0,name,period,title_x,description_x,username,id_account,extract_date,period_extraction,year,day,id_tb_account,age_gender,value,id_insight,frequencia,title_y,description_y
0,0,audience_city,lifetime,Cidade do público,Cidades dos seguidores deste perfil,bodemeier.digital,17841425444516188,2024-05-22,5,2024,22,1784142544451618820240522audience_citylifetime,"value.São Paulo, São Paulo (state)",180.0,1,lifetime,Cidade do público,Cidades dos seguidores deste perfil


Selecionando colunas

In [497]:
select_columns = [
    'id_tb_account',
    'id_account',
    'id_insight',
    'extract_date',
    'period_extraction',
    'year',
    'day',
    'name',
    'age_gender',
    'value'
]

FTbAccountLifetimeInsights = FTbAccountLifetimeInsights[select_columns]
FTbAccountLifetimeInsights.head()

Unnamed: 0,id_tb_account,id_account,id_insight,extract_date,period_extraction,year,day,name,age_gender,value
0,1784142544451618820240522audience_citylifetime,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"value.São Paulo, São Paulo (state)",180.0
1,1784141753040061620240522audience_citylifetime,17841417530400616,1,2024-05-22,5,2024,22,audience_city,"value.São Paulo, São Paulo (state)",107.0
2,1784142544451618820240522audience_citylifetime,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"value.Rio de Janeiro, Rio de Janeiro (state)",24.0
3,1784141753040061620240522audience_citylifetime,17841417530400616,1,2024-05-22,5,2024,22,audience_city,"value.Rio de Janeiro, Rio de Janeiro (state)",7.0
4,1784142544451618820240522audience_citylifetime,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"value.Yuma, Arizona",20.0


Renomando colunas

In [498]:
rename_columns = {
    'id_tb_account': 'id',
    'name': 'definicao',
    'age_gender': 'value_descricao'
}

FTbAccountLifetimeInsights.rename(columns=rename_columns, inplace=True)
FTbAccountLifetimeInsights['value_descricao'] = FTbAccountLifetimeInsights['value_descricao'].str.replace('value.', '')
FTbAccountLifetimeInsights.head(5)


Unnamed: 0,id,id_account,id_insight,extract_date,period_extraction,year,day,definicao,value_descricao,value
0,1784142544451618820240522audience_citylifetime,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"São Paulo, São Paulo (state)",180.0
1,1784141753040061620240522audience_citylifetime,17841417530400616,1,2024-05-22,5,2024,22,audience_city,"São Paulo, São Paulo (state)",107.0
2,1784142544451618820240522audience_citylifetime,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"Rio de Janeiro, Rio de Janeiro (state)",24.0
3,1784141753040061620240522audience_citylifetime,17841417530400616,1,2024-05-22,5,2024,22,audience_city,"Rio de Janeiro, Rio de Janeiro (state)",7.0
4,1784142544451618820240522audience_citylifetime,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"Yuma, Arizona",20.0


Tratando valores que não são numeros

In [499]:
def replace_non_numeric(value):
    try:
        float(value)  # Tenta converter o valor para float
        return value  # Se for numérico, mantenha o valor
    except ValueError:
        return 0  # Se não for numérico, substitua por 0

# Aplique a função à coluna 'value'
FTbAccountLifetimeInsights['value'] = FTbAccountLifetimeInsights['value'].apply(lambda x: replace_non_numeric(x))
FTbAccountLifetimeInsights['value'] = FTbAccountLifetimeInsights['value'].fillna(0)

Tipagem dos dados

In [500]:
FTbAccountLifetimeInsights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 336 non-null    object 
 1   id_account         336 non-null    int64  
 2   id_insight         336 non-null    int64  
 3   extract_date       336 non-null    object 
 4   period_extraction  336 non-null    int64  
 5   year               336 non-null    int64  
 6   day                336 non-null    int64  
 7   definicao          336 non-null    object 
 8   value_descricao    336 non-null    object 
 9   value              336 non-null    float64
dtypes: float64(1), int64(5), object(4)
memory usage: 26.4+ KB


In [501]:
FTbAccountLifetimeInsights['extract_date'] = pd.to_datetime(FTbAccountLifetimeInsights['extract_date'])

type_columns = {
    'id_account': 'object',
    'period_extraction': 'object',
    'year': 'object',
    'day': 'object'
}

FTbAccountLifetimeInsights = FTbAccountLifetimeInsights.astype(type_columns)
FTbAccountLifetimeInsights.drop(columns='id', inplace=True)
FTbAccountLifetimeInsights['id'] = FTbAccountLifetimeInsights['id_account'].astype(str) + FTbAccountLifetimeInsights['id_insight'].astype(str) +  FTbAccountLifetimeInsights['extract_date'].astype(str) + FTbAccountLifetimeInsights['value_descricao'] 
FTbAccountLifetimeInsights.head()

Unnamed: 0,id_account,id_insight,extract_date,period_extraction,year,day,definicao,value_descricao,value,id
0,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"São Paulo, São Paulo (state)",180.0,"1784142544451618812024-05-22São Paulo, São Pau..."
1,17841417530400616,1,2024-05-22,5,2024,22,audience_city,"São Paulo, São Paulo (state)",107.0,"1784141753040061612024-05-22São Paulo, São Pau..."
2,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"Rio de Janeiro, Rio de Janeiro (state)",24.0,"1784142544451618812024-05-22Rio de Janeiro, Ri..."
3,17841417530400616,1,2024-05-22,5,2024,22,audience_city,"Rio de Janeiro, Rio de Janeiro (state)",7.0,"1784141753040061612024-05-22Rio de Janeiro, Ri..."
4,17841425444516188,1,2024-05-22,5,2024,22,audience_city,"Yuma, Arizona",20.0,"1784142544451618812024-05-22Yuma, Arizona"


In [503]:
colunas = ['id'] + [coluna for coluna in FTbAccountLifetimeInsights if coluna != 'id']
FTbAccountLifetimeInsights = FTbAccountLifetimeInsights[colunas]
FTbAccountLifetimeInsights.head()

Unnamed: 0,id,id_account,id_insight,extract_date,period_extraction,year,day,definicao,value_descricao,value
0,"1784142544451618812024-05-22São Paulo, São Pau...",17841425444516188,1,2024-05-22,5,2024,22,audience_city,"São Paulo, São Paulo (state)",180.0
1,"1784141753040061612024-05-22São Paulo, São Pau...",17841417530400616,1,2024-05-22,5,2024,22,audience_city,"São Paulo, São Paulo (state)",107.0
2,"1784142544451618812024-05-22Rio de Janeiro, Ri...",17841425444516188,1,2024-05-22,5,2024,22,audience_city,"Rio de Janeiro, Rio de Janeiro (state)",24.0
3,"1784141753040061612024-05-22Rio de Janeiro, Ri...",17841417530400616,1,2024-05-22,5,2024,22,audience_city,"Rio de Janeiro, Rio de Janeiro (state)",7.0
4,"1784142544451618812024-05-22Yuma, Arizona",17841425444516188,1,2024-05-22,5,2024,22,audience_city,"Yuma, Arizona",20.0


Carregando apenas linnhas que tem valores

In [506]:
FTbAccountLifetimeInsights = FTbAccountLifetimeInsights[FTbAccountLifetimeInsights['value'] != 0]

Carregando no banco

In [508]:
engine = connect_to_database('DW')
table_name = 'FTbAccountLifetimeInsights'
key_column = 'id'

if not engine: 
    print('conexão fail')
    sys.exit()

with engine.connect() as connection:
    for i, row in FTbAccountLifetimeInsights.iterrows():
        query = text(f"select count(*) from {table_name} WHERE {key_column} = :value")
        result = connection.execute(query, parameters=dict(value=row[key_column]))
        count = result.scalar()

        if count > 0:
            print(f'Registro com {key_column} = {row[key_column]} já existe.')
        else: 
            row.to_frame().T.to_sql(table_name, con=connection, if_exists='append', index=False)
            print(f"Registro com {key_column} = {row[key_column]} inserido com sucesso.")
    connection.commit()
    connection.close()

Registro com id = 1784142544451618812024-05-22São Paulo, São Paulo (state) inserido com sucesso.
Registro com id = 1784141753040061612024-05-22São Paulo, São Paulo (state) inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Rio de Janeiro, Rio de Janeiro (state) inserido com sucesso.
Registro com id = 1784141753040061612024-05-22Rio de Janeiro, Rio de Janeiro (state) inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Yuma, Arizona inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Edinburg, Texas inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Ulukısla, Niğde Province inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Abington, Pennsylvania inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Sioux Falls, South Dakota inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Golpazarı, Bilecik Province inserido com sucesso.
Registro com id = 1784142544451618812024-05-22Alpu, 

## Tabelas de midias

In [511]:
file_path = r'C:\Users\gabri\OneDrive\Documentos\Projetos\Instagram_data\api\silver\TbMedias.csv'
dataset = pd.read_csv(file_path, sep='\t')
FTbMidias = dataset
dataset.head()

Unnamed: 0.1,Unnamed: 0,id_account,username,username.1,id_midia,comments_count,like_count,media_type,media_url,caption,timestamp,permalink,media_product_type,thumbnail_url,shortcode,extract_date,period,year,day,id_tb_midia
0,0,17841425444516188,bodemeier.digital,bodemeier.digital,18005273918369940,2,15,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,Você sabe o que é um briefing? 📝\n\n🎥 Assista ...,2024-02-20 21:19:22+00:00,https://www.instagram.com/reel/C3lW1XkOcfL/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3lW1XkOcfL,2024-05-22,5,2024,22,1800527391836994020240522
1,1,17841425444516188,bodemeier.digital,bodemeier.digital,17846370906157882,1,13,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,Uma grande dúvida que vocês me perguntam: SOCI...,2024-02-16 21:00:00+00:00,https://www.instagram.com/reel/C3bDRy3uifD/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3bDRy3uifD,2024-05-22,5,2024,22,1784637090615788220240522
2,2,17841425444516188,bodemeier.digital,bodemeier.digital,18002752616172668,0,5,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,“O maior instrumento da globalização cultural ...,2024-02-10 20:05:55+00:00,https://www.instagram.com/reel/C3Lf9wLOhpf/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3Lf9wLOhpf,2024-05-22,5,2024,22,1800275261617266820240522
3,3,17841425444516188,bodemeier.digital,bodemeier.digital,17971594208670852,0,7,VIDEO,,‼️ É isso que o Instagram prioriza!\n\nAcredit...,2024-02-09 18:00:00+00:00,https://www.instagram.com/reel/C3KvokquZdf/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3KvokquZdf,2024-05-22,5,2024,22,1797159420867085220240522
4,4,17841425444516188,bodemeier.digital,bodemeier.digital,18062694892504236,5,7,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,✨Aqui está: \n\nMas primeiro já me segue para ...,2024-02-09 13:00:00+00:00,https://www.instagram.com/reel/C3IK23sudbS/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3IK23sudbS,2024-05-22,5,2024,22,1806269489250423620240522


In [512]:
select_columns = [
    'id_midia',
    'id_account',
    'media_type',
    'media_url',
    'caption',
    'timestamp',
    'permalink',
    'media_product_type',
    'thumbnail_url',
    'shortcode'
]
DTbMidias = dataset[select_columns]
DTbMidias.head()

Unnamed: 0,id_midia,id_account,media_type,media_url,caption,timestamp,permalink,media_product_type,thumbnail_url,shortcode
0,18005273918369940,17841425444516188,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,Você sabe o que é um briefing? 📝\n\n🎥 Assista ...,2024-02-20 21:19:22+00:00,https://www.instagram.com/reel/C3lW1XkOcfL/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3lW1XkOcfL
1,17846370906157882,17841425444516188,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,Uma grande dúvida que vocês me perguntam: SOCI...,2024-02-16 21:00:00+00:00,https://www.instagram.com/reel/C3bDRy3uifD/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3bDRy3uifD
2,18002752616172668,17841425444516188,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,“O maior instrumento da globalização cultural ...,2024-02-10 20:05:55+00:00,https://www.instagram.com/reel/C3Lf9wLOhpf/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3Lf9wLOhpf
3,17971594208670852,17841425444516188,VIDEO,,‼️ É isso que o Instagram prioriza!\n\nAcredit...,2024-02-09 18:00:00+00:00,https://www.instagram.com/reel/C3KvokquZdf/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3KvokquZdf
4,18062694892504236,17841425444516188,VIDEO,https://scontent.cdninstagram.com/o1/v/t16/f1/...,✨Aqui está: \n\nMas primeiro já me segue para ...,2024-02-09 13:00:00+00:00,https://www.instagram.com/reel/C3IK23sudbS/,REELS,https://scontent.cdninstagram.com/v/t51.29350-...,C3IK23sudbS
