In [1]:
#-----------------------------------------------------
# SQL_jupyterlab to create and populate databases
#
# Next updates:
# => add a function for the selection by df.index
# => put all functions in a .py file
# => move departement out of commune into another table
#-----------------------------------------------------

import pandas as pd
import numpy as np

# read the spreadsheet and discard rows that are exact duplicates
df = pd.read_excel("immobilier.xlsx").drop_duplicates()

# normalise the headers: lower-case, spaces replaced by underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# b/t/q and no_volume contain too many NaN values, so both columns are removed
df = df.drop(columns=["b/t/q", "no_volume"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34166 entries, 0 to 34165
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   date_mutation              34166 non-null  datetime64[ns]
 1   nature_mutation            34166 non-null  object        
 2   valeur_fonciere            34148 non-null  float64       
 3   no_voie                    34033 non-null  float64       
 4   code_type_de_voie          34166 non-null  int64         
 5   type_de_voie               33226 non-null  object        
 6   code_voie                  34166 non-null  object        
 7   voie                       34166 non-null  object        
 8   code_id_commune            34166 non-null  int64         
 9   code_postal                34165 non-null  float64       
 10  commune                    34166 non-null  object        
 11  code_departement           34166 non-null  object        
 12  code

  warn(msg)


In [2]:
# the columns of every table are picked by hand, by position in df.columns
# (the slices are hard-coded, so they depend on the column order of df)

# columns required in table sell
sell_columns = df.columns[:3].tolist()

# columns required in table street
street_columns = df.columns[3:8].tolist()

# columns required in table commune
commune_columns = df.columns[8:13].tolist()

# columns required in table features
features_columns = df.columns[13:].tolist()


# map every table name to its list of columns
table_dict = dict(
    sell=sell_columns,
    features=features_columns,
    street=street_columns,
    commune=commune_columns,
)

# show the content of each table
for key, item in table_dict.items():
    print(f"{key} :")
    print(item, "\n")

# references between tables: table w referenced_by u, v
# => {w:[u, v]}
ref_dict = {"sell": ["features", "street", "commune"]}

sell :
['date_mutation', 'nature_mutation', 'valeur_fonciere'] 

features :
['section', 'no_plan', '1er_lot', 'surface_carrez_du_1er_lot', 'nombre_de_lots', 'code_type_local', 'type_local', 'surface_reelle_bati', 'nombre_pieces_principales'] 

street :
['no_voie', 'code_type_de_voie', 'type_de_voie', 'code_voie', 'voie'] 

commune :
['code_id_commune', 'code_postal', 'commune', 'code_departement', 'code_commune'] 



In [3]:
from csv_tools import *

# tables_dict to save to tables.csv
def define_table_list(table_dict, frame=None):
    """Flatten ``table_dict`` into one record per (table, column) pair.

    Parameters
    ----------
    table_dict : dict
        Maps a table name to the list of column names of that table.
    frame : pandas.DataFrame, optional
        DataFrame whose column dtypes are reported. When omitted, the
        notebook-level ``df`` is used, which was the only behaviour before.

    Returns
    -------
    list of dict
        Records with the keys ``"table_name"``, ``"column_name"`` and
        ``"data_type"`` (the ``dtypes`` of the matching column), in the
        iteration order of ``table_dict``.
    """
    # fall back to the global dataframe only when no frame is given explicitly,
    # so existing calls keep working while the hidden dependency becomes optional
    source = df if frame is None else frame

    # one record per table attribute
    return [
        {"table_name": table_name,
         "column_name": column_name,
         "data_type": source[column_name].dtypes}
        for table_name, column_names in table_dict.items()
        for column_name in column_names
    ]

# flatten table_dict into records and save them to tables.csv
# (dict_to_csv and define_reference_list are not defined in this notebook;
#  presumably they come from the star import of csv_tools — confirm)
table_to_csv = define_table_list(table_dict)
dict_to_csv(table_to_csv, "tables.csv")

# same for ref_dict, saved to references.csv
# (this last call is the cell's final expression, so its return value is displayed)
ref_to_csv = define_reference_list(ref_dict)
dict_to_csv(ref_to_csv, "references.csv")

'Dictionary saved to csv successfully'

In [4]:
# script to create all necessary tables
# NOTE(review): presumably defines metadata_obj, which the following cells
# use without defining it — confirm in tables_generator.py
%run tables_generator.py

metadata_obj is online
Tables creation successful
Columns added to Tables successfully 



In [5]:
from sqlalchemy import create_engine

# connect to immobilier_db (SQLite file immobilier.db)
# echo = True makes the engine log its SQL statements, see the output below
engine = create_engine('sqlite:///immobilier.db', echo = True)
# this connection is used by populate_table and closed in the last cell
connection = engine.connect()

# create all tables in metadata_obj
# (metadata_obj is not defined in this notebook; presumably it is provided
#  by the %run of tables_generator.py — confirm)
metadata_obj.create_all(engine)

2022-10-25 13:51:54,179 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-10-25 13:51:54,180 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("sell")
2022-10-25 13:51:54,181 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 13:51:54,181 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("sell")
2022-10-25 13:51:54,182 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 13:51:54,182 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("features")
2022-10-25 13:51:54,182 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 13:51:54,183 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("features")
2022-10-25 13:51:54,183 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 13:51:54,183 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("commune")
2022-10-25 13:51:54,183 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 13:51:54,184 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("commune")
2022-10-25 13:51:54,184 INFO sqlalchemy.engine.Engine [raw sql] ()
202

In [6]:
""" functions to add fkeys to table dataframe
    in order to insert it in database"""    

    
# define reference dataframe with foreign key as index    
def define_reference_df(df, table_dict, reference):
    """Build the reference dataframe of the table ``reference``.

    The result holds one row per distinct combination of the columns listed
    in ``table_dict[reference]``, sorted by those columns. Its index is named
    ``<reference>_id`` and serves as the foreign key.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe that contains the columns of ``reference``.
    table_dict : dict
        Maps a table name to the list of its column names.
    reference : str
        Name of the referenced table; must be a key of ``table_dict``.

    Returns
    -------
    pandas.DataFrame
        The distinct combinations, with the columns of
        ``table_dict[reference]`` and an index named ``reference + "_id"``.

    Raises
    ------
    KeyError
        If ``reference`` is not a key of ``table_dict``.
    """
    # get ref_columns
    ref_columns = table_dict[reference]

    # One group per distinct combination of the reference columns.
    # size() needs no column selection, unlike the former eval()-built
    # attribute access, which raised a SyntaxError for column names that are
    # not valid Python identifiers (e.g. "1er_lot").
    # With the pandas default dropna=True, combinations containing NaN
    # are left out, as before.
    groups = df.groupby(ref_columns).size()

    # keep only the group keys, as ordinary columns with a fresh integer index
    ref_df = groups.index.to_frame(index=False)
    ref_df.index.name = reference + "_id"

    return ref_df

def add_foreign_key(df, table_dict, table, reference):
    """Add the ``<reference>_id`` foreign-key column to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe that contains the columns of ``reference``.
    table_dict : dict
        Maps a table name to the list of its column names.
    table : str
        Name of the table that receives the foreign key; must be a key of
        ``table_dict``.
    reference : str
        Name of the referenced table; must be a key of ``table_dict``.

    Returns
    -------
    tuple
        ``(df, ref_df)``: ``df`` with the extra ``<reference>_id`` column,
        and the reference dataframe returned by ``define_reference_df``.

    Raises
    ------
    KeyError
        If ``table`` or ``reference`` is not a key of ``table_dict``.
    """
    # we get the reference dataframe
    ref_df = define_reference_df(df, table_dict, reference)

    # an unknown table keeps raising KeyError, as it did when the (unused)
    # column list of the table was looked up here
    if table not in table_dict:
        raise KeyError(table)

    # we merge both dataframes to add the foreign key column; the join keys
    # are the reference columns, stated explicitly instead of relying on
    # whatever columns the two frames happen to share.
    # how="left" keeps every row of df; rows without a match get a missing key
    df = df.merge(ref_df.reset_index(), how="left", on=list(table_dict[reference]))

    return df, ref_df

# populate table from sub_dataframe
def populate_table(df, table):
    """Insert the rows of ``df`` into the SQL table named ``table``.

    Uses the notebook-level ``metadata_obj`` (to look the table up) and
    ``connection`` (to execute the INSERT).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to insert; column names are used as the keys of each record.
    table : str
        Key of the target table in ``metadata_obj.tables``.

    Returns
    -------
    str
        A status message.
    """
    # select Table in metadata_obj
    sql_table = metadata_obj.tables[table]

    # dataframe to list of dicts, one per row
    # NOTE(review): orient="records" omits the DataFrame index, so an
    # index-based key (e.g. "<reference>_id") is not sent to the database —
    # confirm the keys generated by the table match the ones used in df
    ins = df.to_dict(orient="records")

    # nothing to insert: an empty parameter list would still run the INSERT
    # statement (presumably without bound values), so skip it
    if not ins:
        return "No data to add to table "+str(table)

    # execute INSERT sql statement
    connection.execute(sql_table.insert(), ins)

    return "Data add to table "+str(table)+" sucessfully"

# main program   
def main(df, table_dict, references_list):
    """Split ``df`` into its tables and load them into the database.

    For every table named in ``references_list``, each table listed as its
    ``referenced_by`` gets a reference dataframe that is inserted first;
    the table itself is inserted afterwards, from ``df`` extended with the
    foreign-key columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The full dataframe.
    table_dict : dict
        Maps a table name to the list of its column names.
    references_list : list of dict
        Records with the keys ``"table_name"`` and ``"referenced_by"``.
    """
    # distinct tables that receive foreign keys
    tables = {entry["table_name"] for entry in references_list}

    for table in tables:

        # reference tables attached to the current table
        linked = [entry["referenced_by"]
                  for entry in references_list
                  if entry["table_name"] == table]

        for reference in linked:

            # extend df with the foreign key and get the reference dataframe
            df, ref_df = add_foreign_key(df, table_dict, table, reference)

            # the reference table goes to the database first
            populate_table(ref_df, reference)

        # then the table itself
        populate_table(df, table)

        
# we load all references back from references.csv and run the dataframe
# segmentation, which inserts the reference tables and then the main table
# (csv_to_dict is not defined in this notebook; presumably it comes from
#  the star import of csv_tools — confirm)
references_list = csv_to_dict("references.csv")
main(df, table_dict, references_list)

2022-10-25 13:51:54,516 INFO sqlalchemy.engine.Engine INSERT INTO features (section, no_plan, "1er_lot", surface_carrez_du_1er_lot, nombre_de_lots, code_type_local, type_local, surface_reelle_bati, nombre_pieces_principales) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
2022-10-25 13:51:54,517 INFO sqlalchemy.engine.Engine [generated in 0.13174s] (('A', 1, 13, 40.94, 2, 2, 'Appartement', 40, 3), ('A', 2, 18, 23.77, 1, 2, 'Appartement', 24, 1), ('A', 3, 20, 27.24, 1, 2, 'Appartement', 24, 1), ('A', 3, 25, 38.54, 1, 2, 'Appartement', 40, 2), ('A', 4, 18, 83.5, 1, 2, 'Appartement', 84, 3), ('A', 5, 297, 66.72, 1, 2, 'Appartement', 67, 3), ('A', 8, 5, 256.55, 1, 2, 'Appartement', 265, 7), ('A', 8, 10, 26.53, 2, 2, 'Appartement', 28, 2)  ... displaying 10 of 34156 total bound parameter sets ...  ('ZY', 40, 1, 37.14, 1, 2, 'Appartement', 37, 1), ('ZY', 177, 47, 43.2, 2, 2, 'Appartement', 43, 2))
2022-10-25 13:51:54,573 INFO sqlalchemy.engine.Engine COMMIT
2022-10-25 13:51:54,767 INFO sqlalchemy.engine.

In [7]:
# finally we close the database connection opened in cell 5,
# then dispose of the engine
connection.close()
engine.dispose()