In [19]:
import psycopg2
import psycopg2.extras as extras
import config_warehouse as creds
import config_lake as creds_lake
import pandas as pd
import numpy as np
from psycopg2.extensions import register_adapter, AsIs

psycopg2.extensions.register_adapter(np.int64, psycopg2._psycopg.AsIs) #allow int64 for db import

In [20]:
# Show Tables

def Show_tables():
    try:
        # DB connection
        conn_string = "host="+ creds_lake.PGHOST +" port="+ "5432" +" dbname="+ creds_lake.PGDATABASE +" user=" + creds_lake.PGUSER +" password="+ creds_lake.PGPASSWORD
        conn = psycopg2.connect(conn_string)
        cur = conn.cursor()
        # Select data
        cur.execute("""SELECT table_name FROM information_schema.tables
        WHERE table_schema = 'public'""")
        records = cur.fetchall()
        #print(records)
        #print(type(records[0][0]))
        for item in records:
            print(item)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()

Show_tables()

('electricityproductionplants',)
('measuringpoints',)
('measuringpointscoordinates',)
('electriccars21',)
('ZIPCode',)
('chargingstations_static',)
('chargingstations_locations',)


In [24]:
def create_tables():
    """ create tables in the PostgreSQL database"""
    commands = (
        """
        CREATE TABLE IF NOT EXISTS Electric_cars_21 (
            ID SERIAL PRIMARY KEY,
            Vehicle_group VARCHAR(255) NOT NULL,
            Year_of_first_registration VARCHAR(255) NOT NULL,
            Fuel VARCHAR(255) NOT NULL,
            Count_2021 Int NOT NULL,
            Canton_abbreviation VARCHAR(255) NOT NULL,
            FOREIGN KEY (Canton_abbreviation) REFERENCES regions_and_cantons(canton_abbreviation))
        """,
        """
        CREATE TABLE IF NOT EXISTS Measuring_points (
            Nr_measuring_point Int PRIMARY KEY,
            Point_name VARCHAR(255) NOT NULL,
            Status VARCHAR(255) NOT NULL,
            Type_point VARCHAR(255) NOT NULL,
            Road VARCHAR(255) NOT NULL,
            Canton_abbreviation CHAR(2) NOT NULL,
            FOREIGN KEY (Canton_abbreviation) REFERENCES regions_and_cantons(canton_abbreviation))
        """,
        """
        CREATE TABLE IF NOT EXISTS Traffic_measurement_21 (
            Nr Int PRIMARY KEY,
            Nr_measuring_point Int NOT NULL,
            Annual_average Int,
            FOREIGN KEY (Nr_measuring_point) REFERENCES Measuring_points(Nr_measuring_point))
        """,
        """
        CREATE TABLE IF NOT EXISTS Measuring_points_coordinates (
            Nr Serial PRIMARY KEY,
            Nr_measuring_point Int NOT NULL,
            Coordinate_east Int NOT NULL,
            Coordinate_nord Int NOT NULL,
            FOREIGN KEY (Nr_measuring_point) REFERENCES Measuring_points(Nr_measuring_point))
        """,
        """
        CREATE TABLE IF NOT EXISTS Main_categories_electricity_production (
            ID_main_category Int PRIMARY KEY,
            Name_main_category VARCHAR(255) NOT NULL)
        """,
        """
        CREATE TABLE IF NOT EXISTS Sub_categories_electricity_production (
            ID_sub_category Int PRIMARY KEY,
            Name_sub_category VARCHAR(255) NOT NULL,
            ID_main_category Int NOT NULL,
            FOREIGN KEY (ID_main_category)
            REFERENCES Main_categories_electricity_production(ID_main_category))
        """,
        """
        CREATE TABLE IF NOT EXISTS Electricity_production_plants (
            xtf_id Int PRIMARY KEY,
            ID_postal_code Int NOT NULL,
            Municipality VARCHAR(255) NOT NULL,
            Canton_abbreviation CHAR(2) NOT NULL,
            ID_sub_category INT NOT NULL,
            Total_power FLOAT NOT NULL,
            Avg_monthly_production_kwh FLOAT,
            FOREIGN KEY (ID_postal_code) REFERENCES Postal_codes(ID_postal_code),
            FOREIGN KEY (Canton_abbreviation) REFERENCES regions_and_cantons(canton_abbreviation),
            FOREIGN KEY (ID_sub_category)
            REFERENCES Sub_categories_electricity_production(ID_sub_category))
         """
        #,
        # """
        # CREATE TABLE IF NOT EXISTS Operators (
        #     Operator_ID VARCHAR(255) PRIMARY KEY,
        #     Name_operator VARCHAR(255) NOT NULL)
        # """,
        # """
        # CREATE TABLE IF NOT EXISTS Charging_stations_static (
        #     Evse_ID VARCHAR(255) PRIMARY KEY,
        #     Charging_station_ID VARCHAR(255) NOT NULL,
        #     Operator_ID VARCHAR(255) NOT NULL,
        #     City VARCHAR(255) NOT NULL,
        #     Street VARCHAR(255) NOT NULL,
        #     Power INT NOT NULL,
        #     ID_postal_code Int NOT NULL,
        #     FOREIGN KEY (Operator_ID) REFERENCES Operators(Operator_ID),
        #     FOREIGN KEY (ID_postal_code) REFERENCES Postal_codes(ID_postal_code))
        # """,
        # """
        # CREATE TABLE IF NOT EXISTS Charging_stations_occupancy (
        #     Occupancy_ID SERIAL PRIMARY KEY,
        #     Evse_ID VARCHAR(255) NOT NULL,
        #     Daily_occupancy INT NOT NULL,
        #     Daily_kwH FLOAT NOT NULL,
        #     Daily_cars INT NOT NULL,
        #     Day_id DATE NOT NULL,
        #     FOREIGN KEY (Evse_ID) REFERENCES Charging_stations_static(Evse_ID))
        # """,
        # """
        # CREATE TABLE IF NOT EXISTS Chargin_stations_location (
        #     Location_ID Int PRIMARY KEY,
        #     Evse_ID VARCHAR(255) NOT NULL,
        #     Coordinate_east Int NOT NULL,
        #     Coordinate_nord Int NOT NULL,
        #     FOREIGN KEY (Evse_ID) REFERENCES Charging_stations_static(Evse_ID))
        # """
    )

    #  PRIMARY KEY (Postal_code)

    try:
        # Set up a connection to the postgres server.
        conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD

        conn = psycopg2.connect(conn_string)
        print("Connected!")

        # Create a cursor object
        cursor = conn.cursor()

        # Create tables
        for command in commands:
            cursor.execute(command)
        print('Tables created')

        # Import data from dataframe
        #execute_values(conn, fetch_data_plz(), 'Postal_codes')

        # close communication with the PostgreSQL database server
        cursor.close()
        # commit the changes
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()

create_tables()

Connected!
Tables created


In [22]:
# Inserting dataframe into database
def execute_values(conn, df, table):
    # Set up a connection to the warehouse
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
              +" password="+ creds.PGPASSWORD

    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()

    # Set up data
    tuples = [tuple(x) for x in df.to_numpy()]
    cols = ','.join(list(df.columns))
    # SQL query to execute
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cur = conn.cursor()
    try:
        extras.execute_values(cur, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        return
    print("the dataframe is inserted")
    cur.close()

In [25]:
# Fetch all table's data

tables_lake = ['Measuring_points_coordinates', 'postal_codes', 'charging_stations_occupancy',
          'electric_cars_21', 'electricity_production_plants', 'measuring_points', 'charging_stations_static', 'charging_stations_locations'] #tables lake with warehouse name's!

def fetch_all_data():
    try:
        # DB connection
        conn_string = "host="+ creds_lake.PGHOST +" port="+ "5432" +" dbname="+ creds_lake.PGDATABASE +" user=" + creds_lake.PGUSER +" password="+ creds_lake.PGPASSWORD
        conn = psycopg2.connect(conn_string)
        cur = conn.cursor()

        # Select table names
        cur.execute("""SELECT table_name FROM information_schema.tables
        WHERE table_schema = 'public'""")
        records = cur.fetchall()
        records_list = []
        for item in records:
            item = str(item)
            records_list.append(item[2:-3])
        print(records_list)

        # Select data
        for table_name in range(0, len(records_list)):
            if records_list[table_name] == 'ZIPCode' or records_list[table_name] == 'RealTimeStatus' or \
                records_list[table_name] == 'measuringpoints':
                continue
            # if records_list[table_name] != 'electricityproductionplants':
            #     continue
            else:
                cur.execute((""" SELECT * FROM {}""").format(records_list[table_name])) # passing string without quotation marks
                df_records = cur.fetchall()
                df = pd.DataFrame(df_records)
                #print(df)

                if records_list[table_name] == 'measuringpointscoordinates':
                    df_selected = df.copy()
                    #Name columns
                    df_selected.columns = ['Nr_measuring_point','Status_BGDI','Point_name',
                                           'Canton_abbreviation','Road', 'Coordinate_East', 'Coordinate_Nord', 'Status', 'Type_Point', 'Number_of_Lanes']
                    #Filter needed rows
                    df_selected = df_selected.loc[df_selected['Status'] == 'in Betrieb']
                    df_selected = df_selected.loc[df_selected['Type_Point'] != 'nur Online']

                    #Dataframe Measuring_points_coordinates set up
                    df_coordinates = df_selected[['Nr_measuring_point', 'Coordinate_East',
                                                  'Coordinate_Nord',]]

                    #Dataframe Measuring_points; Drop not needed columns
                    df_selected.drop(['Status_BGDI','Coordinate_East','Coordinate_Nord','Number_of_Lanes'], axis=1, inplace=True)

                    # import to warehouse
                    execute_values(conn, df_selected, 'Measuring_points')
                    execute_values(conn, df_coordinates, 'Measuring_points_coordinates')
                    #print(df_selected)

                elif records_list[table_name] == 'realtimestatus':
                    continue

                elif records_list[table_name] == 'electriccars21':
                    #Name columns
                    df.columns = ['id','Canton_abbreviation','Vehicle_group',
                                           'Year_of_first_registration','Fuel', 'Count_2021']
                    #Filter needed rows
                    df_selected = df[(df['Canton_abbreviation'] != 'Switzerland') &
                                     (df['Canton_abbreviation'] != 'Confederation')]

                    #Map Canton names with Canton Abbreviations
                    dict_cantons = {'Zürich' : 'ZH', 'Bern' : 'BE', 'Luzern' : 'LU', 'Uri' : 'UR',
                                    'Schwyz' : 'SZ', 'Obwalden' : 'OW', 'Nidwalden' : 'NW',
                                    'Glarus' : 'GL', 'Zug' : 'ZG', 'Fribourg' : 'FR',
                                    'Solothurn' : 'SO', 'Basel-Stadt' : 'BS', 'Basel-Landschaft' : 'BL',
                                    'Schaffhausen' : 'SH', 'Appenzell-Ausserrhoden' : 'AR',
                                    'Appenzell-Innerrhoden' : 'AI', 'Sankt Gallen' : 'SG',
                                    'Graubünden' : 'GR', 'Aargau' : 'AG', 'Thurgau' : 'TG',
                                    'Ticino' : 'TI', 'Vaud' : 'VD', 'Valais' : 'VS', 'Neuchâtel' : 'NE',
                                    'Genève' : 'GE', 'Jura' : 'JU'}

                    df_selected['Canton_abbreviation'] = df_selected['Canton_abbreviation'].map(dict_cantons)
                    df_selected['Canton_abbreviation'] = df_selected['Canton_abbreviation'].astype(str)
                    #print(df_selected['Canton_abbreviation'])
                    #print(df_selected.info())

                    # import to warehouse
                    execute_values(conn, df_selected, 'Electric_cars_21')

                elif records_list[table_name] == 'electricityproductionplants':
                    # DFs main/sub categories
                    main_cat_dict = {'ID_main_category' : [1,2,3,4], 'Name_main_category' : ['Fossil fuel',
                                                                                             'Hydroelectric power',
                                                                                             'Nuclear energy',
                                                                                             'Other renewable energies']}
                    df_selected_maincat = pd.DataFrame.from_dict(main_cat_dict)
                    sub_cat_dict = {'ID_sub_category' : [1,2,3,4,5,6,7,8], 'Name_sub_category' : ['Biomass',
                                                                                                  'Crude oil',
                                                                                                  'Hydroelectric power',
                                                                                                  'Natural gas',
                                                                                                  'Nuclear energy',
                                                                                                  'Photovoltaic',
                                                                                                  'Waste',
                                                                                                  'Wind energy'],
                                    'ID_main_category' : [4,1,2,1,3,4,4,4]}
                    df_selected_subcat = pd.DataFrame.from_dict(sub_cat_dict)

                    # DF Electricity production plants
                    # Rename column names / select needed columns

                    df.columns = ['xtf_id', 'id_postal_code', 'Municipality', 'Canton_abbreviation','BoO', 'MC',
                                  'ID_sub_category', 'IP','total_power']


                    df_selected = df[['xtf_id', 'id_postal_code', 'Municipality', 'Canton_abbreviation',
                                  'ID_sub_category', 'total_power']]

                    # Map subcategories
                    dict_subcat = {'Biomass' : 1,                                                                                                'Crude oil' : 2,
                                   'Hydroelectric power' : 3,
                                   'Natural gas' : 4,
                                   'Nuclear energy' : 5,
                                   'Photovoltaic' : 6,
                                   'Waste' : 7,
                                   'Wind energy' : 8}
                    df_selected['ID_sub_category'] = df_selected['ID_sub_category'].map(dict_subcat)

                    # Map postal code
                    try:

                        # Set up a connection to the warehouse
                        conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                                  +" password="+ creds.PGPASSWORD
                        conn = psycopg2.connect(conn_string)
                        print("Warehouse connected!")

                        # Create a cursor object
                        cursor = conn.cursor()

                        # # Catch Postal Codes
                        cursor.execute((""" SELECT * FROM Postal_codes"""))
                        df_records = cursor.fetchall()
                        df_postal_codes = pd.DataFrame(df_records, dtype='int')


                        # Append missing postal codes to relation postal_codes in warehouse
                        missing_plz_df = df[['id_postal_code','Canton_abbreviation']][df['id_postal_code']
                                                                                          .isin(df_postal_codes[1]) == False]
                        # correct column names
                        missing_plz_df.rename(columns= {'id_postal_code' : 'postal_code'}, inplace=True)
                        print(missing_plz_df)
                        # Add primary key column
                        missing_plz_df['id_postal_code'] = pd.Series(dtype='int8')
                        missing_plz_df['id_postal_code'] = [id for id in range(3489,(3489 + len(missing_plz_df)))]
                        print(missing_plz_df)
                        execute_values(conn, missing_plz_df, 'Postal_codes')

                        # Catch Postal Codes AGAIN due to amendment
                        cursor.execute((""" SELECT * FROM Postal_codes"""))
                        df_records = cursor.fetchall()
                        df_postal_codes = pd.DataFrame(df_records, dtype='int')
                        print(df_postal_codes)

                        # Mapping postal codes for electricity_production_plants
                        dict_postal_codes = df_postal_codes.set_index([1]).to_dict()[0]
                        df_selected.loc['id_postal_code'] = df_selected['id_postal_code'].map(dict_postal_codes)
                        #Null values
                        print(df_selected[df_selected['id_postal_code'].isnull() == True])
                        print('before:', len(df_selected))
                        df_selected.dropna(inplace=True)
                         #(id_postal_code)=(4147)
                        print('after:', len(df_selected))


                        # Add column Avg_monthly_production (from kw to kwh considering average occupancy)
                        df_selected.loc[:,'Avg_monthly_production_kwh']  = np.nan
                        for row in range(0, len(df_selected)):
                            if df_selected.loc[row, 'ID_sub_category'] == 3: #Hydro
                                df_selected.loc[row, 'Avg_monthly_production_kwh']\
                                    = (df_selected.loc[row, 'total_power'])*24*30*0.2922
                            elif df_selected.loc[row, 'ID_sub_category']  == 6: #Photovoltaic
                                df_selected.loc[row, 'Avg_monthly_production_kwh'] \
                                    = (df_selected.loc[row, 'total_power'])*24*30*0.1005
                            elif df_selected.loc[row, 'ID_sub_category']  == 1: #Biomass
                                df_selected.loc[row, 'Avg_monthly_production_kwh'] \
                                    = (df_selected.loc[row, 'total_power'])*24*30*0.2692
                            elif df_selected.loc[row, 'ID_sub_category']  == 8: #Wind energy
                                df_selected.loc[row, 'Avg_monthly_production_kwh'] \
                                    = (df_selected.loc[row, 'total_power'])*24*30*0.1875
                            else:
                                continue

                    except psycopg2.Error as e:
                        print (e)

                    # import to warehouse
                    execute_values(conn, df_selected_maincat, 'Main_categories_electricity_production')
                    execute_values(conn, df_selected_subcat, 'Sub_categories_electricity_production')
                    execute_values(conn, df_selected, 'Electricity_production_plants')

                elif records_list[table_name] == 'measuringpoints':
                    #Keep needed columns by index
                    df_selected = df.iloc[:,[0,1,18]]

                    #Name columns
                    df_selected.columns = ['Nr', 'Nr_measuring_point','Annual_average']

                    # import to warehouse
                    execute_values(conn, df_selected, 'Traffic_measurement_21')

                elif records_list[table_name] == 'chargingstations_static':
                    continue

                elif records_list[table_name] == 'chargingstations_locations':
                    continue

                else:
                    continue

    except (Exception, psycopg2.DatabaseError) as error:
        print(error)

    finally:
        if conn is not None:
            conn.close()

fetch_all_data()

['electricityproductionplants', 'measuringpoints', 'measuringpointscoordinates', 'electriccars21', 'ZIPCode', 'chargingstations_static', 'chargingstations_locations']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['ID_sub_category'] = df_selected['ID_sub_category'].map(dict_subcat)


Warehouse connected!
Empty DataFrame
Columns: [postal_code, Canton_abbreviation]
Index: []
Empty DataFrame
Columns: [postal_code, Canton_abbreviation, id_postal_code]
Index: []
Connected!
the dataframe is inserted
         0     1   2
0        1  5303  AG
1        2  6452  UR
2        3  8301  ZH
3        4  8707  ZH
4        5  6928  TI
...    ...   ...  ..
3519  3521  8079  ZH
3520  3522  8238  SH
3521  3523  7135  GR
3522  3524  4008  BS
3523  3525  6342  ZG

[3524 rows x 3 columns]
                xtf_id  id_postal_code Municipality Canton_abbreviation  \
id_postal_code     NaN             NaN          NaN                 NaN   

                ID_sub_category  total_power  
id_postal_code              NaN          NaN  
before: 141189
after: 141188
          xtf_id  id_postal_code Municipality Canton_abbreviation  \
0        14727.0          4147.0     Aesch BL                  BL   
1230     10930.0          4147.0        Aesch                  BL   
1898      9959.0          41

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

Connected!
the dataframe is inserted
Connected!
the dataframe is inserted
Connected!
Error: insert or update on table "electricity_production_plants" violates foreign key constraint "electricity_production_plants_id_postal_code_fkey"
DETAIL:  Key (id_postal_code)=(4147) is not present in table "postal_codes".

Connected!
the dataframe is inserted
Connected!
the dataframe is inserted


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Canton_abbreviation'] = df_selected['Canton_abbreviation'].map(dict_cantons)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Canton_abbreviation'] = df_selected['Canton_abbreviation'].astype(str)


Connected!
the dataframe is inserted


In [26]:
# Set up a connection to the warehouse
conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
          +" password="+ creds.PGPASSWORD

conn = psycopg2.connect(conn_string)
print("Connected!")

# Create a cursor object
cursor = conn.cursor()

cursor.execute((""" SELECT * FROM Postal_codes"""))
df_records = cursor.fetchall()
df_postal_codes = pd.DataFrame(df_records)
df_postal_codes
#df_postal_codes.info()
conn.close()

Connected!


In [27]:
# Get data from lake
# DB connection
conn_string = "host="+ creds_lake.PGHOST +" port="+ "5432" +" dbname="+ creds_lake.PGDATABASE +" user=" + creds_lake.PGUSER +" password="+ creds_lake.PGPASSWORD
conn = psycopg2.connect(conn_string)
cur = conn.cursor()

# Select table names
cur.execute("""SELECT * FROM electricityproductionplants""")
records = cur.fetchall()
df = pd.DataFrame(records)

# print(df)

df.columns = ['xtf_id', 'id_postal_code', 'Municipality', 'Canton_abbreviation','BoO', 'MC','ID_sub_category', 'IP','total_power']


df_selected = df.loc[:,['xtf_id', 'id_postal_code', 'Municipality', 'Canton_abbreviation','ID_sub_category', 'total_power']]

# Map subcategories
dict_subcat = {'Biomass' : 1,
               'Crude oil' : 2,
               'Hydroelectric power' : 3,
               'Natural gas' : 4,
               'Nuclear energy' : 5,
               'Photovoltaic' : 6,
               'Waste' : 7,
               'Wind energy' : 8}
df_selected.loc[:,['ID_sub_category']] = df_selected['ID_sub_category'].map(dict_subcat)

print(df_selected)

        xtf_id  id_postal_code Municipality Canton_abbreviation  \
0        14727            4147     Aesch BL                  BL   
1        14728            5082      Kaisten                  AG   
2        10164            3114    Wichtrach                  BE   
3        10794            3753          Oey                  BE   
4         9476            8340       Hinwil                  ZH   
...        ...             ...          ...                 ...   
141183   50526            6315    Oberägeri                  ZG   
141184   50527            6300          Zug                  ZG   
141185   51006            3638  Blumenstein                  BE   
141186   15984            4125       Riehen                  BS   
141187   10231            1862   Les Mosses                  VD   

        ID_sub_category  total_power  
0                     6        18.81  
1                     6         5.80  
2                     6         3.00  
3                     6         8.40  


In [33]:
# missing postal codes for import
missing_plz_df = df[['id_postal_code','Canton_abbreviation']][df['id_postal_code'].isin(df_postal_codes[1]) == False]
# correct column names
missing_plz_df.rename(columns= {'id_postal_code' : 'postal_code'}, inplace=True)
# Add primary key column
missing_plz_df['id_postal_code'] = ''
missing_plz_df['id_postal_code'] = [id for id in range((max(df_postal_codes[0])+1),
                                                        ((max(df_postal_codes[0])+1) + len(missing_plz_df)))]

print(missing_plz_df)
execute_values(conn, missing_plz_df, 'Postal_codes')

Empty DataFrame
Columns: [postal_code, Canton_abbreviation, id_postal_code]
Index: []
Connected!
the dataframe is inserted


In [34]:
#print(df_postal_codes.info())
# Mapping
dict_postal_codes = df_postal_codes.set_index([1]).to_dict()[0]
#print(df_postal_codes)
#NA values
print(df_selected['id_postal_code'][df_selected['id_postal_code'].isna() == True])

0        NaN
1        NaN
3        NaN
4        NaN
5        NaN
          ..
141182   NaN
141183   NaN
141184   NaN
141185   NaN
141186   NaN
Name: id_postal_code, Length: 87923, dtype: float64


In [35]:
#Map
df_selected.loc[:,['id_postal_code']] = df_selected['id_postal_code'].map(dict_postal_codes)
#NA values
print(df_selected[df_selected['id_postal_code'].isna() == True])

        xtf_id  id_postal_code Municipality Canton_abbreviation  \
0        14727             NaN     Aesch BL                  BL   
1        14728             NaN      Kaisten                  AG   
3        10794             NaN          Oey                  BE   
4         9476             NaN       Hinwil                  ZH   
5        14729             NaN     Küsnacht                  ZH   
...        ...             ...          ...                 ...   
141182   48373             NaN   Winterthur                  ZH   
141183   50526             NaN    Oberägeri                  ZG   
141184   50527             NaN          Zug                  ZG   
141185   51006             NaN  Blumenstein                  BE   
141186   15984             NaN       Riehen                  BS   

        ID_sub_category  total_power  
0                     6        18.81  
1                     6         5.80  
3                     6         8.40  
4                     6         4.80  


In [15]:
id_list = list(df_selected['xtf_id'][df_selected['id_postal_code'].isna() == True]) # list with id's of null values
print(id_list)
print(df[df['xtf_id'].isin(id_list)])

[]
Empty DataFrame
Columns: [xtf_id, id_postal_code, Municipality, Canton_abbreviation, BoO, MC, ID_sub_category, IP, total_power]
Index: []


In [16]:
plz_list = list(df['id_postal_code'][df['xtf_id'].isin(id_list)])
plz_list

[]

In [36]:
#print(df_selected['id_postal_code'].isnull())
print('before:', len(df_selected))
df_selected.loc[:,['id_postal_code']].dropna(axis = 0)
 #(id_postal_code)=(4147)
print('after:', len(df_selected))
#df_selected.loc['id_postal_code'] = df_selected['id_postal_code'].astype('int8')

before: 141188
after: 141188


In [37]:
print(df_selected['id_postal_code'].astype(int))

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [38]:
print(dict_postal_codes) # dict seems fine

{5303: 1, 6452: 662, 8301: 3, 8707: 4, 6928: 5, 3045: 6, 4106: 7, 1215: 8, 7276: 9, 4144: 10, 5412: 11, 5210: 12, 8331: 13, 5504: 14, 9201: 15, 1288: 16, 8081: 17, 5317: 18, 8752: 19, 6657: 20, 1782: 21, 8045: 22, 1077: 23, 3672: 24, 4411: 25, 6670: 26, 7212: 27, 4613: 28, 6908: 29, 7455: 30, 3506: 31, 8842: 32, 4055: 33, 6513: 34, 8523: 35, 9471: 36, 3930: 37, 3633: 38, 7106: 39, 3602: 40, 6145: 41, 7182: 42, 8886: 43, 1899: 44, 6383: 45, 7168: 46, 4244: 47, 9475: 48, 7202: 49, 3014: 50, 5706: 51, 5636: 52, 1055: 53, 7180: 54, 1117: 55, 1607: 56, 8637: 57, 1028: 58, 8713: 59, 6804: 60, 6780: 61, 6402: 62, 5644: 63, 3036: 64, 1580: 65, 5242: 66, 4314: 67, 1996: 68, 6052: 69, 8475: 70, 8124: 71, 4304: 72, 6577: 73, 6475: 74, 1589: 75, 3792: 76, 4229: 77, 1945: 78, 1627: 79, 3600: 80, 6454: 81, 7451: 82, 5326: 83, 1377: 84, 4584: 85, 6514: 86, 9622: 87, 3904: 88, 3801: 89, 8612: 90, 8225: 91, 1724: 92, 7502: 93, 7460: 94, 4652: 95, 3071: 96, 1673: 97, 5314: 98, 8314: 99, 4463: 100, 4003:

In [81]:
print(df_selected['id_postal_code'][df_selected['id_postal_code'] <= 100]) # check for low index numbers

12        61.0
20        61.0
45        61.0
47        61.0
108       16.0
          ... 
140887    76.0
140890    76.0
140910    22.0
141066    54.0
141114    46.0
Name: id_postal_code, Length: 4234, dtype: float64


In [82]:
print(df_selected['id_postal_code'][df_selected['id_postal_code'].isna() == True])

2556     NaN
3671     NaN
12590    NaN
20011    NaN
22558    NaN
27376    NaN
40350    NaN
46347    NaN
46502    NaN
56149    NaN
64240    NaN
68431    NaN
71347    NaN
80699    NaN
84389    NaN
95735    NaN
97374    NaN
97738    NaN
102310   NaN
107549   NaN
111903   NaN
113740   NaN
114456   NaN
116358   NaN
117230   NaN
124611   NaN
127493   NaN
131450   NaN
135767   NaN
136608   NaN
136652   NaN
136875   NaN
138181   NaN
138248   NaN
139056   NaN
140529   NaN
140968   NaN
Name: id_postal_code, dtype: float64


In [59]:
df_selected['id_postal_code']

0         2071.0
1          349.0
2          513.0
3         1551.0
4         1824.0
           ...  
141183     328.0
141184    3139.0
141185    2424.0
141186     452.0
141187    3360.0
Name: id_postal_code, Length: 141188, dtype: float64

In [4]:
# Access specific row in column
row = 4
df_selected.loc[[row], ['ID_sub_category']]

Unnamed: 0,ID_sub_category
4,6
