In [1]:
import mysql
from mysql.connector import MySQLConnection, Error
from python_mysql_dbconfig import read_db_config # Read database configuration file and return a dictionary database object

In [6]:
def create_msql_database(name_database):
    """Function create MYSQL database to save into db config file"""
    mydb = mysql.connector.connect(host='localhost',
                                      user='root',
                                      password='root')
    
    mycursor = mydb.cursor()
    mycursor.execute("SHOW DATABASES")
    databases = [x[0] for x in mycursor]
    if name_database not in databases:
        mycursor.execute(f"CREATE DATABASE IF NOT EXISTS {name_database}")
    else:
        print("Database exists")
    mycursor.close()
    
def create_msql_table(query_create):
    """
    Execute query to create table
    :query_create:: to create table
    """

    conn = None
    try:
        db_config = read_db_config() #Using configuration file to connect to database
        print('Connecting to MYSQL database...')
        conn = MySQLConnection(**db_config)
        print('Connected to MYSQL database')
        cursor = conn.cursor()
        print('Creating Mysql table...')
        cursor.execute(query_create)
        print('Table created')
    except Error as e:
        print("Error: ", e)

    finally:
        cursor.close()
        conn.close()

def insert_msql_table(file_dir, table_name):
    """execute query to insert tsv file into table
    :file_dir:: file path
    :table_name:: name of table to be insterted
    :skip_header:: use header 
    """

    conn = None
    try:
        db_config = read_db_config() #Using configuration file to connect to database
        print('Connecting to MYSQL database...')
        conn = MySQLConnection(**db_config)
        print('Connected to MYSQL database')

        cursor = conn.cursor()
        print('creating MYSQL table...')
        load_data_query = f"""LOAD DATA INFILE '{file_dir}' INTO TABLE {table_name}
                                FIELDS TERMINATED BY '\t'
                                LINES TERMINATED BY '\n'
                                IGNORE 1 ROWS
                                ON DUPLICATE KEY UPDATE
                                    net_name = VALUES(net_name),
                                    cidr = VALUES(cidr),
                                    net_range = VALUES(net_range),
                                    net_type = VALUES(net_type),
                                    asn = VALUES(asn),
                                    org_cust_id = VALUES(org_cust_id),
                                    updated = VALUES(updated)
                                (@net_id, @net_name, @cidr, @net_range, @net_type, @asn, @org_cust_id, @updated)
                                SET
                                    net_id = NULLIF(@net_id, ' '),
                                    net_name = NULLIF(@net_name, ' '),
                                    cidr = NULLIF(@cidr, ' '),
                                    net_range = NULLIF(@net_range, ' '),
                                    net_type = NULLIF(@net_type, ' '),
                                    asn = NULLIF(@asn, ' '),
                                    org_cust_id = NULLIF(@org_cust_id, ' '),
                                    updated = NULLIF(@updated, ' ')"""

#(@net_id, @net_name, @cidr, @net_range, @net_type, @asn, @org_cust_id, @updated)\          
        cursor.execute(load_data_query)

        conn.commit()
        print('MYSQL table created')

    except Error as e:
        print("Error: ", e)

    finally:
        cursor.close()
        conn.close()



In [7]:

# Create the table (if it doesn't exist)
network_table = "CREATE TABLE network (net_id VARCHAR(255) UNIQUE, \
                                            net_name VARCHAR(255), cidr VARCHAR(255) UNIQUE, \
                                                net_range VARCHAR(255), net_type VARCHAR(255), \
                                                    asn VARCHAR(255), org_cust_id VARCHAR(255), updated VARCHAR(255));"
organization_table = "CREATE TABLE organization (org_id VARCHAR(255) UNIQUE, \
                                                            org_name VARCHAR(255), updated VARCHAR(255));"
customer_table = "CREATE TABLE customer (cust_id VARCHAR(255) UNIQUE, \
                                                            cust_name VARCHAR(255), updated VARCHAR(255));"

In [35]:
########
create_msql_database('ThinkCX')
create_msql_table(network_table)
#create_msql_table(organization_table)
#create_msql_table(customer_table)

Database exists
Connecting to MYSQL database...
Connected to MYSQL database
Creating Mysql table...
Error:  1050 (42S01): Table 'network' already exists
Connecting to MYSQL database...
Connected to MYSQL database
Creating Mysql table...
Error:  1050 (42S01): Table 'organization' already exists
Connecting to MYSQL database...
Connected to MYSQL database
Creating Mysql table...
Table created


In [9]:
insert_msql_table('/Users/hople/working_folder/Bootcamp_practices/SQL_practice/arin_data_sets/arin_net.tsv', 'network')

Connecting to MYSQL database...
Connected to MYSQL database
creating MYSQL table...
Error:  1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'ON DUPLICATE KEY UPDATE
                                    net_name = VALUES(ne' at line 6


In [1]:
import pandas as pd
df_net = pd.read_csv('./arin_data_sets/arin_net.tsv', sep='\t')
df_org = pd.read_csv('./arin_data_sets/arin_org.tsv', sep='\t', encoding='latin1')
df_cust = pd.read_csv('./arin_data_sets/arin_cust.tsv', sep='\t', encoding='latin1')

#drop dulicate and update new values when there is duplicate in net_id and cidr
tempt_df = df_net[df_net.duplicated(subset=['net_id'], keep=False)]

for _, row in tempt_df.iterrows():
    duplicate_mask = df_net['net_id'] == row['net_id']
    df_net.loc[duplicate_mask, ['net_name', 'cidr', 'net_range', 'net_type', 'asn', 'org_cust_id']] = row[['net_name', 'cidr', 'net_range', 'net_type', 'asn', 'org_cust_id']]

drop_netid_dup_df = df_net.drop_duplicates(subset=['net_id'])

#drop dulicate and update new values when there is duplicate in net_id and cidr
tempt_df = drop_netid_dup_df[drop_netid_dup_df.duplicated(subset=['cidr'], keep=False)]

for _, row in tempt_df.iterrows():
    duplicate_mask = drop_netid_dup_df['cidr'] == row['cidr']
    drop_cidr_dup_df = drop_netid_dup_df.copy()
    drop_cidr_dup_df.loc[duplicate_mask, ['net_id', 'net_name', 'net_range', 'net_type', 'asn', 'org_cust_id']] = row[['net_id', 'net_name', 'net_range', 'net_type', 'asn', 'org_cust_id']]

drop_cidr_dup_df = drop_cidr_dup_df.drop_duplicates(subset=['cidr'])

In [3]:
drop_cidr_dup_df.shape

(64306, 7)

In [2]:
len(df_net), len(df_org), len(df_cust), len(drop_cidr_dup_df)

(66520, 6325, 43238)

In [3]:
df_net

Unnamed: 0,net_id,net_name,cidr,net_range,net_type,asn,org_cust_id
0,net-104-129-224-0-1,aece-net,104.129.224.0/20,104.129.224.0 - 104.129.239.255,direct assignment,as00000,ai-335
1,net-104-129-96-0-1,xplornet-015,104.129.96.0/19,104.129.96.0 - 104.129.127.255,direct allocation,as22995,barre-2
2,net-104-157-108-0-1,telus-fibre-clgrab22,104.157.108.0/22,104.157.108.0 - 104.157.111.255,reassigned,as852,c07972290
3,net-104-157-79-0-1,telus-hsia-ftmmab3,104.157.79.0/26,104.157.79.0 - 104.157.79.63,reassigned,as852,c06849631
4,net-104-171-48-0-1,cikcable,104.171.48.0/20,104.171.48.0 - 104.171.63.255,direct allocation,"as54614, as174, as6949",cikte
...,...,...,...,...,...,...,...
66515,net-99-254-120-0-1,hsi,99.254.120.0/23,99.254.120.0 - 99.254.121.255,reassigned,,c02172893
66516,net-99-255-10-0-1,hsi,99.255.10.0/23,99.255.10.0 - 99.255.11.255,reassigned,,c02171770
66517,net-99-255-228-0-1,hsi,99.255.228.0/23,99.255.228.0 - 99.255.229.255,reassigned,,c02174354
66518,net-99-255-230-0-1,hsi,99.255.230.0/23,99.255.230.0 - 99.255.231.255,reassigned,,c02174355


In [4]:
df_org

Unnamed: 0,org_id,org_name
0,807cit,807-city
1,aad-2,aad
2,ablcan-1,abl canada inc.
3,adcogn,adcognito inc.
4,aedint,aed internet inc.
...,...,...
6320,xplor-2,xplornet limited
6321,xtc-5,xbase technologies corp.
6322,youngm,young monkey
6323,zenla-7,zenlayer inc


In [5]:
df_cust

Unnamed: 0,cust_id,cust_name
0,c00001342,canada ports corporation
1,c00001343,canada ports corporation
2,c00001888,stentor canadian network management
3,c00001991,weldwood of canada limited
4,c00002827,mcmillan bathurst
...,...,...
43233,c08030323,telus-fibre-schlbc01
43234,c08030382,telus-fibre-clgrab09
43235,c08030568,telus-dsl-srrybc01
43236,c08030664,telus-fibre-klwnbc02


In [8]:
#Number of CIDRs for the following org_cust_id: "bdio"

filtered_net = df_net[df_net['org_cust_id'] == "bdio"]
filtered_net.shape

(4286, 7)

In [9]:
#Number of CIDRs where the following value is in the asn field: "as852"

filtered_asn = df_net[df_net['asn'].str.contains("as852", na=False)]
filtered_asn.shape

(2418, 7)

In [12]:
#Provide list of top 10 organization names ranked in descending order of number of CIDRs
# (ties should be broken by ascending alphabetic order)

net_org_df = pd.merge(df_net, df_org, how='left', left_on='org_cust_id', right_on='org_id')