In [7]:
import csv
import sqlite3
import os
from datetime import datetime

# Path to your CSV
csv_file_path = r"D:/Other/1- Masters Data/2- Big Data Course/project/mimic-iii-clinical-database-demo-1.4/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv"

# Extract the CSV file name without extension
table_name = os.path.splitext(os.path.basename(csv_file_path))[0]

# Connect to SQLite database (this will create a file-based database)
conn = sqlite3.connect('mimiciii.db')  # Database will be saved as 'mimiciii.db'
cursor = conn.cursor()

# Function to infer column data type based on sample values
def infer_data_type(sample_values):
    # Try to match DATE (common formats like YYYY-MM-DD, MM/DD/YYYY)
    date_formats = ["%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"]
    for val in sample_values:
        if isinstance(val, str):
            for fmt in date_formats:
                try:
                    datetime.strptime(val, fmt)
                    return 'DATE'
                except ValueError:
                    continue
    
    # Try to detect BOOLEAN (True, False, 1, 0, yes, no, etc.)
    boolean_values = {"true", "false", "1", "0", "yes", "no"}
    for val in sample_values:
        if val.lower() in boolean_values:
            return 'BOOLEAN'

    # Check if it's an integer
    for val in sample_values:
        try:
            int(val)
            return 'INTEGER'
        except ValueError:
            pass

    # Check if it's a float
    for val in sample_values:
        try:
            float(val)
            return 'REAL'
        except ValueError:
            pass

    # If it can't be identified as numeric or boolean, default to TEXT
    return 'TEXT'


# Read the CSV file to detect schema
with open(csv_file_path, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    columns = reader.fieldnames
    # Create a dictionary to store data types for each column
    column_types = {}

    # Sample each column (use the first few rows) to detect types
    sample_rows = [next(reader) for _ in range(10)]  # Get 10 sample rows from the file
    for column in columns:
        # Get the sample values for each column
        sample_values = [row[column] for row in sample_rows if row[column] != '']
        
        # Infer the data type for the column based on the sample values
        if sample_values:
            column_types[column] = infer_data_type(sample_values)
        else:
            column_types[column] = 'TEXT'  # Default to TEXT if no data in the sample

    # Create table with columns and inferred data types
    create_table_query = f"CREATE TABLE {table_name} ({', '.join([col + ' ' + column_types[col] for col in columns])});"
    cursor.execute(create_table_query)

    # Reset the reader to start inserting data
    csvfile.seek(0)
    reader = csv.DictReader(csvfile)
    
    # Insert data into SQLite database
    for row in reader:
        placeholders = ', '.join(['?' for _ in row])
        insert_query = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders});"
        cursor.execute(insert_query, tuple(row.values()))

# Commit and check if successful
conn.commit()

# Now to inspect the schema of the created table
cursor.execute(f"PRAGMA table_info({table_name});")
schema = cursor.fetchall()

# Print the schema
print(f"Schema of '{table_name}' table:")
for column in schema:
    print(column)

# Close the connection
conn.close()

Schema of 'ADMISSIONS' table:
(0, 'row_id', 'INTEGER', 0, None, 0)
(1, 'subject_id', 'INTEGER', 0, None, 0)
(2, 'hadm_id', 'INTEGER', 0, None, 0)
(3, 'admittime', 'TEXT', 0, None, 0)
(4, 'dischtime', 'TEXT', 0, None, 0)
(5, 'deathtime', 'TEXT', 0, None, 0)
(6, 'admission_type', 'TEXT', 0, None, 0)
(7, 'admission_location', 'TEXT', 0, None, 0)
(8, 'discharge_location', 'TEXT', 0, None, 0)
(9, 'insurance', 'TEXT', 0, None, 0)
(10, 'language', 'TEXT', 0, None, 0)
(11, 'religion', 'TEXT', 0, None, 0)
(12, 'marital_status', 'TEXT', 0, None, 0)
(13, 'ethnicity', 'TEXT', 0, None, 0)
(14, 'edregtime', 'TEXT', 0, None, 0)
(15, 'edouttime', 'TEXT', 0, None, 0)
(16, 'diagnosis', 'TEXT', 0, None, 0)
(17, 'hospital_expire_flag', 'BOOLEAN', 0, None, 0)
(18, 'has_chartevents_data', 'BOOLEAN', 0, None, 0)


In [10]:
import csv
import sqlite3
import os
from datetime import datetime

# Folder path to your CSV files
folder_path = r"D:/Other/1- Masters Data/2- Big Data Course/project/mimic-iii-clinical-database-demo-1.4/mimic-iii-clinical-database-demo-1.4"

# Function to infer column data type based on sample values
def infer_data_type(sample_values):
    # Try to match DATE (common formats like YYYY-MM-DD, MM/DD/YYYY)
    date_formats = ["%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"]
    for val in sample_values:
        if isinstance(val, str):
            for fmt in date_formats:
                try:
                    datetime.strptime(val, fmt)
                    return 'DATE'
                except ValueError:
                    continue
    
    # Try to detect BOOLEAN (True, False, 1, 0, yes, no, etc.)
    boolean_values = {"true", "false", "1", "0", "yes", "no"}
    for val in sample_values:
        if val.lower() in boolean_values:
            return 'BOOLEAN'

    # Check if it's an integer
    for val in sample_values:
        try:
            int(val)
            return 'INTEGER'
        except ValueError:
            pass

    # Check if it's a float
    for val in sample_values:
        try:
            float(val)
            return 'REAL'
        except ValueError:
            pass

    # If it can't be identified as numeric or boolean, default to TEXT
    return 'TEXT'


# Loop through all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        # Get the full file path
        csv_file_path = os.path.join(folder_path, filename)
        
        # Extract the table name (using the file name without the .csv extension)
        table_name = os.path.splitext(filename)[0]

        # Connect to SQLite database (this will create a file-based database)
        conn = sqlite3.connect('mimiciii.db')  # Database will be saved as 'mimiciii.db'
        cursor = conn.cursor()

        # Read the CSV file to detect schema
        with open(csv_file_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            columns = reader.fieldnames
            # Create a dictionary to store data types for each column
            column_types = {}

            # Get all rows and sample from them (use the first 10 or fewer if the file has less)
            all_rows = list(reader)
            sample_rows = all_rows[:10] if len(all_rows) >= 10 else all_rows  # Sample at most 10 rows
            for column in columns:
                # Get the sample values for each column
                sample_values = [row[column] for row in sample_rows if row[column] != '']
                
                # Infer the data type for the column based on the sample values
                if sample_values:
                    column_types[column] = infer_data_type(sample_values)
                else:
                    column_types[column] = 'TEXT'  # Default to TEXT if no data in the sample

            # Create table with columns and inferred data types (Only if it doesn't already exist)
            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join([col + ' ' + column_types[col] for col in columns])});"
            cursor.execute(create_table_query)

            # Reset the reader to start inserting data
            csvfile.seek(0)
            reader = csv.DictReader(csvfile)
            
            # Insert data into SQLite database
            for row in reader:
                placeholders = ', '.join(['?' for _ in row])
                insert_query = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders});"
                cursor.execute(insert_query, tuple(row.values()))

        # Commit and check if successful
        conn.commit()

        # Now to inspect the schema of the created table
        cursor.execute(f"PRAGMA table_info({table_name});")
        schema = cursor.fetchall()

        # Print the schema
        print(f"Schema of '{table_name}' table:")
        for column in schema:
            print(column)

        # Close the connection
        conn.close()

Schema of 'ADMISSIONS' table:
(0, 'row_id', 'INTEGER', 0, None, 0)
(1, 'subject_id', 'INTEGER', 0, None, 0)
(2, 'hadm_id', 'INTEGER', 0, None, 0)
(3, 'admittime', 'TEXT', 0, None, 0)
(4, 'dischtime', 'TEXT', 0, None, 0)
(5, 'deathtime', 'TEXT', 0, None, 0)
(6, 'admission_type', 'TEXT', 0, None, 0)
(7, 'admission_location', 'TEXT', 0, None, 0)
(8, 'discharge_location', 'TEXT', 0, None, 0)
(9, 'insurance', 'TEXT', 0, None, 0)
(10, 'language', 'TEXT', 0, None, 0)
(11, 'religion', 'TEXT', 0, None, 0)
(12, 'marital_status', 'TEXT', 0, None, 0)
(13, 'ethnicity', 'TEXT', 0, None, 0)
(14, 'edregtime', 'TEXT', 0, None, 0)
(15, 'edouttime', 'TEXT', 0, None, 0)
(16, 'diagnosis', 'TEXT', 0, None, 0)
(17, 'hospital_expire_flag', 'BOOLEAN', 0, None, 0)
(18, 'has_chartevents_data', 'BOOLEAN', 0, None, 0)
Schema of 'CALLOUT' table:
(0, 'row_id', 'INTEGER', 0, None, 0)
(1, 'subject_id', 'INTEGER', 0, None, 0)
(2, 'hadm_id', 'INTEGER', 0, None, 0)
(3, 'submit_wardid', 'INTEGER', 0, None, 0)
(4, 'submit_

OperationalError: table PATIENTS has no column named gender

In [11]:
import csv
import sqlite3
import os
from datetime import datetime

# Folder path to your CSV files
folder_path = r"D:/Other/1- Masters Data/2- Big Data Course/project/mimic-iii-clinical-database-demo-1.4/mimic-iii-clinical-database-demo-1.4"

# Function to infer column data type based on sample values
def infer_data_type(sample_values):
    # Try to match DATE (common formats like YYYY-MM-DD, MM/DD/YYYY)
    date_formats = ["%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"]
    for val in sample_values:
        if isinstance(val, str):
            for fmt in date_formats:
                try:
                    datetime.strptime(val, fmt)
                    return 'DATE'
                except ValueError:
                    continue
    
    # Try to detect BOOLEAN (True, False, 1, 0, yes, no, etc.)
    boolean_values = {"true", "false", "1", "0", "yes", "no"}
    for val in sample_values:
        if val.lower() in boolean_values:
            return 'BOOLEAN'

    # Check if it's an integer
    for val in sample_values:
        try:
            int(val)
            return 'INTEGER'
        except ValueError:
            pass

    # Check if it's a float
    for val in sample_values:
        try:
            float(val)
            return 'REAL'
        except ValueError:
            pass

    # If it can't be identified as numeric or boolean, default to TEXT
    return 'TEXT'


# Loop through all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        # Get the full file path
        csv_file_path = os.path.join(folder_path, filename)
        
        # Extract the table name (using the file name without the .csv extension)
        table_name = os.path.splitext(filename)[0]

        # Connect to SQLite database (this will create a file-based database)
        conn = sqlite3.connect('mimiciii.db')  # Database will be saved as 'mimiciii.db'
        cursor = conn.cursor()

        # Read the CSV file to detect schema
        with open(csv_file_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            columns = reader.fieldnames
            # Create a dictionary to store data types for each column
            column_types = {}

            # Get all rows and sample from them (use the first 10 or fewer if the file has less)
            all_rows = list(reader)
            sample_rows = all_rows[:10] if len(all_rows) >= 10 else all_rows  # Sample at most 10 rows
            for column in columns:
                # Get the sample values for each column
                sample_values = [row[column] for row in sample_rows if row[column] != '']
                
                # Infer the data type for the column based on the sample values
                if sample_values:
                    column_types[column] = infer_data_type(sample_values)
                else:
                    column_types[column] = 'TEXT'  # Default to TEXT if no data in the sample

            # Create table with columns and inferred data types (Only if it doesn't already exist)
            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join([col + ' ' + column_types[col] for col in columns])});"
            cursor.execute(create_table_query)

            # Checking and adding missing columns dynamically
            cursor.execute(f"PRAGMA table_info({table_name});")
            existing_columns = [column[1] for column in cursor.fetchall()]
            missing_columns = [col for col in columns if col not in existing_columns]
            
            if missing_columns:
                for column in missing_columns:
                    # Infer the data type for the missing column from the CSV sample
                    column_data_type = infer_data_type([row[column] for row in sample_rows if row[column] != ''])
                    alter_query = f"ALTER TABLE {table_name} ADD COLUMN {column} {column_data_type};"
                    cursor.execute(alter_query)

            # Now insert data into the table
            csvfile.seek(0)  # Reset to start inserting data
            reader = csv.DictReader(csvfile)
            for row in reader:
                placeholders = ', '.join(['?' for _ in row])
                insert_query = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders});"
                cursor.execute(insert_query, tuple(row.values()))

        # Commit and check if successful
        conn.commit()

        # Now to inspect the schema of the created table
        cursor.execute(f"PRAGMA table_info({table_name});")
        schema = cursor.fetchall()

        # Print the schema
        print(f"Schema of '{table_name}' table:")
        for column in schema:
            print(column)

        # Close the connection
        conn.close()


Schema of 'ADMISSIONS' table:
(0, 'row_id', 'INTEGER', 0, None, 0)
(1, 'subject_id', 'INTEGER', 0, None, 0)
(2, 'hadm_id', 'INTEGER', 0, None, 0)
(3, 'admittime', 'TEXT', 0, None, 0)
(4, 'dischtime', 'TEXT', 0, None, 0)
(5, 'deathtime', 'TEXT', 0, None, 0)
(6, 'admission_type', 'TEXT', 0, None, 0)
(7, 'admission_location', 'TEXT', 0, None, 0)
(8, 'discharge_location', 'TEXT', 0, None, 0)
(9, 'insurance', 'TEXT', 0, None, 0)
(10, 'language', 'TEXT', 0, None, 0)
(11, 'religion', 'TEXT', 0, None, 0)
(12, 'marital_status', 'TEXT', 0, None, 0)
(13, 'ethnicity', 'TEXT', 0, None, 0)
(14, 'edregtime', 'TEXT', 0, None, 0)
(15, 'edouttime', 'TEXT', 0, None, 0)
(16, 'diagnosis', 'TEXT', 0, None, 0)
(17, 'hospital_expire_flag', 'BOOLEAN', 0, None, 0)
(18, 'has_chartevents_data', 'BOOLEAN', 0, None, 0)
Schema of 'CALLOUT' table:
(0, 'row_id', 'INTEGER', 0, None, 0)
(1, 'subject_id', 'INTEGER', 0, None, 0)
(2, 'hadm_id', 'INTEGER', 0, None, 0)
(3, 'submit_wardid', 'INTEGER', 0, None, 0)
(4, 'submit_

In [1]:
%pip install pyhive

Collecting pyhive
  Downloading PyHive-0.7.0.tar.gz (46 kB)
     ---------------------------------------- 0.0/46.5 kB ? eta -:--:--
     -------------------------- ------------- 30.7/46.5 kB 1.4 MB/s eta 0:00:01
     --------------------------------- ---- 41.0/46.5 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 46.5/46.5 kB 385.5 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting future (from pyhive)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Downloading future-1.0.0-py3-none-any.whl (491 kB)
   ---------------------------------------- 0.0/491.3 kB ? eta -:--:--
    --------------------------------------- 10.2/491.3 kB ? eta -:--:--
   ----- ---------------------------------- 61.4/491.3 kB 1.1 MB/s eta 0:00:01
   --------- ---------------------------- 122.9/491.3 kB 901.1 kB/s eta 0:00:01
   ---------------- ----------------------- 204.8/491.3 kB 1.1 MB/s eta 0:00:01

In [3]:
%pip install thrift

Collecting thrift
  Downloading thrift-0.21.0.tar.gz (62 kB)
     ---------------------------------------- 0.0/62.5 kB ? eta -:--:--
     ------ --------------------------------- 10.2/62.5 kB ? eta -:--:--
     ------------------ ------------------- 30.7/62.5 kB 325.1 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/62.5 kB 372.4 kB/s eta 0:00:01
     -------------------------------------- 62.5/62.5 kB 418.2 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py): started
  Building wheel for thrift (setup.py): finished with status 'done'
  Created wheel for thrift: filename=thrift-0.21.0-py3-none-any.whl size=157418 sha256=f9364e027f689592e72f9415dad2cd89662ef0dd17ec69cfe01948143f6a7eb8
  Stored in directory: c:\users\ahmed.nabawi\appdata\local\pip\cache\wheels\a4\d6\df\5863f830eabcbc85e1937583ab0aea7d653625d5006b27a780
S

In [5]:
%pip install thrift_sasl

Collecting thrift_sasl
  Downloading thrift_sasl-0.4.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pure-sasl>=0.6.2 (from thrift_sasl)
  Downloading pure-sasl-0.6.2.tar.gz (11 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading thrift_sasl-0.4.3-py2.py3-none-any.whl (8.3 kB)
Building wheels for collected packages: pure-sasl
  Building wheel for pure-sasl (setup.py): started
  Building wheel for pure-sasl (setup.py): finished with status 'done'
  Created wheel for pure-sasl: filename=pure_sasl-0.6.2-py3-none-any.whl size=11439 sha256=7de219a52e12d8976d2f80be6373dceb6a7c391d4de74d8724e1283d1238e754
  Stored in directory: c:\users\ahmed.nabawi\appdata\local\pip\cache\wheels\af\5e\ca\57ff2c5801d038e3d8b227a4fb492cd84e43a535d64a06f3f2
Successfully built pure-sasl
Installing collected packages: pure-sasl, thrift_sasl
Successfully installed pure-sasl-0.6.2 thrift_sasl-0.4.3
Note: you may need to restart the kernel to use 

In [None]:
from pyhive import hive
import csv
import os
from datetime import datetime
import thrift_sasl



# Folder path to your CSV files
folder_path = r"D:/Other/1- Masters Data/2- Big Data Course/project/mimic-iii-clinical-database-demo-1.4/mimic-iii-clinical-database-demo-1.4"

# Function to infer column data type based on sample values
def infer_data_type(sample_values):
    date_formats = ["%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"]
    for val in sample_values:
        if isinstance(val, str):
            for fmt in date_formats:
                try:
                    datetime.strptime(val, fmt)
                    return 'DATE'
                except ValueError:
                    continue
    
    boolean_values = {"true", "false", "1", "0", "yes", "no"}
    for val in sample_values:
        if val.lower() in boolean_values:
            return 'BOOLEAN'

    for val in sample_values:
        try:
            int(val)
            return 'INT'
        except ValueError:
            pass

    for val in sample_values:
        try:
            float(val)
            return 'FLOAT'
        except ValueError:
            pass

    return 'STRING'


# Hive connection setup

conn = hive.connect(host='your_hive_host', port=10000, username='hi')
# Hive connection setup#

# Loop through all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(folder_path, filename)
        table_name = os.path.splitext(filename)[0]

        # Open CSV file to read columns and sample data
        with open(csv_file_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            columns = reader.fieldnames
            sample_rows = [row for row in list(reader)[:10]]  # Sample at most 10 rows
            column_types = {}

            # Infer column data types
            for column in columns:
                sample_values = [row[column] for row in sample_rows if row[column] != '']
                if sample_values:
                    column_types[column] = infer_data_type(sample_values)
                else:
                    column_types[column] = 'STRING'  # Default to STRING if no data in the sample

            # Construct CREATE TABLE query
            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ("
            create_table_query += ", ".join([f"{col} {column_types[col]}" for col in columns]) + ")"
            cursor.execute(create_table_query)

            # Load data into Hive
            load_data_query = f"LOAD DATA LOCAL INPATH '{csv_file_path}' INTO TABLE {table_name}"
            cursor.execute(load_data_query)

        print(f"Table '{table_name}' created and data loaded successfully.")
    
cursor.close()
conn.close()


ModuleNotFoundError: No module named 'thrift_sasl'