In [6]:
from pyhive import hive

# Load /data/Cancer.csv into the Hive table cancer_db.cancer_table and print
# the first ten rows as a sanity check.
#
# The connection and cursor are released in `finally` blocks so that a failing
# statement (missing file, permission error, bad DDL, ...) does not leave a
# HiveServer2 session open.

# 1. Connect to HiveServer2.
# Note: We connect initially to the 'default' database so that we can create
# our target database if needed.
conn = hive.Connection(host='hive-server', port=10000, username='hive', database='default')
try:
    cursor = conn.cursor()
    try:
        # 2. Create the target database and switch to it.
        cursor.execute("CREATE DATABASE IF NOT EXISTS cancer_db")
        cursor.execute("USE cancer_db")

        # 3. Drop the table if it already exists so the schema below is
        #    always the one in effect.
        cursor.execute("DROP TABLE IF EXISTS cancer_table")

        # 4. Create the table with columns matching the CSV.
        # Adjust the data types as needed. Here, patient_id and level are
        # STRING and every other column is INT. The table property makes Hive
        # skip the first line of the file (the CSV header) when reading.
        cursor.execute("""
  CREATE TABLE cancer_table (
    patient_id STRING,
    age INT,
    gender INT,
    air_pollution INT,
    alcohol_use INT,
    dust_allergy INT,
    occupational_hazards INT,
    genetic_risk INT,
    chronic_lung_disease INT,
    balanced_diet INT,
    obesity INT,
    smoking INT,
    passive_smoker INT,
    chest_pain INT,
    coughing_of_blood INT,
    fatigue INT,
    weight_loss INT,
    shortness_of_breath INT,
    wheezing INT,
    swallowing_difficulty INT,
    clubbing_of_finger_nails INT,
    frequent_cold INT,
    dry_cough INT,
    snoring INT,
    level STRING
  )
  ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  STORED AS TEXTFILE
  TBLPROPERTIES ("skip.header.line.count"="1")
""")

        # 5. Load the data into the table.
        # The LOCAL keyword makes Hive read the file from a local filesystem
        # instead of HDFS.
        # NOTE(review): when the statement is sent through HiveServer2, the
        # LOCAL path is presumably resolved on the HiveServer2 host, not on
        # the machine running this script -- confirm that /data/Cancer.csv
        # exists there.
        cursor.execute("""
  LOAD DATA LOCAL INPATH '/data/Cancer.csv'
  OVERWRITE INTO TABLE cancer_table
""")

        print("Data loaded successfully into cancer_db.cancer_table.")

        # 6. Optionally, query the table to verify.
        cursor.execute("SELECT * FROM cancer_table LIMIT 10")
        rows = cursor.fetchall()
        print("First 10 rows in cancer_table:")
        for row in rows:
            print(row)
    finally:
        # Always release the cursor, even if a statement above raised.
        cursor.close()
finally:
    # Always close the HiveServer2 session.
    conn.close()


Data loaded successfully into cancer_db.cancer_table.
First 10 rows in cancer_table:
('P1', 33, 1, 2, 4, 5, 4, 3, 2, 2, 4, 3, 2, 2, 4, 3, 4, 2, 2, 3, 1, 2, 3, 4, 'Low')
('P10', 17, 1, 3, 1, 5, 3, 4, 2, 2, 2, 2, 4, 2, 3, 1, 3, 7, 8, 6, 2, 1, 7, 2, 'Medium')
('P100', 35, 1, 4, 5, 6, 5, 5, 4, 6, 7, 2, 3, 4, 8, 8, 7, 9, 2, 1, 4, 6, 7, 2, 'High')
('P1000', 37, 1, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 8, 4, 2, 3, 1, 4, 5, 6, 7, 5, 'High')
('P101', 46, 1, 6, 8, 7, 7, 7, 6, 7, 7, 8, 7, 7, 9, 3, 2, 4, 1, 4, 2, 4, 2, 3, 'High')
('P102', 35, 1, 4, 5, 6, 5, 5, 4, 6, 7, 2, 3, 4, 8, 8, 7, 9, 2, 1, 4, 6, 7, 2, 'High')
('P103', 52, 2, 2, 4, 5, 4, 3, 2, 2, 4, 3, 2, 2, 4, 3, 4, 2, 2, 3, 1, 2, 3, 4, 'Low')
('P104', 28, 2, 3, 1, 4, 3, 2, 3, 4, 3, 1, 4, 3, 1, 3, 2, 2, 4, 2, 2, 3, 4, 3, 'Low')
('P105', 35, 2, 4, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 5, 1, 4, 3, 2, 4, 6, 2, 4, 1, 'Medium')
('P106', 46, 1, 2, 3, 4, 2, 4, 3, 3, 3, 2, 3, 4, 4, 1, 2, 4, 6, 5, 4, 2, 1, 5, 'Medium')
