In [1]:
import os
# Show the kernel's current working directory (the notebook is launched from research/).
%pwd

'c:\\Users\\hp\\Documents\\DS\\Complete Project\\03-Air-Quality-Index-Predictor\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\hp\\Documents\\DS\\Complete Project\\03-Air-Quality-Index-Predictor'

In [9]:
from dataclasses import dataclass, field
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the AstraDB data-ingestion step.

    Instances are frozen, so attributes cannot be reassigned after creation.
    ``password`` is excluded from the generated ``repr`` so that printing or
    logging the config object does not expose the secret.
    """

    root_dir: Path  # Root directory where data will be stored
    secure_connect_bundle: Path  # Path to the secure connect bundle for connecting to AstraDB
    username: str  # AstraDB username
    password: str = field(repr=False)  # AstraDB password (kept out of repr/log output)
    keyspace: str  # Keyspace in AstraDB
    table_name: List[str]  # List of table names in AstraDB
    region_name: str  # Region name in AstraDB
    output_file: Path  # Path to save the downloaded data

In [6]:
from Air_Quality_Predictor.constants import *
from Air_Quality_Predictor.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    """Loads the project's YAML files and builds typed configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        # read_yaml's result is accessed by attribute below
        # (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Also creates the ingestion root directory through ``create_directories``.

        Returns:
            DataIngestionConfig: settings for connecting to AstraDB and for
            where the downloaded data is written.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The path-like fields are wrapped in Path so the object matches the
        # types DataIngestionConfig declares; every other value is passed
        # through exactly as it was read from the YAML.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            secure_connect_bundle=Path(section.secure_connect_bundle),
            username=section.username,
            password=section.password,
            keyspace=section.keyspace,
            table_name=section.table_name,
            region_name=section.region_name,
            output_file=Path(section.output_file),
        )

In [14]:
from cassandra.cluster import Cluster
import pandas as pd
from cassandra.auth import PlainTextAuthProvider
from Air_Quality_Predictor.logging import logger


In [18]:
class DataIngestion:
    """Downloads the configured AstraDB tables and stores them as a single CSV file."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings used by ``download_data``."""
        self.config = config

    def download_data(self, request_timeout=None):
        """Fetch every configured table and write the combined rows to ``output_file``.

        Each name in ``config.table_name`` is read with ``SELECT *``, turned
        into a DataFrame, and all frames are concatenated (index reset) before
        being saved as CSV without the index column.

        Args:
            request_timeout: Optional value forwarded as ``timeout`` to
                ``session.execute``. When left as ``None`` the argument is not
                passed at all, so the driver's own default applies. Raise it
                when large tables fail with "Client request timeout".

        Raises:
            Exception: any error from connecting, querying or writing is
                logged and re-raised unchanged.
        """
        # Create a connection to the AstraDB cluster
        protocol_version = 4
        cloud_config = {'secure_connect_bundle': str(self.config.secure_connect_bundle)}
        auth_provider = PlainTextAuthProvider(username=self.config.username, password=self.config.password)
        cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider, protocol_version=protocol_version)

        # Only forward ``timeout`` when the caller asked for one.
        execute_kwargs = {} if request_timeout is None else {"timeout": request_timeout}
        session = None

        try:
            # Connecting happens inside the try block so that a failed connect
            # still reaches the ``finally`` clause and releases the cluster.
            session = cluster.connect(self.config.keyspace)

            frames = []
            for table_name in self.config.table_name:
                # Table names come from the project's own configuration file;
                # they are interpolated as identifiers, not user input.
                query = f"SELECT * FROM {table_name}"
                result_set = session.execute(query, **execute_kwargs)
                frames.append(pd.DataFrame(list(result_set)))

            result_df = pd.concat(frames, ignore_index=True)
            result_df.to_csv(self.config.output_file, index=False)
            logger.info(f"Data downloaded and saved to {self.config.output_file}")

        except Exception as e:
            logger.error(f"Error downloading data: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        finally:
            # Close the session (if one was opened) and the cluster
            if session is not None:
                session.shutdown()
            cluster.shutdown()


In [20]:
# Run the ingestion step end to end. There is deliberately no try/except here:
# catching an exception only to re-raise it adds nothing, and letting it
# propagate keeps the original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_data()

[2024-05-01 15:50:38,654 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-05-01 15:50:38,659 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-05-01 15:50:38,662 : INFO : common : Created directory at: artifacts]
[2024-05-01 15:50:38,662 : INFO : common : Created directory at: artifacts/data_ingestion]
[2024-05-01 15:50:45,570 : INFO : policies : Using datacenter 'eu-west-1' for DCAwareRoundRobinPolicy (via host '5af7119b-a6dd-47a6-be00-196398f826d7-eu-west-1.db.astra.datastax.com:29042:b8df861c-a1c5-44ee-8ade-0c6e99701d67'); if incorrect, please specify a local_dc to the constructor, or limit contact points to local cluster nodes]
[2024-05-01 16:17:48,186 : ERROR : 3633161049 : Error downloading data: errors={'5af7119b-a6dd-47a6-be00-196398f826d7-eu-west-1.db.astra.datastax.com:29042:396fbeff-567f-440c-9442-8c9d393f6318': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=5af7119b-a6dd-47a6-be00-196398f826d7-eu-west-1

OperationTimedOut: errors={'5af7119b-a6dd-47a6-be00-196398f826d7-eu-west-1.db.astra.datastax.com:29042:396fbeff-567f-440c-9442-8c9d393f6318': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=5af7119b-a6dd-47a6-be00-196398f826d7-eu-west-1.db.astra.datastax.com:29042:396fbeff-567f-440c-9442-8c9d393f6318

In [None]:
from cassandra.cluster import Cluster
import pandas as pd
from cassandra.auth import PlainTextAuthProvider
# Astra DB connection settings.
# SECURITY: the username/password are read from environment variables instead of
# being stored in the notebook. The values that were previously hard-coded in
# this cell should be treated as leaked and rotated in Astra DB.
username = os.environ['ASTRA_DB_USERNAME']
password = os.environ['ASTRA_DB_PASSWORD']
keyspace = 'data'
cluster_id = 'aqdata1'
region = "eu-west-1"

# Create a connection to your Astra DB cluster.
# The bundle location can be overridden with ASTRA_DB_SECURE_BUNDLE; the previous
# machine-specific path is kept only as a fallback.
cloud_config = {
    'secure_connect_bundle': os.environ.get(
        'ASTRA_DB_SECURE_BUNDLE',
        'C:\\Users\\HP\\Downloads\\secure-connect-air-quality-index-data.zip',
    )
}
auth_provider = PlainTextAuthProvider(username=username, password=password)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = None

try:
    session = cluster.connect(keyspace)

    # Define your query
    query = "SELECT * FROM aqdata1"

    # Execute the query and fetch the results
    result_set = session.execute(query)

    # Convert the result set to a DataFrame
    df = pd.DataFrame(list(result_set))
    print(df.head())
    df.to_csv('aqdata1.csv')
finally:
    # Close the session and cluster even when connecting or querying fails
    if session is not None:
        session.shutdown()
    cluster.shutdown()