In [1]:
import pandas as pd

# Example DataFrame creation
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24, 30, 22, 35],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}

df = pd.DataFrame(data)

# Function to filter DataFrame by age
def filter_by_age(dataframe, age_threshold):
    """
    Filters the DataFrame to include only rows where the Age is greater than the specified threshold.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to filter.
    age_threshold (int): The age threshold for filtering.

    Returns:
    pd.DataFrame: A new DataFrame containing only the rows that meet the age criteria.
    """
    return dataframe[dataframe['Age'] > age_threshold]

# Example usage of the filter function
filtered_df = filter_by_age(df, 25)
print("Filtered DataFrame (Age > 25):")
print(filtered_df)


Filtered DataFrame (Age > 25):
    Name  Age         City
1    Bob   30  Los Angeles
3  David   35      Houston


In [2]:
import pandas as pd
from sqlalchemy import create_engine
import logging
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Create a dummy dataset
def create_dummy_data():
    """
    Create a dummy DataFrame for testing purposes.
    
    Returns:
    pd.DataFrame: A DataFrame containing dummy data.
    """
    logging.info("Creating dummy data.")
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [24, 30, 22, 35],
        'City': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
        'Salary': [70000, 80000, 60000, 90000]  # Added Salary for more complexity
    }
    return pd.DataFrame(data)

# Step 2: Data Processing
def process_data(dataframe):
    """
    Process the DataFrame by filtering and cleaning data.
    
    Parameters:
    dataframe (pd.DataFrame): The DataFrame to process.
    
    Returns:
    pd.DataFrame: The processed DataFrame.
    """
    logging.info("Processing data.")
    # Check if 'Age' column exists before filtering
    if 'Age' in dataframe.columns:
        processed_df = dataframe[dataframe['Age'] > 25]
        logging.info(f"Filtered data to {len(processed_df)} records where Age > 25.")
        
        # Standardize the Salary column
        if 'Salary' in processed_df.columns:
            scaler = StandardScaler()
            processed_df['Salary'] = scaler.fit_transform(processed_df[['Salary']])
            logging.info("Standardized the Salary column.")
        else:
            logging.warning("Salary column not found for standardization.")
    else:
        logging.error("The DataFrame does not contain an 'Age' column.")
        raise KeyError("The DataFrame does not contain an 'Age' column. Please check the input data.")
    
    return processed_df

# Step 3: Data Storage
def save_data(dataframe, db_connection_string, table_name):
    """
    Save the DataFrame to a SQL database.
    
    Parameters:
    dataframe (pd.DataFrame): The DataFrame to save.
    db_connection_string (str): The database connection string.
    table_name (str): The name of the table to save data to.
    """
    logging.info(f"Saving data to {table_name} in the database.")
    engine = create_engine(db_connection_string)
    try:
        dataframe.to_sql(table_name, con=engine, if_exists='replace', index=False)
        logging.info("Data saved successfully.")
    except Exception as e:
        logging.error(f"Error saving data: {e}")
        raise

# Step 4: Build and Execute the Data Pipeline
def run_data_pipeline(db_connection_string, table_name):
    """
    Execute the data pipeline: create, process, and save data.
    
    Parameters:
    db_connection_string (str): The database connection string.
    table_name (str): The name of the table to save data to.
    """
    try:
        # Create dummy data
        df = create_dummy_data()
        
        # Process data
        processed_df = process_data(df)
        
        # Save data to a database
        save_data(processed_df, db_connection_string, table_name)
    except Exception as e:
        logging.error(f"An error occurred during the data pipeline execution: {e}")

# Example usage
if __name__ == "__main__":
    run_data_pipeline('sqlite:///my_database.db', 'filtered_data')

2024-11-20 13:11:15,914 - INFO - Creating dummy data.
2024-11-20 13:11:15,917 - INFO - Processing data.
2024-11-20 13:11:15,920 - INFO - Filtered data to 2 records where Age > 25.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df['Salary'] = scaler.fit_transform(processed_df[['Salary']])
2024-11-20 13:11:15,933 - INFO - Standardized the Salary column.
2024-11-20 13:11:15,936 - INFO - Saving data to filtered_data in the database.
2024-11-20 13:11:16,117 - INFO - Data saved successfully.
