### Import required Libraries

In [2]:
import os
import pandas as pd
import logging
from sqlalchemy import create_engine
from PIL import Image

Setup logging for better tracking of progress

In [2]:
logging.basicConfig(filename='logfile.log',level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 #### Convert the scrapped JSON files to DataFrame

In [3]:
def json_to_dataframe(json_path):
    """Converts a JSON file to a pandas DataFrame."""
    try:
        df = pd.read_json(json_path)
        logging.info(f"Successfully loaded data from {json_path}")
        return df
    except ValueError as e:
        logging.error(f"Error reading {json_path}: {e}")
        return pd.DataFrame()  


### Clean the DataFrame by handling missing values, duplicates, etc.

In [4]:
def clean_dataframe(df):
    
    if df.empty:
        logging.warning("DataFrame is empty. Skipping cleaning.")
        return df
    
    df.drop_duplicates(inplace=True)
    df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
    df.fillna(method='bfill', inplace=True)  # Backward fill if any remain
    
    # Convert date columns to datetime
    for col in df.columns:
        if 'date' in col.lower():
            df[col] = pd.to_datetime(df[col], errors='coerce')

    logging.info("Data cleaning completed.")
    return df

### Save the cleaned data and store in a database

In [5]:
# Save cleaned DataFrame as a CSV file
def save_cleaned_data(df, output_path):
    df.to_csv(output_path, index=False)
    logging.info(f"Cleaned data saved to {output_path}")

# Function to store DataFrame in a database
def store_in_database(df, table_name, database_url):
    engine = create_engine(database_url)
    df.to_sql(table_name, engine, if_exists='replace', index=False)
    logging.info(f"Data stored in the '{table_name}' table.")

#### Main function to process all JSON files 

In [6]:

def process_all_json_files(input_folder, output_folder, database_url=None):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Loop through each JSON file in the input folder
    for json_file in os.listdir(input_folder):
        if json_file.endswith('.json'):
            input_path = os.path.join(input_folder, json_file)
            logging.info(f"Processing file: {input_path}")
            
            # Convert JSON to DataFrame
            df = json_to_dataframe(input_path)
            
            # Clean the DataFrame
            cleaned_df = clean_dataframe(df)
            
            # Define output file path
            output_file = os.path.join(output_folder, f"cleaned_{json_file.replace('.json', '.csv')}")
            
            # Save the cleaned data
            save_cleaned_data(cleaned_df, output_file)
            logging.info(f"Saved cleaned data to: {output_file}")
            
            # Optionally, store the cleaned data in a database
            if database_url:
                table_name = os.path.splitext(json_file)[0]  # Use the file name as the table name
                store_in_database(cleaned_df, table_name, database_url)

# Example usage
input_folder = 'C:/Users/elbet/OneDrive/Desktop/Ten/week-7/github/Building-a-data-warehouse/scrapped_json_files'  
output_folder = 'C:/Users/elbet/OneDrive/Desktop/Ten/week-7/github/Building-a-data-warehouse/cleaned_files' 
database_url = 'postgresql://postgres:qwer1234@127.0.0.1:5432/warehouse' 

# Process all JSON files and optionally store in a database
process_all_json_files(input_folder, output_folder, database_url)

  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
  df.fillna(method='bfill', inplace=True)  # Backward fill if any remain
  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
  df.fillna(method='bfill', inplace=True)  # Backward fill if any remain
  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
  df.fillna(method='bfill', inplace=True)  # Backward fill if any remain
  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
  df.fillna(method='bfill', inplace=True)  # Backward fill if any remain
  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
  df.fillna(method='bfill', inplace=True)  # Backward fill if any remain


### Image data Preprocessing

In [3]:
# Function to resize images for object detection (YOLO)
def resize_image(image_path, output_size=(512, 512)):
    try:
        with Image.open(image_path) as img:
            img = img.resize(output_size)
            img.save(image_path)  # Overwrite with resized image
            logging.info(f"Resized image: {image_path}")
    except Exception as e:
        logging.error(f"Error resizing image {image_path}: {e}")
        try:
            os.remove(image_path)  # Delete the image if resizing fails
            logging.info(f"Deleted non-resizable image: {image_path}")
        except Exception as delete_error:
            logging.error(f"Error deleting image {image_path}: {delete_error}")

# Apply the resize function to all images in the folder
def transform_images(image_folder):
    for image_file in os.listdir(image_folder):
        image_path = os.path.join(image_folder, image_file)
        resize_image(image_path)

# Transform all images in the 'images' directory
transform_images('C:/Users/elbet/OneDrive/Desktop/Ten/week-7/github/Building-a-data-warehouse/images/')

ERROR:root:Error resizing image C:/Users/elbet/OneDrive/Desktop/Ten/week-7/github/Building-a-data-warehouse/images/62.jpg: cannot identify image file 'C:\\Users\\elbet\\OneDrive\\Desktop\\Ten\\week-7\\github\\Building-a-data-warehouse\\images\\62.jpg'
ERROR:root:Error resizing image C:/Users/elbet/OneDrive/Desktop/Ten/week-7/github/Building-a-data-warehouse/images/73.jpg: cannot identify image file 'C:\\Users\\elbet\\OneDrive\\Desktop\\Ten\\week-7\\github\\Building-a-data-warehouse\\images\\73.jpg'
ERROR:root:Error resizing image C:/Users/elbet/OneDrive/Desktop/Ten/week-7/github/Building-a-data-warehouse/images/89.jpg: cannot identify image file 'C:\\Users\\elbet\\OneDrive\\Desktop\\Ten\\week-7\\github\\Building-a-data-warehouse\\images\\89.jpg'
