In [8]:
import pandas as pd
import io
import numpy as np

In [14]:
import numpy as np
import pandas as pd

def gen_random_input_data(n_rows=1000, target_zeros=600, target_ones=400, seed=42):
    """Generate a synthetic, class-balanced vehicle telemetry dataset.

    Sensor readings are drawn uniformly from fixed ranges, a weighted
    "maintenance score" is computed from the range-normalized readings, and
    rows whose score exceeds 1.8 are labelled ``maintenance_required = 1``.
    The labelled rows are then resampled so the result holds exactly
    ``target_zeros`` rows of class 0 and ``target_ones`` rows of class 1.

    Each class is resampled independently: rows are drawn with replacement
    (oversampling) only for a class that has fewer rows than its target, so
    a class with enough rows never receives duplicated rows.

    Note: this function seeds NumPy's global random generator with ``seed``.

    :param n_rows: Number of raw rows generated before balancing
    :param target_zeros: Number of rows with maintenance_required = 0 in the result
    :param target_ones: Number of rows with maintenance_required = 1 in the result
    :param seed: Seed used for data generation and for the sampling steps
    :return: DataFrame with columns vehicle_id, timestamp, engine_temperature,
        tire_pressure, engine_rpm, vehicle_speed, mileage and
        maintenance_required, sorted by the original row index
    :raises ValueError: If a class has no rows to sample from although its
        target is greater than zero
    """
    # (low, high) bounds used both for drawing values and for normalization
    tire_pressure_range = (28, 36)
    mileage_range = (5000, 20000)
    engine_temperature_range = (-20, 95)
    engine_rpm_range = (800, 7500)
    vehicle_speed_range = (0, 303)

    def weighted(weight, values, value_range):
        # Weight times the value scaled to its range; evaluated in the order
        # weight * (v - low) / (high - low).
        low, high = value_range
        return weight * (values - low) / (high - low)

    def sample_class(group, target, label):
        # Draw `target` rows from one class, oversampling only if it is short.
        if target > 0 and group.empty:
            raise ValueError(
                f"No rows with maintenance_required = {label} were generated; "
                f"cannot sample {target} rows."
            )
        return group.sample(target, replace=len(group) < target, random_state=seed)

    # Generate random data using numpy (draw order determines the values)
    np.random.seed(seed)
    vehicle_ids = [f"V{str(i).zfill(4)}" for i in range(1, n_rows + 1)]
    timestamps = pd.date_range(start="2025-01-01", periods=n_rows, freq="min")
    engine_temperatures = np.random.uniform(*engine_temperature_range, n_rows)
    tire_pressures = np.random.uniform(*tire_pressure_range, n_rows)
    engine_rpms = np.random.uniform(*engine_rpm_range, n_rows)
    vehicle_speeds = np.random.uniform(*vehicle_speed_range, n_rows)
    mileages = np.random.uniform(*mileage_range, n_rows)

    # Vectorized maintenance score: tire pressure lowers the score, every
    # other reading raises it.
    maintenance_score = (
        weighted(-0.5, tire_pressures, tire_pressure_range)
        + weighted(0.9, mileages, mileage_range)
        + weighted(0.7, engine_temperatures, engine_temperature_range)
        + weighted(0.8, engine_rpms, engine_rpm_range)
        + weighted(0.8, vehicle_speeds, vehicle_speed_range)
    )
    maintenance_required = (maintenance_score > 1.8).astype(int)

    # Combine data into a DataFrame
    data = pd.DataFrame({
        "vehicle_id": vehicle_ids,
        "timestamp": timestamps,
        "engine_temperature": engine_temperatures,
        "tire_pressure": tire_pressures,
        "engine_rpm": engine_rpms,
        "vehicle_speed": vehicle_speeds,
        "mileage": mileages,
        "maintenance_required": maintenance_required,
    })

    # Balance the maintenance_required column, one class at a time
    balanced_zeros = sample_class(data[data["maintenance_required"] == 0], target_zeros, 0)
    balanced_ones = sample_class(data[data["maintenance_required"] == 1], target_ones, 1)

    # Combine the balanced groups, restoring the original row order
    return pd.concat([balanced_zeros, balanced_ones]).sort_index()


In [15]:

def upload_dataframe_to_s3(df, bucket, s3_client, object_name=None):
    """Upload a Pandas DataFrame as a CSV file to an S3 bucket.

    The DataFrame is serialized to CSV in memory (without the index column)
    and sent with a single ``put_object`` call.

    :param df: Pandas DataFrame to upload
    :param bucket: Bucket to upload to
    :param s3_client: S3 client object; its ``put_object`` method is called
        with ``Bucket``, ``Key`` and ``Body`` keyword arguments
    :param object_name: S3 object name (key). There is no fallback name, so
        a missing or empty value is rejected and nothing is uploaded
    :return: True if file was uploaded, else False
    """
    # Reject a missing key up front instead of sending Key=None to S3.
    if not object_name:
        print("Error occurred while uploading DataFrame to S3: object_name must be a non-empty S3 key.")
        return False

    try:
        # Convert the DataFrame to a CSV string
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False)
        # Upload the CSV string to S3
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=csv_buffer.getvalue())
    except Exception as e:
        # Best-effort boundary: report any failure and signal it via the return value.
        print(f"Error occurred while uploading DataFrame to S3: {e}")
        return False

    print(f"DataFrame successfully uploaded to S3 bucket '{bucket}' as '{object_name}'.")
    return True