[Reference](https://medium.com/@tubelwj/python-optimus-library-a-powerful-tool-for-big-data-processing-and-etl-06d2d6f5b26a)

In [1]:
pip install optimuspyspark

In [2]:
from optimus import Optimus

# Create the Optimus entry point with its default settings. The resulting
# `op` object is what the later cells use to load data (e.g. `op.load.csv`).
op = Optimus()

In [3]:
from optimus import Optimus

# Build the Optimus entry point with an explicit Spark master and
# application name rather than the defaults.
spark_settings = {"master": "local", "app_name": "optimus_test"}
op = Optimus(**spark_settings)

In [4]:
from optimus import Optimus

# Start Optimus, read the CSV file into a DataFrame and print its first rows.
op = Optimus()
csv_path = 'business.csv'
df = op.load.csv(csv_path)
df.show()

In [5]:
from optimus import Optimus

# Start Optimus and read the orders file
op = Optimus()
df = op.load.csv('business.csv')

# Print the column names and types
df.printSchema()

# Rows whose total amount is above 2000
amount_is_large = df["total_amount"] > 2000.0
filtered_df = df.where(amount_is_large)

# Keep only the order id and amount columns
selected_df = df.select(["order_id", "total_amount"])

# Add a column holding twice the profit, then display the result
doubled_profit = df["profit"] * 2
df = df.withColumn("profit_double", doubled_profit)
df.show()

In [6]:
from optimus import Optimus

# Start Optimus and read the orders file
op = Optimus()
df = op.load.csv('business.csv')

# Average total_amount for each value of the "group" column
mean_spec = {"total_amount": "mean"}
grouped_df = df.groupBy("group").agg(mean_spec)
grouped_df.show()

In [7]:
# Generate data summary report
# The profiler is reached through the Optimus object (`op.profiler`) and takes
# the DataFrame as an argument; "*" selects every column. `to_json` hands the
# report back as a value, so there is something meaningful to print.
# This cell reuses `op` and `df` created by an earlier cell.
summary = op.profiler.to_json(df, "*")
print(summary)

In [8]:
# One-hot encoding
# NOTE(review): "column2" looks like a placeholder name, and this cell relies
# on `df` left over from an earlier cell. Whether the installed Optimus release
# exposes `cols.one_hot_encode` could not be confirmed — TODO verify.
df = df.cols.one_hot_encode("column2")

# Feature selection
# Only "column1" and "column3" are requested here; anything derived from
# "column2" by the line above is not in this list — presumably unintended,
# TODO confirm.
df = df.cols.select(["column1", "column3"])

In [9]:
# Create a data pipeline
# NOTE(review): relies on `op` and `df` left over from an earlier cell.
# `op.Pipeline()` and the string-named steps below could not be confirmed
# against the Optimus API — TODO verify before relying on this cell.
pipeline = op.Pipeline()

# Add operations to the pipeline
# "column1".."column3" look like placeholder column names — TODO replace with
# real columns from the dataset.
pipeline.add("drop_missing", ["column1"])
pipeline.add("fill_na", "column2", value="Unknown")
pipeline.add("outliers_replace", ["column3"], method="median")

# Execute pipeline operations
df = pipeline.run(df)

In [10]:
from optimus import Optimus

# Create an Optimus object
op = Optimus()

# Load dataset
df = op.load.csv('business.csv')

# Remove duplicates
df = df.dropDuplicates()

# Handle missing values: nulls in total_amount become 0
df = df.fillna({"total_amount": 0})

# Handle outliers: the DataFrame's `outliers` attribute is an accessor, not a
# callable, so choose the z-score detector from it and drop the rows it flags
# for total_amount at a threshold of 3.
# NOTE(review): use `.select()` instead of `.drop()` to inspect the flagged
# rows rather than remove them.
df = df.outliers.z_score("total_amount", threshold=3).drop()
df.show()

In [11]:
from optimus import Optimus

# Create an Optimus object
op = Optimus()

# Load dataset
df = op.load.csv('business.csv')

# Convert data types. Spark DataFrames have no pandas-style `astype`; each
# column is replaced by a cast copy of itself instead.
target_types = {"PCs": "int", "total_amount": "float"}
for column_name, spark_type in target_types.items():
    df = df.withColumn(column_name, df[column_name].cast(spark_type))
df.printSchema()

In [12]:
from optimus import Optimus

# Create an Optimus object
op = Optimus()

# Load dataset
df = op.load.csv('business.csv')

# Plot a histogram of total_amount with 10 buckets. The bucket count is passed
# positionally because Optimus' `plot.hist` names this parameter `buckets`,
# so the pandas/matplotlib-style `bins=` keyword is rejected.
df.plot.hist("total_amount", 10)

In [13]:
from optimus import Optimus
from sklearn.linear_model import LinearRegression

# Start Optimus and read the orders file
op = Optimus()
df = op.load.csv('business.csv')

# Pull the feature columns and the target column into NumPy arrays via pandas
feature_frame = df.select("PCs", "total_amount").toPandas()
target_frame = df.select("target").toPandas()
X = feature_frame.values
y = target_frame.values.ravel()

# Fit a linear regression (fit returns the estimator itself)
model = LinearRegression().fit(X, y)

# Predict on the same rows that were used for fitting
predictions = model.predict(X)
print(predictions)

In [14]:
from optimus import Optimus
from pyspark.sql import Window
from pyspark.sql.functions import avg, col

# Create an Optimus object
op = Optimus()

# Simulate real-time data stream as (time, value) rows
data = [(i, i * 4 + (i % 6)) for i in range(3000)]

# Convert to an Optimus DataFrame: column specs (name, type, nullable) first,
# then the rows as a list of tuples.
df = op.create.df([("time", "int", True), ("value", "int", True)], data)

# Compute a 15-row rolling average. Spark columns have no pandas-style
# `.rolling()`, so average over a window of the current row and the 14 rows
# before it, ordered by time. Unlike pandas, the first 14 rows average over
# however many rows are available instead of being null.
# An un-partitioned window moves all rows to one partition; fine at this size.
ROLLING_WINDOW = 15
trailing_rows = Window.orderBy("time").rowsBetween(-(ROLLING_WINDOW - 1), Window.currentRow)
df = df.withColumn("rolling_mean", avg(col("value")).over(trailing_rows))
df.show()

In [15]:
from optimus import Optimus
from pyspark.sql.functions import col

# Start Optimus and read the location file
op = Optimus()
df = op.load.csv('geospatial_location_data.csv')

# Cast both coordinate columns to float
for coordinate in ("longitude", "latitude"):
    df = df.withColumn(coordinate, col(coordinate).cast("float"))

# Mean latitude and longitude for each region
region_means = {"latitude": "mean", "longitude": "mean"}
agg_df = df.groupBy("region").agg(region_means)
agg_df.show()

In [16]:
from optimus import Optimus
from pyspark.sql import Window
from pyspark.sql.functions import col, date_format, lag

# Create an Optimus object
op = Optimus()

# Load financial dataset
df = op.load.csv('financial_data.csv')

# Calculate stock returns. Spark columns have no pandas-style `.shift()`;
# the previous row's close comes from `lag` over a window ordered by date.
# The first row has no predecessor, so its return is null.
# NOTE(review): assumes the file holds a single price series — add
# `partitionBy(<ticker column>)` to the window if it holds several.
by_date = Window.orderBy("date")
previous_close = lag(col("close"), 1).over(by_date)
df = df.withColumn("return", (col("close") / previous_close) - 1)

# Aggregate to calculate monthly returns. `.dt.to_period("M")` is pandas-only;
# a "yyyy-MM" string derived from the date serves as the month key.
month_key = date_format(col("date"), "yyyy-MM").alias("month")
monthly_returns = df.groupBy(month_key).agg({"return": "sum"})
monthly_returns.show()