Cell 1: Install Required Libraries
Install the necessary libraries for the project.

This project demonstrates how MLOps practices streamline machine learning workflows. The notebook first preprocesses the screen-time data and trains a model interactively, then wraps the preprocessing step in an Apache Airflow DAG so that it runs automatically on a daily schedule, making the pipeline efficient, scalable, and reproducible. This approach is essential for maintaining high standards of data quality and model performance in production environments.


In [None]:
!pip install pandas scikit-learn apache-airflow

Cell 2: Import Required Libraries
Import the libraries needed for the project.

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime

Cell 3: Load and Preprocess the Dataset
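Load the dataset, report missing values and duplicates, derive date features, one-hot encode the app name, scale the numeric columns, add engineered features, and save the result to a CSV file.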

In [None]:
# Load the raw screen-time log
data = pd.read_csv('screentime_analysis.csv')

# Report data-quality indicators: missing values per column, then the number
# of fully duplicated rows
print(data.isnull().sum())
print(data.duplicated().sum())

# Convert Date column to datetime and extract calendar features
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['Month'] = data['Date'].dt.month

# Lag feature: the previous recorded usage of the *same app*, in date order.
# It has to be computed while the 'App' column still exists (i.e. before
# one-hot encoding). A plain shift(1) over the whole frame would instead take
# the value from whatever row happens to precede it, regardless of app or date.
# The result keeps the original row index, so it can be attached later.
previous_usage = (
    data.sort_values('Date', kind='stable')
    .groupby('App')['Usage (minutes)']
    .shift(1)
)

# Encode the categorical 'App' column using one-hot encoding
data = pd.get_dummies(data, columns=['App'], drop_first=True)

# Scale numerical features to the [0, 1] range using MinMaxScaler
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed in the training cell.
scaler = MinMaxScaler()
data[['Notifications', 'Times Opened']] = scaler.fit_transform(data[['Notifications', 'Times Opened']])

# Feature engineering (assignment aligns on the row index)
data['Previous_Day_Usage'] = previous_usage
data['Notifications_x_TimesOpened'] = data['Notifications'] * data['Times Opened']

# The first record of every app has no earlier usage, so its lag is NaN.
# Drop those rows so that the saved file and the model training that follows
# never see missing feature values (estimators that reject NaN would fail).
data = data.dropna(subset=['Previous_Day_Usage'])

# Save the preprocessed data to a file
data.to_csv('preprocessed_screentime_analysis.csv', index=False)

Cell 4: Train the Model
Train a Random Forest model to predict app usage.

In [None]:
# Target: usage in minutes. Features: every remaining column except the raw
# timestamp, which the model cannot consume directly.
y = data['Usage (minutes)']
X = data.drop(['Usage (minutes)', 'Date'], axis=1)

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with a fixed seed (fit() returns the estimator itself)
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows with mean absolute error
predictions = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'Mean Absolute Error: {mae}')

Cell 5: Define the Data Preprocessing Function for Airflow
Define the function to preprocess data, which will be used in the Airflow DAG.

In [None]:
def preprocess_data(
    file_path: str = 'screentime_analysis.csv',
    preprocessed_path: str = 'preprocessed_screentime_analysis.csv',
) -> None:
    """Preprocess the raw screen-time CSV and write the result to a new CSV.

    Steps: parse ``Date`` and derive ``DayOfWeek`` and ``Month``, drop the raw
    ``Date`` column, one-hot encode ``App`` (first category dropped), and
    min-max scale ``Notifications`` and ``Times Opened``.

    The defaults reproduce the original hard-coded, working-directory-relative
    paths. When the function runs as an Airflow task, the worker's working
    directory may differ from the notebook's, so absolute paths can be
    supplied instead (e.g. through the operator's ``op_kwargs``).

    Args:
        file_path: Location of the raw CSV. It must contain at least the
            ``Date``, ``App``, ``Notifications`` and ``Times Opened`` columns.
        preprocessed_path: Destination of the preprocessed CSV; an existing
            file at that location is overwritten.
    """
    # NOTE(review): unlike the notebook's interactive preprocessing cell, this
    # function does not add 'Previous_Day_Usage' or
    # 'Notifications_x_TimesOpened', yet it writes to the same default output
    # file - confirm which schema downstream consumers expect.
    data = pd.read_csv(file_path)

    # Derive calendar features, then drop the raw timestamp
    data['Date'] = pd.to_datetime(data['Date'])
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data['Month'] = data['Date'].dt.month
    data = data.drop(columns=['Date'])

    # One-hot encode the app name
    data = pd.get_dummies(data, columns=['App'], drop_first=True)

    # Rescale the count columns to the [0, 1] range
    scaler = MinMaxScaler()
    data[['Notifications', 'Times Opened']] = scaler.fit_transform(data[['Notifications', 'Times Opened']])

    data.to_csv(preprocessed_path, index=False)
    print(f"Preprocessed data saved to {preprocessed_path}")

Cell 6: Define the Airflow DAG
Define the Airflow DAG that runs the preprocessing task once a day (`@daily`), without backfilling runs for past dates (`catchup=False`).

In [None]:
# Daily DAG with a single task that runs preprocess_data.
# `schedule` replaces the `schedule_interval` argument, which is deprecated
# since Airflow 2.4 and no longer accepted by Airflow 3; the unpinned
# `pip install apache-airflow` above installs the latest release.
# catchup=False: do not create runs for the dates between start_date and now.
with DAG(
    dag_id='data_preprocessing',
    schedule='@daily',
    start_date=datetime(2025, 1, 1),
    catchup=False,
) as dag:
    # Operators created inside the `with` block are attached to `dag`
    # automatically, so no explicit dag= argument is needed.
    preprocess_task = PythonOperator(
        task_id='preprocess',
        python_callable=preprocess_data,
    )