### This notebook demonstrates the machine learning pipeline and includes the Airflow DAG definition.
### Airflow is not executed here due to OS limitations (Airflow requires Linux/WSL2), but the pipeline structure, scheduling logic, and tasks are clearly defined.

In [1]:
pip install apache-airflow


Collecting apache-airflow
  Downloading apache_airflow-3.1.7-py3-none-any.whl.metadata (36 kB)
Collecting apache-airflow-core==3.1.7 (from apache-airflow)
  Downloading apache_airflow_core-3.1.7-py3-none-any.whl.metadata (6.4 kB)
Collecting apache-airflow-task-sdk==1.1.7 (from apache-airflow)
  Downloading apache_airflow_task_sdk-1.1.7-py3-none-any.whl.metadata (3.9 kB)
Collecting a2wsgi>=1.10.8 (from apache-airflow-core==3.1.7->apache-airflow)
  Downloading a2wsgi-1.10.10-py3-none-any.whl.metadata (4.0 kB)
Collecting aiosqlite<0.22.0,>=0.20.0 (from apache-airflow-core==3.1.7->apache-airflow)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting alembic<2.0,>=1.13.1 (from apache-airflow-core==3.1.7->apache-airflow)
  Downloading alembic-1.18.3-py3-none-any.whl.metadata (7.2 kB)
Collecting apache-airflow-providers-common-compat>=1.7.4 (from apache-airflow-core==3.1.7->apache-airflow)
  Downloading apache_airflow_providers_common_compat-1.13.0-py3-none-any.whl.meta


[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: C:\Users\anjal\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# load the dataset
data = pd.read_csv('screentime_analysis.csv')

# check for missing values and duplicates
print(data.isnull().sum())
print(data.duplicated().sum())

# convert Date column to datetime and extract features
data['Date'] = pd.to_datetime(data['Date'])
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['Month'] = data['Date'].dt.month

# encode the categorical 'App' column using one-hot encoding
data = pd.get_dummies(data, columns=['App'], drop_first=True)

# scale numerical features using MinMaxScaler
scaler = MinMaxScaler()
data[['Notifications', 'Times Opened']] = scaler.fit_transform(data[['Notifications', 'Times Opened']])

# feature engineering
data['Previous_Day_Usage'] = data['Usage (minutes)'].shift(1)
data['Notifications_x_TimesOpened'] = data['Notifications'] * data['Times Opened']

# save the preprocessed data to a file
data.to_csv('preprocessed_screentime_analysis.csv', index=False)

Date               0
App                0
Usage (minutes)    0
Notifications      0
Times Opened       0
dtype: int64
0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# split data into features and target variable
X = data.drop(columns=['Usage (minutes)', 'Date'])
y = data['Usage (minutes)']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# evaluate the model
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 15.398500000000002


In [5]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def preprocess_data():
    data = pd.read_csv('screentime_analysis.csv')

    data['Date'] = pd.to_datetime(data['Date'])
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data['Month'] = data['Date'].dt.month

    data = data.drop(columns=['Date'])
    data = pd.get_dummies(data, columns=['App'], drop_first=True)

    scaler = MinMaxScaler()
    data[['Notifications', 'Times Opened']] = scaler.fit_transform(
        data[['Notifications', 'Times Opened']]
    )

    data.to_csv('preprocessed_screentime_analysis.csv', index=False)

dag = DAG(
    dag_id='data_preprocessing',
    schedule='@daily',
    start_date=datetime(2025, 1, 1),
    catchup=False
)

preprocess_task = PythonOperator(
    task_id='preprocess',
    python_callable=preprocess_data,
    dag=dag
)
