In [1]:
import pandas as pd 
import numpy as np

In [2]:
#Load the dataset
df = pd.read_csv('screentime_analysis.csv')

df.head(5)

Unnamed: 0,Date,App,Usage (minutes),Notifications,Times Opened
0,2024-08-07,Instagram,81,24,57
1,2024-08-08,Instagram,90,30,53
2,2024-08-26,Instagram,112,33,17
3,2024-08-22,Instagram,82,11,38
4,2024-08-12,Instagram,59,47,16


In [3]:
#Data Preprocessing

#Check for missing values and duplicates
print(df.isnull().sum())
print(df.duplicated().sum())

Date               0
App                0
Usage (minutes)    0
Notifications      0
Times Opened       0
dtype: int64
0


In [4]:
#Extracting features from date column
df['Date'] = pd.to_datetime(df['Date']) #Covert to datetime
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek

print(df.head())


        Date        App  Usage (minutes)  Notifications  Times Opened  Year  \
0 2024-08-07  Instagram               81             24            57  2024   
1 2024-08-08  Instagram               90             30            53  2024   
2 2024-08-26  Instagram              112             33            17  2024   
3 2024-08-22  Instagram               82             11            38  2024   
4 2024-08-12  Instagram               59             47            16  2024   

   Month  DayOfWeek  
0      8          2  
1      8          3  
2      8          0  
3      8          3  
4      8          0  


In [5]:
#Encoding Categorical Column i.e. "App"
df = pd.get_dummies(df, columns = ['App'], drop_first = True)

df.head()

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,Year,Month,DayOfWeek,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X
0,2024-08-07,81,24,57,2024,8,2,False,True,False,False,False,False,False
1,2024-08-08,90,30,53,2024,8,3,False,True,False,False,False,False,False
2,2024-08-26,112,33,17,2024,8,0,False,True,False,False,False,False,False
3,2024-08-22,82,11,38,2024,8,3,False,True,False,False,False,False,False
4,2024-08-12,59,47,16,2024,8,0,False,True,False,False,False,False,False


In [7]:
#Scaling numerical features using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df[['Notifications','Times Opened']] = scaler.fit_transform(df[['Notifications','Times Opened']])

In [8]:
df.head()

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,Year,Month,DayOfWeek,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X
0,2024-08-07,81,0.163265,0.571429,2024,8,2,False,True,False,False,False,False,False
1,2024-08-08,90,0.204082,0.530612,2024,8,3,False,True,False,False,False,False,False
2,2024-08-26,112,0.22449,0.163265,2024,8,0,False,True,False,False,False,False,False
3,2024-08-22,82,0.07483,0.377551,2024,8,3,False,True,False,False,False,False,False
4,2024-08-12,59,0.319728,0.153061,2024,8,0,False,True,False,False,False,False,False


In [9]:
#Feature Engineering

df['Previous Day Usage'] = df['Usage (minutes)'].shift(1)
df['Notificatons_x_TimesOpened'] = df['Notifications'] * df['Times Opened']

df.to_csv('preprocessed_screentime_data.csv', index=False)

In [10]:
preprocessed_data = pd.read_csv('preprocessed_screentime_data.csv')
preprocessed_data.head()

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,Year,Month,DayOfWeek,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X,Previous Day Usage,Notificatons_x_TimesOpened
0,2024-08-07,81,0.163265,0.571429,2024,8,2,False,True,False,False,False,False,False,,0.093294
1,2024-08-08,90,0.204082,0.530612,2024,8,3,False,True,False,False,False,False,False,81.0,0.108288
2,2024-08-26,112,0.22449,0.163265,2024,8,0,False,True,False,False,False,False,False,90.0,0.036651
3,2024-08-22,82,0.07483,0.377551,2024,8,3,False,True,False,False,False,False,False,112.0,0.028252
4,2024-08-12,59,0.319728,0.153061,2024,8,0,False,True,False,False,False,False,False,82.0,0.048938


In [11]:
#Building a simple model to predict usage in minutes

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

X = df.drop(columns=['Usage (minutes)','Date'])
y = df['Usage (minutes)']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state= 42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train,y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print('Mean Absolute Error:', mae)

Mean Absolute Error: 15.496500000000001


In [12]:
predictions

array([18.78, 74.64, 28.2 , 28.23, 60.21,  6.86, 54.23, 39.6 , 31.55,
       29.24, 63.2 , 16.44, 37.85, 15.86, 19.98, 24.27, 36.98, 32.96,
       20.56, 43.62, 10.22, 92.82, 68.84, 20.74, 58.23, 37.69, 18.51,
       39.63, 87.64, 83.05, 66.54, 93.49, 37.39, 75.79, 50.77, 15.96,
       68.9 , 69.05, 34.69, 18.25])

In [26]:
X_test 

Unnamed: 0,Notifications,Times Opened,Year,Month,DayOfWeek,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X,Previous Day Usage,Notificatons_x_TimesOpened
95,0.027211,0.142857,2024,8,5,False,False,False,False,False,False,False,21.0,0.003887
15,0.380952,0.489796,2024,8,3,False,True,False,False,False,False,False,89.0,0.186589
30,0.306122,0.102041,2024,8,2,False,False,False,False,False,False,True,41.0,0.031237
158,0.29932,0.204082,2024,8,2,True,False,False,False,False,False,False,28.0,0.061086
128,0.006803,0.010204,2024,8,5,False,False,False,True,False,False,False,108.0,6.9e-05
115,0.006803,0.020408,2024,8,2,False,False,False,False,True,False,False,20.0,0.000139
69,0.938776,0.795918,2024,8,2,False,False,False,False,False,True,False,59.0,0.747189
170,0.108844,0.336735,2024,8,1,True,False,False,False,False,False,False,54.0,0.036651
174,0.367347,0.214286,2024,8,2,True,False,False,False,False,False,False,19.0,0.078717
45,0.217687,0.102041,2024,8,0,False,False,False,False,False,False,True,34.0,0.022213


In [27]:
predictions_data = {'Notifications': 0.017,
                   'Times Opened': 0.385,
                   'Year': 2024,
                   'Month': 5,
                   'DayOfWeek': 1,
                   'App_Facebook': True,
                   'App_Instagram': False,
                   'App_LinkedIn': False,
                   'App_Netflix': False,
                   'App_Safari': False,
                   'App_WhatsApp': False,
                    'App_X': False,
                   'Previous Day Usage': 120,
                   'Notificatons_x_TimesOpened': 0.006545}

predictions_data = pd.DataFrame(predictions_data, index=[0])
pred = model.predict(predictions_data)
print('Predicted Usage:', pred[0])

Predicted Usage: 56.62


### Building a pipeline to automate the preprocessing using Apache Airflow

In [1]:
#pip install apache-airflow

#Create and define DAG

from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime

#Create preprocess function

def preprocess_data():
    file_path = 'screentime_analysis.csv'
    df = pd.read_csv(file_path)

    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['DayOfWeek'] = df['Date'].dt.dayofweek

    df= df.drop(columns=['Date'])

    df = pd.get_dummies(df, columns =['App'], drop_first = True)

    scaler = MinMaxScaler()
    df[['Notifications','Times Opened']] = scaler.fit_transform(df[['Notifications','Times Opened']])

    preprocessed_data_path = 'preprocessed_screentime_data.csv'
    df.to_csv(preprocessed_data_path, index=False)
    print(f"Preprocessed data saved to {preprocessed_data_path}")

#Create a DAG

dag = DAG(
    dag_id = 'data_preprocessing',
    schedule_interval='@daily',
    start_date=datetime(2025, 1, 1),
    catchup=False
)

#Define task

preprocess_task = PythonOperator(
    task_id = 'preprocess',
    python_callable=preprocess_data,
    dag=dag
)





### Testing and Running the pipeline in the terminal

In [None]:

#airflow db init or airflow db migrate

#airflow webserver -p 8080

#airflow scheduler

'''Naviagate to the Airflow UI at http://localhost:8080/ and enable the DAG. Once there, enable the data_preprocessing DAG and manually trigger it to execute the defined tasks. After the DAG has run successfully, validate the output by checking the preprocessed file to ensure it contains the updated and preprocessed data.'''