In [2]:
import pandas as pd
import numpy as np

# 1. Load dataset

In [5]:
df = pd.read_csv("Indian_Kids_Screen_Time.csv")
df.head(10)

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban
5,14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban
6,17,Male,2.97,TV,False,0.48,,Rural
7,10,Male,2.74,TV,True,0.54,,Urban
8,14,Male,4.61,Laptop,True,0.36,"Poor Sleep, Anxiety",Rural
9,18,Male,3.24,Tablet,True,0.48,"Poor Sleep, Obesity Risk",Urban


# 2. Check for Missing Values

In [6]:
df.isnull().sum()

Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64

# 3. Handle missing values

In [8]:
df.dropna(inplace=True)
df

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.30,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban
5,14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban
...,...,...,...,...,...,...,...,...
9707,17,Male,3.26,Smartphone,True,0.44,Poor Sleep,Urban
9708,17,Female,4.43,Smartphone,True,0.40,Poor Sleep,Rural
9709,16,Male,5.62,Smartphone,True,0.39,"Poor Sleep, Eye Strain, Anxiety",Rural
9710,17,Male,5.60,TV,True,0.43,Poor Sleep,Urban


# 4. Check Data Types & Clean Formatting

In [11]:
df.dtypes


Age                                    int64
Gender                                object
Avg_Daily_Screen_Time_hr             float64
Primary_Device                        object
Exceeded_Recommended_Limit              bool
Educational_to_Recreational_Ratio    float64
Health_Impacts                        object
Urban_or_Rural                        object
dtype: object

In [15]:
df['Age'] = df['Age'].astype(int)
df['Avg_Daily_Screen_Time_hr'] = pd.to_numeric(df['Avg_Daily_Screen_Time_hr'], errors='coerce')
df

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.30,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban
5,14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban
...,...,...,...,...,...,...,...,...
9707,17,Male,3.26,Smartphone,True,0.44,Poor Sleep,Urban
9708,17,Female,4.43,Smartphone,True,0.40,Poor Sleep,Rural
9709,16,Male,5.62,Smartphone,True,0.39,"Poor Sleep, Eye Strain, Anxiety",Rural
9710,17,Male,5.60,TV,True,0.43,Poor Sleep,Urban


# Feature Engineering

In [17]:
# Calculate total screen ratio
df['screen_ratio'] = df['Educational_to_Recreational_Ratio'] / (df['Avg_Daily_Screen_Time_hr'] + 1e-5)
df['screen_ratio'] 

0       0.105263
1       0.065076
2       0.085791
4       0.083192
5       0.090164
          ...   
9707    0.134969
9708    0.090293
9709    0.069395
9710    0.076786
9711    0.053921
Name: screen_ratio, Length: 6494, dtype: float64

In [20]:
# Create age group
df['age_group'] = pd.cut(df['Age'],
                         bins=[7, 10, 13, 16, 18],
                         labels=['8-10', '11-13', '14-16', '17-18'])
df['age_group'].head(5)

0    14-16
1    11-13
2    17-18
4    11-13
5    14-16
Name: age_group, dtype: category
Categories (4, object): ['8-10' < '11-13' < '14-16' < '17-18']

In [22]:
# Label risk level based on screen time
df['risk_level'] = df['Avg_Daily_Screen_Time_hr'].apply(lambda x: 'High' if x > 5 else 'Moderate' if x > 3 else 'Low')

In [23]:
df

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,screen_ratio,age_group,risk_level
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban,0.105263,14-16,Moderate
1,11,Female,4.61,Laptop,True,0.30,Poor Sleep,Urban,0.065076,11-13,Moderate
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban,0.085791,17-18,Moderate
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban,0.083192,11-13,High
5,14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban,0.090164,14-16,Moderate
...,...,...,...,...,...,...,...,...,...,...,...
9707,17,Male,3.26,Smartphone,True,0.44,Poor Sleep,Urban,0.134969,17-18,Moderate
9708,17,Female,4.43,Smartphone,True,0.40,Poor Sleep,Rural,0.090293,17-18,Moderate
9709,16,Male,5.62,Smartphone,True,0.39,"Poor Sleep, Eye Strain, Anxiety",Rural,0.069395,14-16,High
9710,17,Male,5.60,TV,True,0.43,Poor Sleep,Urban,0.076786,17-18,High


 # Export Cleaned Data Back to CSV

In [24]:
df.to_csv("cleaned_kids_screentime.csv", index=False)

print("Cleaned CSV file saved successfully!")

Cleaned CSV file saved successfully!
