# Dry Eye Disease Prediction

## Data Loading

In [19]:
# Importing dataset via kaggle

!kaggle datasets download -d dakshnagra/dry-eye-disease

Dataset URL: https://www.kaggle.com/datasets/dakshnagra/dry-eye-disease
License(s): DbCL-1.0
Downloading dry-eye-disease.zip to /home/bhxveshhh/ML/Thyroid Disease Prediction
100%|█████████████████████████████████████████| 418k/418k [00:01<00:00, 336kB/s]
100%|█████████████████████████████████████████| 418k/418k [00:01<00:00, 336kB/s]


In [3]:
# Exporting dataset from zipfile

import zipfile
zip_ref = zipfile.ZipFile('/home/bhxveshhh/ML/Dry Eye Disease Prediction/dry-eye-disease.zip', 'r')
zip_ref.extractall('/home/bhxveshhh/ML/Dry Eye Disease Prediction')
zip_ref.close()

## Exploratory Data Analysis

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('Dry_Eye_Dataset.csv')

In [6]:
df.head(5)

Unnamed: 0,Gender,Age,Sleep duration,Sleep quality,Stress level,Blood pressure,Heart rate,Daily steps,Physical activity,Height,...,Smoking,Medical issue,Ongoing medication,Smart device before bed,Average screen time,Blue-light filter,Discomfort Eye-strain,Redness in eye,Itchiness/Irritation in eye,Dry Eye Disease
0,F,24,9.5,2,1,137/89,67,3000,31,161,...,N,Y,Y,N,8.7,N,Y,Y,N,Y
1,M,39,9.6,2,3,108/64,60,12000,74,164,...,N,Y,Y,N,9.6,Y,N,N,Y,Y
2,F,45,5.4,1,5,134/81,95,12000,93,179,...,N,N,N,Y,4.0,N,Y,N,N,N
3,F,45,5.4,4,5,110/90,78,19000,32,160,...,N,Y,N,N,7.6,N,Y,N,Y,N
4,F,42,5.7,3,2,99/67,72,4000,173,179,...,N,Y,N,N,3.5,N,Y,Y,N,Y


In [7]:
df.shape

(20000, 26)

In [8]:
df.size

520000

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Gender                       20000 non-null  object 
 1   Age                          20000 non-null  int64  
 2   Sleep duration               20000 non-null  float64
 3   Sleep quality                20000 non-null  int64  
 4   Stress level                 20000 non-null  int64  
 5   Blood pressure               20000 non-null  object 
 6   Heart rate                   20000 non-null  int64  
 7   Daily steps                  20000 non-null  int64  
 8   Physical activity            20000 non-null  int64  
 9   Height                       20000 non-null  int64  
 10  Weight                       20000 non-null  int64  
 11  Sleep disorder               20000 non-null  object 
 12  Wake up during night         20000 non-null  object 
 13  Feel sleepy duri

In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,20000.0,31.4228,8.103717,18.0,24.0,31.0,39.0,45.0
Sleep duration,20000.0,6.998245,1.731723,4.0,5.5,7.0,8.5,10.0
Sleep quality,20000.0,2.99725,1.412283,1.0,2.0,3.0,4.0,5.0
Stress level,20000.0,2.99375,1.407235,1.0,2.0,3.0,4.0,5.0
Heart rate,20000.0,79.9122,11.808279,60.0,70.0,80.0,90.0,100.0
Daily steps,20000.0,10536.9,5752.729186,1000.0,6000.0,11000.0,16000.0,20000.0
Physical activity,20000.0,90.06975,52.317283,0.0,45.0,91.0,135.0,180.0
Height,20000.0,174.8659,14.719903,150.0,162.0,175.0,188.0,200.0
Weight,20000.0,74.89185,14.733839,50.0,62.0,75.0,88.0,100.0
Average screen time,20000.0,5.519885,2.606305,1.0,3.3,5.5,7.8,10.0


In [11]:
df.isnull().sum()

Gender                         0
Age                            0
Sleep duration                 0
Sleep quality                  0
Stress level                   0
Blood pressure                 0
Heart rate                     0
Daily steps                    0
Physical activity              0
Height                         0
Weight                         0
Sleep disorder                 0
Wake up during night           0
Feel sleepy during day         0
Caffeine consumption           0
Alcohol consumption            0
Smoking                        0
Medical issue                  0
Ongoing medication             0
Smart device before bed        0
Average screen time            0
Blue-light filter              0
Discomfort Eye-strain          0
Redness in eye                 0
Itchiness/Irritation in eye    0
Dry Eye Disease                0
dtype: int64

In [12]:
df.duplicated().sum()

np.int64(0)

In [13]:
df.nunique()

Gender                            2
Age                              28
Sleep duration                   61
Sleep quality                     5
Stress level                      5
Blood pressure                 1581
Heart rate                       41
Daily steps                      20
Physical activity               181
Height                           51
Weight                           51
Sleep disorder                    2
Wake up during night              2
Feel sleepy during day            2
Caffeine consumption              2
Alcohol consumption               2
Smoking                           2
Medical issue                     2
Ongoing medication                2
Smart device before bed           2
Average screen time              91
Blue-light filter                 2
Discomfort Eye-strain             2
Redness in eye                    2
Itchiness/Irritation in eye       2
Dry Eye Disease                   2
dtype: int64

## Data Preprocessing

In [14]:
# Label encoding

from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoder for inverse transformation if needed

print(df)

       Gender  Age  Sleep duration  Sleep quality  Stress level  \
0           0   24             9.5              2             1   
1           1   39             9.6              2             3   
2           0   45             5.4              1             5   
3           0   45             5.4              4             5   
4           0   42             5.7              3             2   
...       ...  ...             ...            ...           ...   
19995       1   26             9.4              2             3   
19996       0   39             7.3              3             4   
19997       0   20             8.0              5             3   
19998       1   38             4.5              3             3   
19999       1   37             5.0              5             3   

       Blood pressure  Heart rate  Daily steps  Physical activity  Height  \
0                1176          67         3000                 31     161   
1                 252          60        

In [16]:
y=df["Dry Eye Disease"]
X=df.drop("Dry Eye Disease",axis=1)

In [17]:
# Train test and split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Building

In [18]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

In [19]:
logistic_clf = LogisticRegression()
ridge_clf = RidgeClassifier()
xgboost_clf = XGBClassifier()
random_forest_clf = RandomForestClassifier()
ada_boost_clf = AdaBoostClassifier()
grad_boost_clf = GradientBoostingClassifier()
bagging_clf = BaggingClassifier()
decision_tree_clf = DecisionTreeClassifier()
svm_clf = SVC()

In [20]:
model_li = [logistic_clf, ridge_clf, xgboost_clf, random_forest_clf, 
            ada_boost_clf, grad_boost_clf, bagging_clf, decision_tree_clf, svm_clf]

scores = []
for model in model_li:
    model.fit(X_train, y_train)
    scores.append(accuracy_score(y_test, model.predict(X_test)))

print(scores)

[0.67325, 0.68775, 0.655, 0.69675, 0.6935, 0.701, 0.6085, 0.56475, 0.67325]
