In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the teen phone-addiction dataset from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="teen_phone_addiction_dataset.csv")

In [5]:
# (rows, columns) of the freshly loaded frame.
df.shape

(3000, 25)

In [6]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,ID,Name,Age,Gender,Location,School_Grade,Daily_Usage_Hours,Sleep_Hours,Academic_Performance,Social_Interactions,...,Screen_Time_Before_Bed,Phone_Checks_Per_Day,Apps_Used_Daily,Time_on_Social_Media,Time_on_Gaming,Time_on_Education,Phone_Usage_Purpose,Family_Communication,Weekend_Usage_Hours,Addiction_Level
0,1,Shannon Francis,13,Female,Hansonfort,9th,4.0,6.1,78,5,...,1.4,86,19,3.6,1.7,1.2,Browsing,4,8.7,10.0
1,2,Scott Rodriguez,17,Female,Theodorefort,7th,5.5,6.5,70,5,...,0.9,96,9,1.1,4.0,1.8,Browsing,2,5.3,10.0
2,3,Adrian Knox,13,Other,Lindseystad,11th,5.8,5.5,93,8,...,0.5,137,8,0.3,1.5,0.4,Education,6,5.7,9.2
3,4,Brittany Hamilton,18,Female,West Anthony,12th,3.1,3.9,78,8,...,1.4,128,7,3.1,1.6,0.8,Social Media,8,3.0,9.8
4,5,Steven Smith,14,Other,Port Lindsaystad,9th,2.5,6.7,56,4,...,1.0,96,20,2.6,0.9,1.1,Gaming,10,3.7,8.6


In [7]:
# Dimensions again; nothing has modified `df` since it was loaded.
df.shape

(3000, 25)

In [8]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      3000 non-null   int64  
 1   Name                    3000 non-null   object 
 2   Age                     3000 non-null   int64  
 3   Gender                  3000 non-null   object 
 4   Location                3000 non-null   object 
 5   School_Grade            3000 non-null   object 
 6   Daily_Usage_Hours       3000 non-null   float64
 7   Sleep_Hours             3000 non-null   float64
 8   Academic_Performance    3000 non-null   int64  
 9   Social_Interactions     3000 non-null   int64  
 10  Exercise_Hours          3000 non-null   float64
 11  Anxiety_Level           3000 non-null   int64  
 12  Depression_Level        3000 non-null   int64  
 13  Self_Esteem             3000 non-null   int64  
 14  Parental_Control        3000 non-null   

In [9]:
# Full list of column names before any columns are dropped.
df.columns

Index(['ID', 'Name', 'Age', 'Gender', 'Location', 'School_Grade',
       'Daily_Usage_Hours', 'Sleep_Hours', 'Academic_Performance',
       'Social_Interactions', 'Exercise_Hours', 'Anxiety_Level',
       'Depression_Level', 'Self_Esteem', 'Parental_Control',
       'Screen_Time_Before_Bed', 'Phone_Checks_Per_Day', 'Apps_Used_Daily',
       'Time_on_Social_Media', 'Time_on_Gaming', 'Time_on_Education',
       'Phone_Usage_Purpose', 'Family_Communication', 'Weekend_Usage_Hours',
       'Addiction_Level'],
      dtype='object')

In [10]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    'ID',
    'Name',
    'Location',
    'School_Grade',
    'Gender',
    'Parental_Control',
    'Family_Communication',
    'Phone_Usage_Purpose'
]

# Guard so the cell is safe to re-run: `df` is rebound below, so after the first
# execution none of these columns exist any more and an unconditional drop would
# raise KeyError. While at least one listed column is still present the drop
# stays strict, so a misspelled name is still reported rather than ignored.
if set(columns_to_drop) & set(df.columns):
    df = df.drop(columns=columns_to_drop)

In [11]:
# Column names remaining after the drop above.
df.columns

Index(['Age', 'Daily_Usage_Hours', 'Sleep_Hours', 'Academic_Performance',
       'Social_Interactions', 'Exercise_Hours', 'Anxiety_Level',
       'Depression_Level', 'Self_Esteem', 'Screen_Time_Before_Bed',
       'Phone_Checks_Per_Day', 'Apps_Used_Daily', 'Time_on_Social_Media',
       'Time_on_Gaming', 'Time_on_Education', 'Weekend_Usage_Hours',
       'Addiction_Level'],
      dtype='object')

In [12]:
# Dimensions after the drop: same row count, eight fewer columns.
df.shape

(3000, 17)

In [13]:
# Distinct values of the Addiction_Level column, in ascending order.
sorted(df["Addiction_Level"].unique())

[np.float64(1.0),
 np.float64(1.4),
 np.float64(2.0),
 np.float64(2.1),
 np.float64(2.2),
 np.float64(2.3),
 np.float64(2.4),
 np.float64(2.6),
 np.float64(2.8),
 np.float64(3.0),
 np.float64(3.1),
 np.float64(3.2),
 np.float64(3.3),
 np.float64(3.4),
 np.float64(3.5),
 np.float64(3.6),
 np.float64(3.7),
 np.float64(3.8),
 np.float64(3.9),
 np.float64(4.0),
 np.float64(4.1),
 np.float64(4.2),
 np.float64(4.3),
 np.float64(4.4),
 np.float64(4.5),
 np.float64(4.6),
 np.float64(4.7),
 np.float64(4.8),
 np.float64(4.9),
 np.float64(5.0),
 np.float64(5.1),
 np.float64(5.2),
 np.float64(5.3),
 np.float64(5.4),
 np.float64(5.5),
 np.float64(5.6),
 np.float64(5.7),
 np.float64(5.8),
 np.float64(5.9),
 np.float64(6.0),
 np.float64(6.1),
 np.float64(6.2),
 np.float64(6.3),
 np.float64(6.4),
 np.float64(6.5),
 np.float64(6.6),
 np.float64(6.7),
 np.float64(6.8),
 np.float64(6.9),
 np.float64(7.0),
 np.float64(7.1),
 np.float64(7.2),
 np.float64(7.3),
 np.float64(7.4),
 np.float64(7.5),
 np.float6

## CREATE TARGET VARIABLE

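This section is where the target variable is prepared. The risk-bucketing code below (Low / Medium / High) is commented out, so `Addiction_Level` is used as-is as a continuous target.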

In [14]:
# def addiction_risk(level):
#     if level <= 5:
#         return 'Low'
#     elif level <= 8:
#         return 'Medium'
#     else:
#         return 'High'

# df['Addiction_Risk'] = df['Addiction_Level'].apply(addiction_risk)
# df['Addiction_Risk'].value_counts()

In [15]:
# df['Addiction_Risk'].value_counts()

In [16]:
# df = df.drop(columns=['Addiction_Level'])

In [17]:
# Preview the first five rows of the reduced frame.
df.head()

Unnamed: 0,Age,Daily_Usage_Hours,Sleep_Hours,Academic_Performance,Social_Interactions,Exercise_Hours,Anxiety_Level,Depression_Level,Self_Esteem,Screen_Time_Before_Bed,Phone_Checks_Per_Day,Apps_Used_Daily,Time_on_Social_Media,Time_on_Gaming,Time_on_Education,Weekend_Usage_Hours,Addiction_Level
0,13,4.0,6.1,78,5,0.1,10,3,8,1.4,86,19,3.6,1.7,1.2,8.7,10.0
1,17,5.5,6.5,70,5,0.0,3,7,3,0.9,96,9,1.1,4.0,1.8,5.3,10.0
2,13,5.8,5.5,93,8,0.8,2,3,10,0.5,137,8,0.3,1.5,0.4,5.7,9.2
3,18,3.1,3.9,78,8,1.6,9,10,3,1.4,128,7,3.1,1.6,0.8,3.0,9.8
4,14,2.5,6.7,56,4,1.1,1,5,1,1.0,96,20,2.6,0.9,1.1,3.7,8.6


In [18]:
# Label-based slice: every row from index label 36 through the end of the frame.
df.loc[36:]

Unnamed: 0,Age,Daily_Usage_Hours,Sleep_Hours,Academic_Performance,Social_Interactions,Exercise_Hours,Anxiety_Level,Depression_Level,Self_Esteem,Screen_Time_Before_Bed,Phone_Checks_Per_Day,Apps_Used_Daily,Time_on_Social_Media,Time_on_Gaming,Time_on_Education,Weekend_Usage_Hours,Addiction_Level
36,13,4.4,6.8,62,8,1.4,2,1,10,1.7,66,7,0.9,0.0,0.0,6.9,4.6
37,13,8.3,6.2,65,9,1.3,5,6,8,1.3,144,5,2.3,0.0,1.5,2.9,10.0
38,16,7.4,6.9,67,0,1.7,6,6,8,1.6,117,16,1.6,0.7,0.7,4.6,10.0
39,16,1.7,6.8,68,9,0.8,5,9,1,0.7,107,18,2.4,0.6,0.0,6.2,7.0
40,13,6.2,8.3,77,1,0.4,7,2,4,1.1,141,5,3.8,1.4,0.9,6.2,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,16,3.9,6.4,53,4,0.9,7,10,2,0.3,80,15,2.7,1.8,1.0,9.4,9.8
2996,13,3.6,7.3,93,5,0.0,8,8,9,0.9,45,8,3.1,0.0,0.3,5.2,5.5
2997,14,3.2,6.5,98,1,0.0,4,3,9,0.2,51,13,2.4,0.2,2.4,5.9,6.2
2998,17,6.7,7.5,67,3,0.2,3,5,9,1.6,125,17,1.7,2.6,1.5,6.1,10.0


In [19]:
# Dimensions check; the target-engineering cells above are commented out, so nothing changed.
df.shape

(3000, 17)

In [None]:
#df["Addiction_Level"].value_counts()

In [21]:
# Count of missing values in each column.
df.isnull().sum()

Age                       0
Daily_Usage_Hours         0
Sleep_Hours               0
Academic_Performance      0
Social_Interactions       0
Exercise_Hours            0
Anxiety_Level             0
Depression_Level          0
Self_Esteem               0
Screen_Time_Before_Bed    0
Phone_Checks_Per_Day      0
Apps_Used_Daily           0
Time_on_Social_Media      0
Time_on_Gaming            0
Time_on_Education         0
Weekend_Usage_Hours       0
Addiction_Level           0
dtype: int64

In [22]:
# df['Addiction_Risk'].isnull().sum()

In [23]:
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()
# df['Addiction_Risk'] = le.fit_transform(df['Addiction_Risk'])

In [24]:
# Separate the regression target from the predictor columns.
target_col = 'Addiction_Level'
y = df[target_col]
X = df.drop(columns=[target_col])

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train, )
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Regressor:")
print("MAE:", mean_absolute_error(y_test, y_pred_xgb))
print("R²:", r2_score(y_test, y_pred_xgb))

# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

XGBoost Regressor:
MAE: 0.3340048146247864
R²: 0.9100409585717811


In [28]:
# from sklearn.metrics import accuracy_score

# accuracy = accuracy_score(y_test, y_pred)
# accuracy

In [29]:
# from sklearn.metrics import confusion_matrix

# confusion_matrix(y_test, y_pred)

In [30]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))

In [31]:
# from sklearn.tree import DecisionTreeClassifier
# dt_model=DecisionTreeClassifier(
#     max_depth=5,
#     random_state=42
# )
# dt_model.fit(X_train,y_train)
# dt_pred=dt_model.predict(X_test)
# dt_pred

In [32]:
# from sklearn.metrics import accuracy_score

# dt_accuracy = accuracy_score(y_test, dt_pred)
# dt_accuracy

In [33]:
# from sklearn.ensemble import RandomForestClassifier
# rf_model = RandomForestClassifier(
#     n_estimators=200,
#     max_depth=10,
#     random_state=42,
#     class_weight="balanced"
# )

# rf_model.fit(X_train, y_train)
# rf_pred=rf_model.predict(X_test)
# rf_accuracy = accuracy_score(y_test, rf_pred)
# rf_accuracy

In [34]:
# model_performance = {
#     'Logistic Regression': accuracy,
#     'Decision Tree': dt_accuracy,
#     'Random Forest': rf_accuracy
# }

# model_performance

In [35]:
# best_model = max(model_performance, key=model_performance.get)
# best_model

In [36]:
import pickle

In [37]:
# Serialize the trained xgb_model to addiction_model.pkl.
with open("addiction_model.pkl", mode="wb") as model_file:
    pickle.dump(xgb_model, model_file)

In [38]:
# Serialize the fitted StandardScaler to scaler.pkl.
with open("scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [39]:
# y.value_counts()