# 1. Problem Definition and Dataset Selection

Problem: To predict the addiction level of a person to social media.

In [2]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd

# Load data
filename = 'Time-Wasters on Social Media.csv'
data = read_csv(filename)

# 2. Data Preprocessing and Feature Engineering

Remove Irrelevant Features

In [3]:
# Define preprocessing functions
def drop_columns(data):
    return data.drop(columns=["UserID", "Video ID"])

Feature Transformation Tasks:

1) Transform 'Watch Time'

2) Bin the target variable 'Addiction Level'

Transform Watch Time into 2 features. Hour and Time Period

In [4]:
def convert_to_24_hour(data):
    data = data.copy()
    data["Hour"] = pd.to_datetime(data["Watch Time"], format="%I:%M %p").dt.hour
    data = data.drop(columns=["Watch Time"])
    return data

In [5]:
def add_time_period(data):
    def time_period(hour):
        if 6 <= hour < 12:
            return "Morning"
        elif 12 <= hour < 18:
            return "Afternoon"
        elif 18 <= hour < 24:
            return "Evening"
        else:
            return "Night"
    data["Time Period"] = data["Hour"].apply(time_period)
    return data

Feature Transformation - binning the target variable

In [6]:
def bin_addiction_level(data):
    bins = [-1, 2, 5, 7]
    labels = ["Low", "Moderate", "High"]
    data["Addiction Level Binned"] = pd.cut(data["Addiction Level"], bins=bins, labels=labels)
    return data

split into numerical and categorical features

In [7]:
def split_features(data):
    """Define numerical and categorical feature sets."""
    numerical_features = ['Age', 'Income', 'Total Time Spent', 'Number of Sessions',
                          'Video Length', 'Engagement', 'Importance Score', 'Time Spent On Video',
                          'Number of Videos Watched', 'Scroll Rate', 'ProductivityLoss',
                          'Satisfaction', 'Self Control', 'Hour']

    categorical_features = ['Gender', 'Location', 'Debt', 'Owns Property',
                            'Profession', 'Demographics', 'Platform', 'Video Category', 'Watch Reason',
                            'DeviceType', 'OS', 'CurrentActivity',
                            'ConnectionType', 'Time Period', 'Addiction Level Binned']

    return numerical_features, categorical_features

In [8]:
def convert_booleans_to_category(data):
    """Convert boolean columns to 'category' dtype."""
    data = data.copy()
    boolean_columns = ['Debt', 'Owns Property']
    for column in boolean_columns:
        data[column] = data[column].astype('category')
    return data

Standardization for numerical features

In [9]:
from sklearn.preprocessing import MinMaxScaler

def scale_numerical_features(data):
    """Scale numerical features using Min-Max Scaling."""
    data = data.copy()  # Ensure original data is not modified
    numerical_features, _ = split_features(data)  # Get numerical features
    scaler = MinMaxScaler()  # Initialize Min-Max Scaler
    data[numerical_features] = scaler.fit_transform(data[numerical_features])  # Scale the features
    return data


Add Preprocessing and feature engineering steps into a pipeline

In [10]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Preprocessing pipeline
pipeline = Pipeline([
    ('drop_columns', FunctionTransformer(drop_columns, validate=False)),
    ('convert_to_24_hour', FunctionTransformer(convert_to_24_hour, validate=False)),
    ('add_time_period', FunctionTransformer(add_time_period, validate=False)),
    ('bin_addiction_level', FunctionTransformer(bin_addiction_level, validate=False)),
    ('scale_numerical_features', FunctionTransformer(scale_numerical_features, validate=False)),
])

# Apply the preprocessing pipeline to the data
data = pipeline.fit_transform(data)

pipeline

Feature Selection

In [11]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [12]:
from catboost import CatBoostClassifier  # Or CatBoostRegressor for regression tasks

X = data.drop(columns=['Addiction Level Binned'])  # Features
y = data['Addiction Level Binned']  # Target variable

# Identify categorical columns (you can customize this based on your data)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Initialize CatBoostClassifier (use CatBoostRegressor for regression tasks)
model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, cat_features=categorical_cols, verbose=0)

# Fit the model
model.fit(X, y)

# Get feature importances
feature_importances = model.get_feature_importance()

# Create a DataFrame to display features and their importances
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df)

threshold = 0.1
selected_features = feature_importance_df[feature_importance_df['Importance'] > threshold]

print("\nSelected Features:")
print(selected_features)


                     Feature  Importance
24              Self Control   40.043899
19          ProductivityLoss   21.340283
20              Satisfaction   19.394508
25           Addiction Level   16.762343
29               Time Period    0.357536
18                 Frequency    0.324324
26           CurrentActivity    0.316693
28                      Hour    0.282673
21              Watch Reason    0.199461
15       Time Spent On Video    0.140099
17               Scroll Rate    0.135851
10        Number of Sessions    0.121669
0                        Age    0.115364
8                   Platform    0.090354
3                     Income    0.068189
13                Engagement    0.056251
12              Video Length    0.054526
1                     Gender    0.049831
14          Importance Score    0.048004
16  Number of Videos Watched    0.029341
6                 Profession    0.022836
22                DeviceType    0.014692
9           Total Time Spent    0.013142
4               

Drop Hour as Time Period is deem more important out of the 2 engineered features

In [13]:
selected_features

Unnamed: 0,Feature,Importance
24,Self Control,40.043899
19,ProductivityLoss,21.340283
20,Satisfaction,19.394508
25,Addiction Level,16.762343
29,Time Period,0.357536
18,Frequency,0.324324
26,CurrentActivity,0.316693
28,Hour,0.282673
21,Watch Reason,0.199461
15,Time Spent On Video,0.140099


In [14]:
# Drop the row where 'Feature' is 'Hour'
selected_features = selected_features[selected_features['Feature'] != 'Hour']

# Display the updated DataFrame
selected_features

Unnamed: 0,Feature,Importance
24,Self Control,40.043899
19,ProductivityLoss,21.340283
20,Satisfaction,19.394508
25,Addiction Level,16.762343
29,Time Period,0.357536
18,Frequency,0.324324
26,CurrentActivity,0.316693
21,Watch Reason,0.199461
15,Time Spent On Video,0.140099
17,Scroll Rate,0.135851


Encoding categorical features using one-hot encoding

In [15]:
# Select only categorical or object columns from selected_features
categorical_columns = selected_features.select_dtypes(include=['object', 'category']).columns.tolist()

# Apply one-hot encoding to the categorical columns
selected_features_encoded = pd.get_dummies(selected_features, columns=categorical_columns)

# Display the encoded features
print(selected_features_encoded.head())


    Importance  Feature_Addiction Level  Feature_Age  Feature_CurrentActivity  \
24   40.043899                    False        False                    False   
19   21.340283                    False        False                    False   
20   19.394508                    False        False                    False   
25   16.762343                     True        False                    False   
29    0.357536                    False        False                    False   

    Feature_Frequency  Feature_Number of Sessions  Feature_ProductivityLoss  \
24              False                       False                     False   
19              False                       False                      True   
20              False                       False                     False   
25              False                       False                     False   
29              False                       False                     False   

    Feature_Satisfaction  Feature_Scro

# 3. Baseline Models

Logistic Regression

In [17]:
from sklearn.model_selection import train_test_split
# Assuming 'X' contains your features and 'y' contains your target variable
# from the feature selection step
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed

# Logistic Regression
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
print("\nLogistic Regression Results:")
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))

ValueError: could not convert string to float: 'Male'

Linear Discriminant Analysis

In [18]:
# Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
y_pred_lda = lda_model.predict(X_test)
print("\nLinear Discriminant Analysis Results:")
print("Classification Report:")
print(classification_report(y_test, y_pred_lda))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lda))

ValueError: could not convert string to float: 'Male'

K-Nearest Neighbors

In [None]:
# k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
print("\nk-Nearest Neighbors Results:")
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))