# Child Mind Institute — Problematic Internet Use

## Team notes & log

🔔 <b>REMINDER:</b> Make sure to use GitHub pull before starting work and push when finishing.

<u><b>To Do</b></u>

- Feature engineering
- Include parquet data
- CP: Start training the model (Random Forest).
- AG: Visualization

<u><b>Worklog</b></u>

- CP 11/3: Cleaned up some repetitive sections.
- CP 11/5: Merged notebooks with AG & CP's work.
- AG 11/7: Drop unnecessary columns. Dropped a few of them.
- CP 11/11: Filled missing values using KNN, started training the model using Random Forest, calculated scores.
- CP 11/17: Fixed submission error. Result: 0.
- CP 11/17: Imported parquet data.

<u><b>Submissions and Results</b></u>

- CP 11/17: Random Forest test submission: 0


## Preprocessing


### CSV Data


In [None]:
# Import libraries

# CP
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os
from tqdm import tqdm

#### AG 11/2/24 ###
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# CP: The following code is from Kaggle notebook:
# https://www.kaggle.com/code/cchangyyy/0-490-notebook

# Processes a parquet file


def process_file(filename, dirname):
    """Summarise one time-series parquet file as a flat vector of statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, removes the ``step``
    column and flattens the ``DataFrame.describe()`` table of the remaining
    columns into a one-dimensional array.

    Args:
        filename: Name of the directory entry holding ``part-0.parquet``.
            It must contain ``=``; the second ``=``-separated field is
            returned as the identifier.
        dirname: Directory that contains ``filename``.

    Returns:
        A ``(stats, identifier)`` tuple with the flattened describe() values
        and the identifier taken from ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, "part-0.parquet")
    series = pd.read_parquet(parquet_path)
    series.drop("step", axis=1, inplace=True)

    flat_stats = series.describe().values.reshape(-1)
    identifier = filename.split("=")[1]
    return flat_stats, identifier


def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per time-series folder in ``dirname``.

    Every directory entry whose name contains ``=`` (e.g. ``id=abc``) is
    handed to ``process_file`` on a thread pool; the flattened statistics
    become the columns ``stat_0 .. stat_{k-1}`` and the identifier parsed
    from the entry name becomes the ``id`` column.

    Args:
        dirname: Directory holding one sub-folder per time series.

    Returns:
        A DataFrame with the ``stat_*`` columns plus ``id``. When ``dirname``
        contains no usable entries, an empty DataFrame with only the ``id``
        column is returned so that a later merge on ``id`` still works.
    """
    # Entries without "=" (for example ".DS_Store" when running locally)
    # cannot be split into an id by process_file, so skip them instead of
    # failing with an IndexError.
    ids = [entry for entry in os.listdir(dirname) if "=" in entry]

    # Nothing to process: zip(*results) and stats[0] below would fail on an
    # empty result list.
    if not ids:
        return pd.DataFrame({"id": pd.Series(dtype="object")})

    with ThreadPoolExecutor() as executor:
        results = list(
            tqdm(
                executor.map(lambda fname: process_file(fname, dirname), ids),
                total=len(ids),
            )
        )

    # Split the (stats, id) pairs into two parallel tuples.
    stats, indexes = zip(*results)

    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df["id"] = indexes
    return df

In [None]:
# CP: Load data
# CP: Use the local "kaggle_data" folder when it exists (running locally),
# otherwise fall back to the Kaggle input directory (running in Kaggle).
if os.path.exists("kaggle_data"):
    data_dir = "kaggle_data"
else:
    data_dir = "/kaggle/input/child-mind-institute-problematic-internet-use"

# CP: Tabular CSV files
train = pd.read_csv(f"{data_dir}/train.csv")
test = pd.read_csv(f"{data_dir}/test.csv")
data_dict = pd.read_csv(f"{data_dir}/data_dictionary.csv")

# CP: Per-series summary statistics computed from the parquet folders
train_ts = load_time_series(f"{data_dir}/series_train.parquet")
test_ts = load_time_series(f"{data_dir}/series_test.parquet")

# CP: Show all rows when displaying data
pd.set_option("display.max_rows", None)

# CP: Display data dictionary
data_dict

In [None]:
# CP: The following code is from Kaggle notebook:
# https://www.kaggle.com/code/cchangyyy/0-490-notebook

# Names of the stat_* columns produced by load_time_series (everything but "id").
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

# Left join: rows without a matching time-series entry are kept and get NaN
# in the stat_* columns.
train = pd.merge(train, train_ts, how="left", on="id")
test = pd.merge(test, test_ts, how="left", on="id")

# Keep the test ids before "id" is removed from the feature frames: the
# submission file needs one id per prediction, and every later copy of
# `test` is taken after the column has already been dropped.
test_ids = test["id"].copy()

train = train.drop("id", axis=1)
test = test.drop("id", axis=1)

In [None]:
#### AG 11/2/24 ###
train.head(20)

In [None]:
# There are null values in every column except for "age" , "sex", and "basic season"
# sii: Severity Impairment Index: Range (0-3)
train.describe().transpose()

In [None]:
####AG###

#### Notes ####

## Children Global Assessment Scale(CGAS) score is used to check the general
# functioning level of children typically in the range of 1-100
# CGAS score has max value 999 which might be an outlier.

##PCIAT-PCIAT_Total = Parent-Child Internet Addiction Test
# the score reflects addiction or PIU, which might be 0 if no problematic behavior is found
##SDS-SDS_Total_Raw: Self-Rating Depression , raw scores can range from: 20-80
# min for this is 38 in the table which means even lowest
# scoring children may be experiencing mild depressive symptoms
##PreInt_EduHx-computerinternet_hoursday : avg number of hours per day spent using internet

In [None]:
####AG###

## Report of the extensive missing values ##

# Fitness_Endurance-Max_Stage: 743 non-null (81% missing)
# Fitness_Endurance-Time_Mins: 740 non-null (81% missing)
# Fitness_Endurance-Time_Sec: 740 non-null (81% missing)
# Physical-Waist_Circumference: 898 non-null (77% missing)
# BIA-Activity_Level_num: 1991 non-null (50% missing)
# BIA-BMC: 1991 non-null (50% missing)
# BIA-BMI: 1991 non-null (50% missing)
# BIA-BMR: 1991 non-null (50% missing)
# BIA-DEE: 1991 non-null (50% missing)
# BIA-ECW: 1991 non-null (50% missing)
# PAQ_A-Season: 475 non-null (88% missing)
# PAQ_A-PAQ_A_Total: 475 non-null (88% missing)
# PAQ_C-Season: 1721 non-null (57% missing)
# PAQ_C-PAQ_C_Total: 1721 non-null (57% missing)
# PCIAT-Season: 2736 non-null (31% missing) with multiple related PCIAT columns also having missing data.
# SDS-Season: 2618 non-null (34% missing)
# SDS-SDS_Total_Raw: 2609 non-null (35% missing)
# SDS-SDS_Total_T: 2606 non-null (35% missing)

# CP: Check for missing/null values
train.isnull().sum()

In [None]:
# CP: Explore data
train.info()

In [None]:
# CP: Explore data
train.shape

In [None]:
# CP: Check target values
train["sii"].value_counts()

In [None]:
# CP: Check missing target values
train["sii"].isnull().sum()

In [None]:
# CP: Drop any rows where target value is missing
# since they cannot be used for training.
train.dropna(subset=["sii"], inplace=True)

In [None]:
# CP: Recheck missing target values
train["sii"].isnull().sum()

In [None]:
# CP: Check for duplicates
train.duplicated().sum()

In [None]:
train.shape

In [None]:
# 11/7/24 AG ###
# Percentage of missing values per column, then keep only the columns above
# the cut-off.
# CP: Cut-off is 64% (originally 50%) due to parquet data
missing_percentage = train.isna().mean().mul(100)
columns_with_missing_data = missing_percentage.loc[missing_percentage.gt(64)]
columns_with_missing_data

In [None]:
# Correlation of every numeric (float64 / int64) column with the target sii
numerical_columns = train.select_dtypes(include=["float64", "int64"]).columns

# Leave sii itself out so the target is not correlated with itself
numerical_columns = numerical_columns[~numerical_columns.isin(["sii"])]
correlation = train.loc[:, numerical_columns].corrwith(train["sii"])

correlation

In [None]:
train.isnull().sum()

In [None]:
# Columns to inspect: how much is missing, and how they relate to sii
columns_of_interest = [
    "PAQ_A-Season",
    "PAQ_A-PAQ_A_Total",
    "Physical-Waist_Circumference",
    "Fitness_Endurance-Max_Stage",
    "Fitness_Endurance-Time_Mins",
    "Fitness_Endurance-Time_Sec",
]

# Share of missing values (in percent) for each inspected column
missing_percentage = train[columns_of_interest].isna().mean().mul(100)
print("Missing percentage:", missing_percentage, sep="\n")

# Only the float64 / int64 columns of the selection can be correlated
numerical_columns = (
    train[columns_of_interest].select_dtypes(include=["float64", "int64"]).columns
)

# Correlation of those numeric columns with the target variable 'sii'
correlation_with_sii = train.loc[:, numerical_columns].corrwith(train["sii"])
print("\nCorrelation with sii:", correlation_with_sii, sep="\n")

In [None]:
# creating a backup
backup_train = train.copy()

In [None]:
# # Dropping columns that have null values more than 50% and weak sii connection
# train.drop(columns=columns_of_interest, inplace=True)
# test.drop(columns=columns_of_interest, inplace=True)
# train.isnull().sum()

In [None]:
# train = train.fillna(0) # replacing null values with 0 for uniformity

In [None]:
# 11/9/24   AG ####

# Further cleaning process ##

# Fitness_Endurance-Season (1,476 missing)
# FGC-FGC_GSND (1,864 missing)
# FGC-FGC_GSND_Zone (1,872 missing)
# FGC-FGC_GSD (1,865 missing)
# FGC-FGC_GSD_Zone (1,872 missing)
# PAQ_C-Season (1,296 missing)
# PAQ_C-PAQ_C_Total (1,296 missing)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the two season columns into integer codes. The values are cast to str
# first, so missing entries become the string "nan" and receive a code of
# their own.
for season_column in ("Fitness_Endurance-Season", "PAQ_C-Season"):
    season_as_text = train[season_column].astype(str)
    train[season_column] = LabelEncoder().fit_transform(season_as_text)

# Features whose relationship with sii is being checked
selected_features = [
    "Fitness_Endurance-Season",
    "FGC-FGC_GSND",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD_Zone",
    "FGC-FGC_GSD",
    "PAQ_C-Season",
    "PAQ_C-PAQ_C_Total",
]

correlation = train.loc[:, selected_features].corrwith(train["sii"])

print("\nCorrelation with sii:", correlation, sep="\n")

In [None]:
# Notes #

# Fitness_Endurance-Season
# Meaning: Measures endurance levels during a specific season (e.g., Fall, Winter).
# Relation to PIU: People who exercise more or have better endurance might spend less time on the internet. However, this feature doesn’t seem to impact PIU much.
# Missing Values: 1,476
# Correlation: -0.097 (weak negative relationship) - so decided to drop it


# FGC-FGC_GSND
# Meaning: Some specific fitness or physical activity score (GSND).
# Relation to PIU: Higher physical fitness might indicate better lifestyle habits and less internet use.
# Missing Values: 1,864
# Correlation: 0.258 (moderate positive relationship)- Keep it

# FGC-FGC_GSND_Zone
# Meaning: A category or performance zone for the GSND score (e.g., low, medium, high).
# Relation to PIU: Similar to FGC-FGC_GSND, categorizing fitness levels may help identify those at risk of PIU.
# Missing Values: 1,872
# Correlation: 0.226 (moderate positive relationship)
# overlap with FGC-FGC_GSND?? Not sure

# FGC-FGC_GSD
# Meaning: Another specific physical fitness score (GSD).
# Relation to PIU: Indicates general fitness levels, which might help reduce internet use.
# Missing Values: 1,865
# Correlation: 0.261 (moderate positive relationship)
# Keeping it

# FGC-FGC_GSD_Zone
# Meaning: Zone or category for GSD (e.g., fitness performance level).
# Relation to PIU: Helps categorize fitness levels and may highlight patterns in PIU.
# Missing Values: 1,872
# Correlation: 0.229 (moderate positive relationship)
# Similar to FGC-FGC_GSD. could overlap?

# PAQ_C-Season
# Meaning: Seasonal data from a physical activity questionnaire (PAQ).
# Relation to PIU: Physical activity might vary with seasons, which could affect internet habits.
# Missing Values: 1,296
# Correlation: 0.085 (weak positive relationship)
#

# PAQ_C-PAQ_C_Total
# Meaning: Total score from the PAQ, summarizing physical activity levels.
# Relation to PIU: Higher physical activity may reduce time spent online, improving lifestyle balance.
# Missing Values: 1,296
# Correlation: 0.076 (weak positive relationship)
# Weak relation to PIU.

In [None]:
# Remove the columns judged too weakly related to sii, from both frames so
# train and test keep the same feature set.
columns_to_drop = ["Fitness_Endurance-Season", "PAQ_C-Season", "PAQ_C-PAQ_C_Total"]
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line.
train.info()

In [None]:
# Pairwise correlation between features that might overlap with each other

overlap_features = [
    "FGC-FGC_GSND",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD",
    "FGC-FGC_GSD_Zone",
]

correlation_matrix = train.loc[:, overlap_features].corr()
print("Correlation Matrix:", correlation_matrix, sep="\n")

In [None]:
# Can see a high correlation
# FGC-FGC_GSND and FGC-FGC_GSD are providing nearly the same information.
# Similarly, FGC-FGC_GSND_Zone and FGC-FGC_GSD_Zone

# train = train.drop(columns=['FGC-FGC_GSND','FGC-FGC_GSND_Zone'])

In [None]:
# These attributes (the ones with the highest share of null values) had been
# forgotten in the earlier drops; remove them from both frames.
fitness_endurance_columns = [
    "Fitness_Endurance-Max_Stage",
    "Fitness_Endurance-Time_Mins",
    "Fitness_Endurance-Time_Sec",
]

train = train.drop(columns=fitness_endurance_columns)
test = test.drop(columns=fitness_endurance_columns)

In [None]:
train.info()

In [None]:
# train['PAQ_A-Season'].head(50)
train = train.drop(columns=["PAQ_A-Season"])
test = test.drop(columns=["PAQ_A-Season"])

In [None]:
# Working on categorical columns

categorical_columns = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'FGC-Season',
    'BIA-Season',
    'PCIAT-Season',
    'SDS-Season',
    'PreInt_EduHx-Season'
]
# Replace 0 values with Unknown for categorical attributes
#train[categorical_columns] = train[categorical_columns].replace(0, 'Unknown')

train[categorical_columns].head(50)

In [None]:
# train = train.drop(columns=["id"])

In [None]:
# Plotting boxplots for sii against each categorical column

# Derive the subplot grid from the number of columns instead of hard-coding
# 4 x 2: plt.subplot raises ValueError as soon as the subplot index exceeds
# rows * cols, which would happen if categorical_columns ever grows.
grid_cols = 2
grid_rows = max(1, -(-len(categorical_columns) // grid_cols))  # ceiling division

# 6 inches of height per row keeps the original 16 x 24 figure for 8 columns.
plt.figure(figsize=(16, 6 * grid_rows))
for i, col in enumerate(categorical_columns, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.boxplot(x=col, y="sii", data=train)
    plt.xticks(rotation=45)  # season labels overlap when drawn horizontally
    plt.title(f"'sii' vs {col}")
plt.tight_layout()
plt.show()

In [None]:
# CP: Drop every column whose name ends in "Season", in both frames
train = train.drop(
    columns=[col for col in train.columns if str(col).endswith("Season")]
)
test = test.drop(
    columns=[col for col in test.columns if str(col).endswith("Season")]
)
train.info()

In [None]:
train.head(20)

### Parquet data

#### TO DO: Deal with parquet data BEFORE using KNN


In [None]:
# CP: Fill in missing values using KNN
from sklearn.impute import KNNImputer

# NOTE(review): the imputer is fitted on every column of train, which at this
# point still includes the target "sii" - confirm this is intended.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(train)
train_filled = imputer.transform(train)

# The imputer returns a plain array, so rebuild the DataFrame with the
# original column names.
train = pd.DataFrame(train_filled, columns=train.columns)

# Total number of missing values left after imputation
train.isna().sum().sum()

In [None]:
train.head()

In [None]:
train.shape, test.shape

In [None]:
# Give test exactly the columns (and column order) of train; columns that
# test does not have are created and filled with 0.
# NOTE(review): those zero-filled columns are not NaN, so the imputer leaves
# them at 0 - confirm that is the intended placeholder.
test_no_id = test.reindex(columns=train.columns, fill_value=0)

# Keep a copy of test as it was before imputation.
# NOTE(review): "id" was already dropped from test right after the
# time-series merge, so this backup has no "id" column.
test_backup = test.copy()

# Impute test with the imputer that was fitted on train
test_filled = imputer.transform(test_no_id)
test = pd.DataFrame(test_filled, columns=test_no_id.columns)

In [None]:
train.shape, test.shape

## Training the model

Portions of Random Forest code used from class.


In [None]:
# CP: Import the necessary libraries
from sklearn.metrics import roc_auc_score

In [None]:
# Separate the features (X) from the target (y).
# NOTE: per the competition data description, sii is derived from
# PCIAT-PCIAT_Total and the PCIAT-* fields are not provided for the test set
# (test only carries them as the zero-filled placeholders created by the
# reindex above). Keeping them in X leaks the target during validation and
# gives the model features it never sees at prediction time, so they are
# excluded together with sii. If no PCIAT-* column is present this is
# identical to dropping sii alone.
leak_columns = [col for col in train.columns if str(col).startswith("PCIAT-")]
X = train.drop(columns=["sii", *leak_columns]).copy()
y = train['sii'].copy()

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Number of samples and features
num_features = X_train.shape[1]
num_samples = X_train.shape[0]
num_samples, num_features

In [None]:
# Import libraries
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import f1_score

### Pipelines


In [None]:
# Used pipelines to compare models and reduce code
pl_lr = Pipeline([('log_reg', LogisticRegression(solver='liblinear', max_iter=1000))])
pl_rf = Pipeline([('rf', RandomForestClassifier())])
pl_gnb = Pipeline([('gnb', GaussianNB())])
pl_svm = Pipeline([('svm', svm.SVC(kernel='linear'))])

# pipe_dict maps each pipeline's position in `pipelines` to a display name.
pipelines = [pl_lr, pl_rf, pl_gnb, pl_svm]
pipe_dict = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'Naive Bayes', 3: 'SVM'}

# Fit every model on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score every model on the held-out split
for i, model in enumerate(pipelines):
    pred_test = model.predict(X_test)
    # sii has more than two classes (range 0-3), and f1_score's default
    # average="binary" raises ValueError for multiclass targets, so an
    # explicit averaging mode is required. "weighted" accounts for the
    # class imbalance.
    test_f1 = f1_score(y_test, pred_test, average="weighted")
    print("{} Test F1: {}".format(pipe_dict[i], test_f1))
    print(classification_report(y_test, pred_test))
    # Inside a loop the matrix is not auto-displayed by the notebook, so it
    # has to be printed explicitly; previously the result was discarded.
    print(metrics.confusion_matrix(y_test, pred_test))


In [None]:
# clf = RandomForestClassifier(warm_start=True, 
#                                oob_score=True,
#                                min_samples_leaf=40,
#                                max_depth = 10,
#                                random_state=42)

# error_rate = {}

# # Range of n_estimators values to explore.
# min_estimators = 80
# max_estimators = 500

# for i in range(min_estimators, max_estimators + 1):
#     clf.set_params(n_estimators=i) 
#     clf.fit(X_train.values, y_train.values)

#     # Record the OOB error for each `n_estimators=i` setting.
#     oob_error = 1 - clf.oob_score_
#     error_rate[i] = oob_error

In [None]:
# # Generate the "OOB error rate" vs. "n_estimators" plot.
# # OOB error rate = num_missclassified/total observations (%)\
# xs = []
# ys = []
# for label, clf_err in error_rate.items():
#     xs.append(label)
#     ys.append(clf_err)   
# plt.plot(xs, ys)
# plt.xlim(min_estimators, max_estimators)
# plt.xlabel("n_estimators")
# plt.ylabel("OOB error rate")
# plt.show();


In [None]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# # Get scores (accuracy, precision, recall, f1, roc-auc)
# y_pred = clf.predict(X_train.values)
# y_pred_proba = clf.predict_proba(X_train.values)

# accuracy = accuracy_score(y_train, y_pred)
# precision = precision_score(y_train, y_pred, average='weighted', zero_division=1)
# recall = recall_score(y_train, y_pred, average='weighted')
# f1 = f1_score(y_train, y_pred, average='weighted')
# roc_auc = roc_auc_score(y_train, y_pred_proba, multi_class='ovr')

# # Print the scores
# print('Accuracy: ', accuracy)
# print('Precision: ', precision)
# print('Recall: ', recall)
# print('F1: ', f1)
# print('ROC-AUC:', roc_auc)

In [None]:
# from sklearn import metrics
# class_names=[0,1]
# fig, ax = plt.subplots()
# tick_marks = np.arange(len(class_names))
# plt.xticks(tick_marks, class_names)
# plt.yticks(tick_marks, class_names)
# cnf_matrix = metrics.confusion_matrix(y_train, y_pred)
# sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
# ax.xaxis.set_label_position("top")
# plt.tight_layout()
# plt.title('Confusion matrix', y=1.1)
# plt.ylabel('Actual label')
# plt.xlabel('Predicted label')

In [None]:
# # Get feature importances
# columns = train.columns

# importances = clf.feature_importances_
# indices = np.argsort(importances)[::-1]

# # print(importances)
# # print(indices)

# # Summarize feature importances
# print("Feature ranking:")
# for f in range(X_train.shape[1]):
#     print(f"{f + 1}. feature {columns[indices[f]]} ({importances[indices[f]]:.3f})")

# # Plot the feature importances of the forest
# # plt.figure()
# # plt.title("Feature importances")
# # plt.bar(range(X_train.shape[1]), importances[indices], color="r", align="center")
# # plt.xticks(range(X_train.shape[1]), [columns[i] for i in indices], rotation=90)
# # plt.xlim([-1, X_train.shape[1]])
# # plt.show()

In [None]:
# # Make sure train and test sets have the same columns
# train_columns = X_train.columns
# test_columns = test.columns
# missing_columns = set(train_columns) - set(test_columns)
# missing_columns

In [None]:
# # Make sure train and test match
# test_copy = test.reindex(columns=X_train.columns)

In [None]:
# # Generate predictions for the test set
# y_test_pred = clf.predict(test_copy.values)

# # Convert to int
# y_test_pred = y_test_pred.astype(int)

In [None]:
# y_test_pred

In [None]:
# # Export CSV file with predictions
# output = pd.DataFrame({'id': test_backup['id'], 'sii': y_test_pred})
# output.to_csv('submission.csv', index=False)