# Loading The Data Sets


In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import missingno as msno

In [None]:
# Cite: https://www.kaggle.com/code/cchangyyy/0-490-notebook


# Processes a parquet file
def process_file(filename, dirname):
    """Summarise one time-series parquet partition.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step``
    column and flattens the ``describe()`` table of the remaining columns
    into a 1-D array.

    Returns:
        A ``(stats, identifier)`` tuple, where ``identifier`` is the second
        ``=``-separated field of ``filename`` (``"id=abc"`` -> ``"abc"``).
    """
    parquet_path = os.path.join(dirname, filename, "part-0.parquet")
    series = pd.read_parquet(parquet_path).drop(columns="step")
    flat_stats = series.describe().to_numpy().ravel()
    identifier = filename.split("=")[1]
    return flat_stats, identifier


def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per partition directory in ``dirname``.

    Each sub-directory of ``dirname`` is handed to ``process_file`` on a thread
    pool. The flattened statistics become columns ``stat_0 .. stat_{k-1}`` and
    the identifier returned by ``process_file`` is stored in an ``id`` column.

    Args:
        dirname: Directory containing one sub-directory per time series.

    Returns:
        DataFrame with the ``stat_*`` columns plus ``id``. When ``dirname``
        holds no sub-directories the frame is empty and only has ``id``.
    """
    # os.listdir returns entries in arbitrary order, so sort for a reproducible
    # row order. Stray files (e.g. .DS_Store) are not partitions and are skipped.
    ids = sorted(
        entry
        for entry in os.listdir(dirname)
        if os.path.isdir(os.path.join(dirname, entry))
    )
    if not ids:
        # Nothing to summarise: unpacking zip(*[]) below would raise ValueError.
        return pd.DataFrame(columns=["id"])

    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so results line up with `ids`.
        results = list(
            tqdm(
                executor.map(lambda fname: process_file(fname, dirname), ids),
                total=len(ids),
            )
        )
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df["id"] = indexes
    return df

In [None]:
# Load the train/test CSVs and the per-series summary statistics, then restrict
# train to the columns test also has (plus the `sii` target) and report any
# remaining column differences.

# CP: Use the local copy when it exists, otherwise fall back to the Kaggle input dir
data_dir = (
    "kaggle_data"
    if os.path.exists("kaggle_data")
    else "/kaggle/input/child-mind-institute-problematic-internet-use"
)

train_data = pd.read_csv(f"{data_dir}/train.csv")
test_data = pd.read_csv(f"{data_dir}/test.csv")
train_ts = load_time_series(f"{data_dir}/series_train.parquet")
test_ts = load_time_series(f"{data_dir}/series_test.parquet")

# Keep only the columns test provides, then re-attach the target that the
# column selection just removed.
column_names = list(test_data.columns)
target = train_data["sii"]
train_data = train_data.reindex(columns=column_names)
train_data["sii"] = target

print(train_data.columns.difference(test_data.columns))
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Cite: https://www.kaggle.com/code/cchangyyy/0-490-notebook

# Every column of the summary frame except the join key is a time-series feature
time_series_cols = [col for col in train_ts.columns if col != "id"]

# Left joins keep every csv row; rows without a matching series get NaN stat_* values
train_data = train_data.merge(train_ts, how="left", on="id")
test_data = test_data.merge(test_ts, how="left", on="id")

# The join key is not a feature; only train drops it here
train_data = train_data.drop(columns="id")

In [None]:
# # Make a copy (comment on/off)
# train_data_copy = train_data.copy()
# test_data_copy = test_data.copy()

In [None]:
# Remove pandas' row-display cap so long Series/DataFrames are shown in full
pd.options.display.max_rows = None

# Data Visualization


In [None]:
# Histogram (four bins) of the non-missing target values
fig, ax = plt.subplots()
ax.hist(train_data["sii"].dropna(), bins=4, edgecolor="k")
ax.set_title("Distribution of SII")
ax.set_xlabel("SII")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# How much of the target column is missing?

sii_column = train_data["sii"]
total_rows = len(train_data)
sii_missing = sii_column.isna().sum()
sii_val = sii_column.count()  # number of non-null targets

print(f"Total number of SII values: {sii_val}")
print(f"Total number of missing SII values: {sii_missing}")
print(f"Percentage of missing SII values: {sii_missing/total_rows*100:.2f}%")

In [None]:
# Missing csv data: every column that is not a `stat_*` time-series summary
csv_part = train_data.loc[:, ~train_data.columns.str.startswith("stat_")]
missing_per_csv_column = csv_part.isnull().sum()
missing_csv = missing_per_csv_column.sum()
print("Total missing csv data: ", missing_csv)
# Divide by the csv subset's own cell count: train_data.size also counts the
# stat_* cells and would understate the share of csv data that is missing.
print("Percentage of missing csv data: ", missing_csv / csv_part.size * 100)
# Last expression: per-column missing counts, worst first
missing_per_csv_column.sort_values(ascending=False)

In [None]:
# Missing parquet data: only the `stat_*` time-series summary columns
parquet_part = train_data.loc[:, train_data.columns.str.startswith("stat_")]
missing_per_parquet_column = parquet_part.isnull().sum()
missing_parquet = missing_per_parquet_column.sum()
print("Total missing parquet data: ", missing_parquet)
# Divide by the stat_* subset's own cell count: train_data.size also counts the
# csv cells and would understate the share of parquet data that is missing.
print(
    "Percentage of missing parquet data: ",
    missing_parquet / parquet_part.size * 100,
)
# Last expression: per-column missing counts, worst first
missing_per_parquet_column.sort_values(ascending=False)

In [None]:
# Split the columns by dtype: 64-bit int/float columns vs. object columns
df_num = train_data.select_dtypes(include=[np.int64, np.float64])
df_cat = train_data.select_dtypes(include=object)

In [None]:
# One figure per numeric column: a count plot when the column takes exactly two
# distinct non-null values, a histogram otherwise
for col_name, column in df_num.items():
    is_binary = column.nunique() == 2
    if is_binary:
        sns.countplot(x=column)
        plot_title = f"Binary Distribution of {col_name}"
    else:
        plt.hist(column)
        plot_title = f"Distribution of {col_name}"
    plt.title(plot_title)
    plt.show()

In [None]:
# Bar chart of value frequencies for each object-typed column
for col_name in df_cat.columns:
    counts = df_cat[col_name].value_counts()
    ax = sns.barplot(x=counts.index, y=counts)
    ax.set_title(col_name)
    plt.show()

# Data Preprocessing


In [None]:
# Keep the test ids for the submission file, then drop the column from the features
# (train's 'id' was already dropped right after the time-series merge)
ids = test_data["id"]
test_data = test_data.drop("id", axis=1)

# Apply one-hot encoding to categorical columns
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Rows without a target cannot be used for supervised training. Remove them BEFORE
# the blanket fillna(0) below, otherwise every missing 'sii' silently becomes class 0.
train_data = train_data.dropna(subset=["sii"]).reset_index(drop=True)

# Align train and test datasets to ensure they have the same columns
# Missing columns will be added with NaN values
train_data, test_data = train_data.align(test_data, join="outer", axis=1)

# Fill missing values with 0 (covers both original NaNs and columns added by align)
train_data = train_data.fillna(value=0)
test_data = test_data.fillna(value=0)

# Print the shapes of the datasets to verify alignment
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Display dataset info to inspect column data types and memory usage.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which also printed "None").
print("\nTrain data info:")
train_data.info()
print("\nTest data info:")
test_data.info()

# Check for any column differences
difference = train_data.columns.difference(test_data.columns)
if difference.empty:
    print("No column differences between train and test datasets.")
else:
    print("Column differences found:", difference)

# Remove target column 'sii' from test dataset (the outer align added it there);
# it is not available for predictions
test_data = test_data.drop(columns=["sii"], errors="ignore")

# Display final column details
print("\nFinal train columns:", train_data.columns)
print("\nFinal test columns:", test_data.columns)

# Random Forest Model Predictions


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import numpy as np
import pandas as pd

# Prepare features and target
X = train_data.drop(columns=["sii"])
y = train_data["sii"]

# Hold out the evaluation rows FIRST. Scaling and SMOTE are fitted on the training
# portion only; fitting them on all rows lets information about the evaluation rows
# (and synthetic samples interpolated from them) leak into training and inflates
# every score below.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Scale features (statistics learned from the training portion only)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Apply SMOTE for class imbalance, to the training portion only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize Random Forest with adjusted parameters
RFC = RandomForestClassifier(
    n_estimators=150,  # Reduced number of trees for faster evaluation
    max_depth=10,  # Lower depth for simpler trees
    min_samples_split=10,  # Increase minimum samples required to split
    min_samples_leaf=5,  # Require more samples in leaf nodes
    class_weight="balanced",  # Emphasize minority classes
    random_state=42,
)

# Fit the model
RFC.fit(X_train, y_train)

# Predict on train and test data
y_pred_test = RFC.predict(X_test)
y_pred_train = RFC.predict(X_train)

# Evaluate model
print(
    "Testing data: Model accuracy score : {0:0.4f}".format(
        accuracy_score(y_test, y_pred_test) * 100
    )
)
print(
    "Training data: Model accuracy score : {0:0.4f}".format(
        accuracy_score(y_train, y_pred_train) * 100
    )
)

# Classification report for detailed metrics
print("\nClassification Report for Testing Data:")
print(classification_report(y_test, y_pred_test))

# Cross-validation scores for model stability. The imblearn pipeline re-fits the
# scaler and SMOTE inside each training fold, so validation folds stay untouched.
# cross_val_score clones the pipeline, so the fitted RFC above is not modified.
cv_pipeline = ImbPipeline(
    steps=[
        ("scale", MinMaxScaler()),
        ("smote", SMOTE(random_state=42)),
        ("model", RFC),
    ]
)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring="accuracy")
print("\nCross-validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy: {0:0.4f}".format(np.mean(cv_scores)))

# Feature Importance
feature_importances = pd.DataFrame(
    {
        "Feature": X.columns,
        "Importance": RFC.feature_importances_,
    }
).sort_values(by="Importance", ascending=False)

print("\nTop Features by Importance:")
print(feature_importances.head(10))

In [None]:
# Rank the features from most to least important according to the fitted forest
columns = train_data.drop(columns=["sii"]).columns

importances = RFC.feature_importances_
indices = np.argsort(importances)[::-1]

# # Summarize feature importances
# print("Feature ranking:")
# for f in range(X_train.shape[1]):
#     print(f"{f + 1}. feature {columns[indices[f]]} ({importances[indices[f]]:.3f})")

# Bar chart of the importances in ranked order, labelled with the feature names
n_features = X_train.shape[1]
bar_positions = range(n_features)
ranked_names = [columns[idx] for idx in indices]

fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(bar_positions, importances[indices], color="r", align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=90)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
# Plot ROC curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

# Binarize the output (one indicator column per class)
roc_classes = [0, 1, 2, 3]
y_train_bin = label_binarize(y_train, classes=roc_classes)
y_test_bin = label_binarize(y_test, classes=roc_classes)

# Use OneVsRestClassifier for multiclass ROC curve. It is fitted on the training
# split but scored on the held-out split: scoring the rows it was fitted on gives
# over-optimistic curves. A separate name is used for the scores so the class
# predictions stored in `y_pred_train` are not overwritten.
classifier = OneVsRestClassifier(RFC)
y_score_test = classifier.fit(X_train, y_train_bin).predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = {}
tpr = {}
roc_auc = {}
for i in range(y_test_bin.shape[1]):
    if np.unique(y_test_bin[:, i]).size < 2:
        # ROC/AUC are undefined when the held-out split has no positives
        # (or no negatives) for this class; roc_auc_score would raise.
        print(f"Skipping ROC for class {i}: only one label value in held-out data")
        continue
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score_test[:, i])
    roc_auc[i] = roc_auc_score(y_test_bin[:, i], y_score_test[:, i])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))
colors = ["aqua", "darkorange", "cornflowerblue", "green"]
for i, color in zip(range(y_test_bin.shape[1]), colors):
    if i not in fpr:
        continue
    plt.plot(
        fpr[i],
        tpr[i],
        color=color,
        lw=2,
        label=f"ROC curve of class {i} (area = {roc_auc[i]:.2f})",
    )

# Diagonal = chance level
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn import metrics

# Plot confusion matrix
class_labels = [0, 1, 2, 3]
y_pred = RFC.predict(X_test)
# Pin the label set so the matrix is always 4x4 and its rows/columns line up with
# the tick labels even if a class never occurs in y_test or y_pred.
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Random Forest Predictions On Test Set


In [None]:
# Scale the competition test features with the scaler that was already fitted for
# the Random Forest. Use transform(), not fit_transform(): re-fitting on the test
# rows would map them with different min/max values than the model was trained on.
X = scaler.transform(test_data)
y_pred = RFC.predict(X)

# One row per test id with the predicted class as an integer
submission = pd.DataFrame({"id": ids, "sii": y_pred.astype(int)})
print(submission)

submission.to_csv("submission.csv", index=False)
print("Submission file created.")

# Logistic Regression Model Predictions


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model on the scaled features (no resampling in this cell).
# NOTE: this cell rebinds `scaler`, `X_train`, `X_test`, `y_train` and `y_test`,
# which the Random Forest cells also use.
X = train_data.drop(columns=["sii"])
y = train_data["sii"]

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Raise the iteration cap above scikit-learn's default of 100 so the solver has
# room to converge instead of stopping early with a ConvergenceWarning.
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train, y_train)

y_pred_test = LR.predict(X_test)
# Training accuracy must come from the logistic regression itself; this line
# previously called RFC.predict and so reported the Random Forest's accuracy.
y_pred_train = LR.predict(X_train)

print(
    "Testing data accuracy: {0:0.4f}".format(accuracy_score(y_test, y_pred_test) * 100)
)
print(
    "Training data accuracy: {0:0.4f}".format(
        accuracy_score(y_train, y_pred_train) * 100
    )
)

# Logistic Regression Predictions On Test Set


In [None]:
# Predicting on test data
# X = test_data
# X = scaler.fit_transform(X)
# y_pred = LR.predict(X)

# submission = pd.DataFrame({
#     'id': ids,
#     'sii': y_pred.astype(int)
# })

# #saving to CSV
# submission.to_csv('submission.csv', index=False)
# print(submission)