
# Enhanced Data Loading and Processing
This notebook processes datasets stored in `.parquet` format. 
Key enhancements include:
- Robust error handling in the `process_file` function.
- Parallelized data loading for efficiency.
- Visualization of key patterns using heatmaps and distributions.


In [1]:
# Load the libraries

import numpy as np

import pandas as pd

import os

import seaborn as sns

import matplotlib.pyplot as plt

from pandas.plotting import andrews_curves

from sklearn.impute import SimpleImputer

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

In [2]:

def process_file(filename, dirname):
    """
    Load one parquet file and return a mean-imputed copy of its contents.

    The 'step' column is dropped when present. Columns without a single
    observed (non-missing) value are dropped as well, because no mean can be
    computed for them. Every remaining missing value is replaced by the mean
    of its column.

    Parameters:
        filename (str): Name of the file to process.
        dirname (str): Directory containing the file.
    Returns:
        pd.DataFrame | None: The imputed frame with a default integer index,
        or None when the file is missing or any step fails. Failures are
        printed, not raised, so one bad file does not abort a batch.
    """
    try:
        file_path = os.path.join(dirname, filename)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File {file_path} does not exist.")

        df = pd.read_parquet(file_path)

        # Drop unnecessary columns
        if 'step' in df.columns:
            df = df.drop(columns='step')

        # SimpleImputer(strategy='mean') discards columns that have no observed
        # values, which would make its output narrower than df.columns and
        # break the DataFrame construction below. Remove those columns up
        # front so the column labels always match the imputed array.
        has_observed_values = df.notna().any(axis=0).to_numpy()
        df = df.loc[:, has_observed_values]

        # Handle missing values using SimpleImputer
        imputer = SimpleImputer(strategy='mean')
        df_cleaned = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

        return df_cleaned

    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Error processing file {filename}: {e}")
        return None
        

In [None]:
# CP: Load data

# CP: Check if you are running in Kaggle or locally.
# The two environments differ only in where the files live, so the directory
# is chosen once and every load below is shared by both.
if os.path.exists("kaggle_data"):
    # CP: Running locally
    DATA_DIR = "kaggle_data"
else:
    # CP: Running in Kaggle
    DATA_DIR = "/kaggle/input/child-mind-institute-problematic-internet-use"

# Tabular competition files.
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
data_dict = pd.read_csv(f"{DATA_DIR}/data_dictionary.csv")

# Time-series data.
# NOTE(review): load_time_series is not defined in any cell visible in this
# notebook; confirm it is defined before this cell runs on a fresh kernel.
train_ts = load_time_series(f"{DATA_DIR}/series_train.parquet")
test_ts = load_time_series(f"{DATA_DIR}/series_test.parquet")

 41%|████▏     | 413/996 [00:42<00:46, 12.56it/s]

In [None]:


# Names of the columns contributed by the time-series table, i.e. everything
# except the join key.
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

# Left joins on "id": every row of the tabular frames is kept, whether or not
# a matching time-series row exists.
train_data = train_data.merge(train_ts, how="left", on="id")
test_data = test_data.merge(test_ts, how="left", on="id")

# Only the training frame loses its id column here; test_data keeps it.
train_data = train_data.drop(columns="id")

In [None]:
# Restrict the training frame to the column names of the test frame.
# The target is saved first and re-attached afterwards, because the column
# selection keeps only names that appear in test_data.
column_names = test_data.columns.tolist()
target = train_data["sii"]

# Building a frame with an explicit column list selects and orders by that
# list; a listed name that train_data lacks comes back as an all-NaN column.
train_data = pd.DataFrame(train_data, columns=column_names)
train_data["sii"] = target

# Sanity checks: columns present only in the training frame, then both shapes.
print(train_data.columns.difference(test_data.columns))
print(train_data.shape, test_data.shape, sep="\n")

# Data Preprocessing


In [None]:
# Dropping ID columns.
# The test ids are kept aside first: the submission file needs them.
ids = test_data["id"]

train_data = train_data.drop(columns="id")
test_data = test_data.drop(columns="id")

# Using one hot encoding on the categorical data.
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Give both frames the same set of columns. With an outer join, a column that
# exists in only one frame is added to the other one filled with NaN.
train_data, test_data = train_data.align(test_data, join="outer", axis=1)

# Replace every remaining missing value with 0.
# NOTE(review): this also applies to the "sii" target column of train_data, so
# any training row with a missing target would be labelled 0. Confirm this is
# intended rather than dropping such rows.
train_data = train_data.fillna(value=0)
test_data = test_data.fillna(value=0)

print(train_data.shape)
print(test_data.shape)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print(), which would print "None".
train_data.info()
test_data.info()

difference = train_data.columns.difference(test_data.columns)
print(difference)

# The outer alignment above added the target column to the test frame.
test_data = test_data.drop(columns=["sii"])

# Random Forest Model Predictions


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Target is the "sii" column; every other column is a feature.
y = train_data["sii"]
X = train_data.drop(columns=["sii"])

# Scaling the training data.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Hold out 10% of the labelled rows for evaluation. X_test / y_test are this
# hold-out split of train_data, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Initiating the random forest model and fitting it on the training split.
RFC = RandomForestClassifier(n_estimators=100, random_state=42)
RFC.fit(X_train, y_train)

# Predictions on both splits, so hold-out and training accuracy can be compared.
y_pred_test = RFC.predict(X_test)
y_pred_train = RFC.predict(X_train)

# Accuracies expressed as percentages.
test_accuracy_pct = accuracy_score(y_test, y_pred_test) * 100
train_accuracy_pct = accuracy_score(y_train, y_pred_train) * 100

print(
    "Testing data: Model accuracy score with 100 decision-trees : {0:0.4f}".format(
        test_accuracy_pct
    )
)
print(
    "Training data: Model accuracy score with 100 decision-trees : {0:0.4f}".format(
        train_accuracy_pct
    )
)

# Logistic Regression Model Predictions


In [None]:
from sklearn.linear_model import LogisticRegression

# Target is the "sii" column; every other column is a feature.
X = train_data.drop(columns=["sii"])
y = train_data["sii"]

# Fresh scaler fitted on the training features. This `scaler` instance is the
# one the submission cell further down uses on the test data.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Same 10% hold-out split (same seed) as used for the random forest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

LR = LogisticRegression()
LR.fit(X_train, y_train)

# Both predictions must come from LR. The training predictions were previously
# taken from the random forest (RFC), so the reported "training accuracy" did
# not describe this model.
y_pred_test = LR.predict(X_test)
y_pred_train = LR.predict(X_train)

print(
    "Testing data accuracy: {0:0.4f}".format(accuracy_score(y_test, y_pred_test) * 100)
)
print(
    "Training data accuracy: {0:0.4f}".format(
        accuracy_score(y_train, y_pred_train) * 100
    )
)

# Random Forest Predictions On Test Set


In [None]:
# X = test_data

# X = scaler.fit_transform(X)

# y_pred = RFC.predict(X)

# Predict on test data



# Creating submission file

# submission = pd.DataFrame({

#   'id': ids,

#   'sii': y_pred.astype(int)

# })

# print(submission)



# save to CSV

# submission.to_csv('submission.csv', index=False)

# print("Submission file created.")

# Logistic Regression Predictions On Test Set


In [None]:
# Predicting on test data
# Apply the scaling that was learned from the training features. Refitting the
# scaler on the test set would map the test features onto a different min/max
# range than the one the model was trained on.
X = scaler.transform(test_data)
y_pred = LR.predict(X)

# One row per test id with the predicted class as an integer.
submission = pd.DataFrame({"id": ids, "sii": y_pred.astype(int)})
submission.to_csv("submission.csv", index=False)

print(submission)

In [None]:

# NOTE(review): processed_dataframes is not assigned in any cell visible in
# this notebook; confirm it is created before this cell runs on a fresh kernel.
if processed_dataframes:
    # Stack all processed frames into one table with a fresh index.
    combined_df = pd.concat(processed_dataframes, ignore_index=True)
    print("Data Summary:")
    print(combined_df.describe())

    # Visualization: Correlation heatmap
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(combined_df.corr(), annot=True, cmap="coolwarm", ax=heatmap_ax)
    heatmap_ax.set_title("Correlation Heatmap")
    plt.show()

    # Visualization: Distribution of a key variable (skipped when the column
    # is not part of the combined table)
    if "CGAS-CGAS_Score" in combined_df.columns:
        hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
        sns.histplot(combined_df["CGAS-CGAS_Score"], bins=30, kde=True, ax=hist_ax)
        hist_ax.set_title("Distribution of CGAS Scores")
        hist_ax.set_xlabel("CGAS Score")
        hist_ax.set_ylabel("Frequency")
        plt.show()
