In [None]:
!pip install pandas scikit-learn numpy altair

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer
import numpy as np
import altair as alt

In [None]:
# Mount Google Drive when running on Colab (execute this cell first).
# The data-loading cell reads from /kaggle/input, so outside Colab the mount is not
# needed: report that it was skipped instead of aborting "Run All" on the import.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [4]:
# Data Loading - the CSV files are read from the Kaggle input directory.
# Change DATA_DIR in this one place if the files are stored elsewhere (e.g. on Colab).
DATA_DIR = '/kaggle/input/child-mind-institute-problematic-internet-use'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# NOTE(review): later cells work on `train_df`, `test_df` and `data_dictionary_df`, none of
# which are created in the cells shown here - confirm where those frames are defined.


In [5]:
# Install pyarrow if not already installed
try:
    import pyarrow.parquet as pq
except ImportError:
    !pip install pyarrow
    import pyarrow.parquet as pq

In [None]:
import pyarrow.parquet as pq
import pandas as pd

# 1. Efficient loading, one Parquet row group at a time
#
# pq.read_table() materialises the whole file and returns a pyarrow.Table, which has no
# num_row_groups / read_row_group. pq.ParquetFile provides both, so each file is opened
# once and only the requested row group is read per iteration.
# NOTE(review): ParquetFile opens a single Parquet file; if these paths turn out to be
# partitioned directories, loop over the part files instead - confirm against the data.

# Training Data
train_parquet = pq.ParquetFile('/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet')
for i in range(train_parquet.num_row_groups):
    train_chunk = train_parquet.read_row_group(i).to_pandas()
    # Now, 'train_chunk' is a Pandas DataFrame containing a portion of the training data.
    # Process this chunk as needed (e.g., feature engineering, modeling, etc.).
    # ... your processing code here ...

# Test Data
test_parquet = pq.ParquetFile('/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet')
for i in range(test_parquet.num_row_groups):
    test_chunk = test_parquet.read_row_group(i).to_pandas()
    # Similarly, 'test_chunk' is a Pandas DataFrame with a portion of the test data.
    # Process this chunk as needed.
    # ... your processing code here ...

In [None]:
# Data Preprocessing

# Remove every train_df column whose label begins with 'PCIAT' (in place).
pciat_columns = list(filter(lambda label: label.startswith('PCIAT'), train_df.columns))
train_df.drop(columns=pciat_columns, inplace=True)

In [None]:
# Handle Missing Values
# Impute numerical columns with medians and categorical columns with modes for train_df.
for column in train_df.columns:
    values = train_df[column]
    if values.dtype.kind in 'iuf':  # any signed/unsigned integer or float width
        fill_value = values.median()
    else:  # Categorical columns
        modes = values.mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with.
            continue
        fill_value = modes.iloc[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # train_df[column]: that chained in-place call acts on a temporary object and is
    # not guaranteed to update train_df (it never does under pandas Copy-on-Write).
    train_df[column] = values.fillna(fill_value)


In [None]:
# Impute test_df using statistics computed on train_df (median for numerical columns,
# mode for everything else).
for column in test_df.columns:
    reference = train_df[column]
    # Pick the statistic from the *train* column's dtype, since that is the column the
    # statistic is computed on; the test column's dtype can differ from it.
    if reference.dtype.kind in 'iuf':
        fill_value = reference.median()
    else:
        modes = reference.mode()
        if modes.empty:
            # Train column is entirely missing: nothing to impute with.
            continue
        fill_value = modes.iloc[0]
    # Assign back rather than fillna(inplace=True) on test_df[column]; the chained
    # in-place form is not guaranteed to modify test_df.
    test_df[column] = test_df[column].fillna(fill_value)


In [None]:
# Encode Categorical Variables
# Collect the 'Field' entries of the data dictionary whose 'Type' is 'categorical int'.
categorical_columns = data_dictionary_df.loc[
    data_dictionary_df['Type'].eq('categorical int'),
    'Field',
].tolist()


In [None]:
# Keep only categorical fields that are present in BOTH frames and never treat 'id' as a
# feature. The one-hot encoding cell indexes train_df and test_df with this same list, so
# a column that exists only in train_df would raise a KeyError on test_df.
categorical_columns = [
    col
    for col in categorical_columns
    if col != 'id' and col in train_df.columns and col in test_df.columns
]

In [None]:
# Apply one-hot encoding (fit on train only; unseen test categories encode as all zeros).
encoder = OneHotEncoder(handle_unknown='ignore')
# Carry over each source frame's index: the encoded frames are later concatenated
# column-wise with train_df / test_df, and concat aligns rows by index label, so a fresh
# 0..n-1 index would misalign rows whenever the source index is not a default range.
encoded_train = pd.DataFrame(
    encoder.fit_transform(train_df[categorical_columns]).toarray(),
    index=train_df.index,
)
encoded_test = pd.DataFrame(
    encoder.transform(test_df[categorical_columns]).toarray(),
    index=test_df.index,
)

In [None]:
# Label both encoded frames with the encoder's output feature names.
encoded_train.columns = encoded_test.columns = encoder.get_feature_names_out(categorical_columns)


In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
train_df = pd.concat(
    [train_df.drop(columns=categorical_columns), encoded_train],
    axis='columns',
)
test_df = pd.concat(
    [test_df.drop(columns=categorical_columns), encoded_test],
    axis='columns',
)

In [None]:
# 3. Refine Actigraphy Feature Engineering
# Revisit total_duration_hours calculation (adjust based on actual data meaning)
def extract_features(df, seconds_per_step=5):
    """Compute the per-id recording duration from actigraphy rows.

    The previous body assigned into an undefined `features` object and returned
    nothing, so every call failed with a NameError.

    NOTE: a later cell defines another `extract_features`; whichever definition is
    executed last is the one in effect.

    Args:
        df: DataFrame that contains an 'id' column and a 'step' column.
        seconds_per_step: Seconds represented by one unit of 'step'. The default of 5
            is the notebook's working assumption that 'step' counts 5-second
            intervals - TODO confirm against the data description.

    Returns:
        DataFrame with one row per 'id' and the columns 'id' and
        'total_duration_hours' (largest 'step' of the id, converted to hours).
    """
    total_duration_hours = df.groupby('id')['step'].max() * seconds_per_step / 3600
    features = total_duration_hours.rename('total_duration_hours').to_frame()
    # Expose 'id' as a regular column so the result can be merged with on='id'.
    return features.reset_index()


In [None]:
# Add more features (example: mean and std of X, Y, Z)
def extract_features(df, seconds_per_step=5):
    """Aggregate raw actigraphy rows into one feature row per id.

    The previous body assigned into an undefined `features` object and returned
    nothing. Because this definition replaces the earlier `extract_features`, it also
    computes the duration feature of that earlier version so it is not silently lost.

    Args:
        df: DataFrame that contains 'id', 'X', 'Y' and 'Z' columns, and optionally a
            'step' column.
        seconds_per_step: Seconds represented by one unit of 'step'. The default of 5
            is the notebook's working assumption that 'step' counts 5-second
            intervals - TODO confirm against the data description.

    Returns:
        DataFrame with one row per 'id' and the columns 'id', 'mean_X', 'std_X',
        'mean_Y', 'std_Y', 'mean_Z', 'std_Z', plus 'total_duration_hours' when df has
        a 'step' column. The std of an id with a single row is NaN (sample std).
    """
    grouped = df.groupby('id')
    features = grouped.agg(
        mean_X=('X', 'mean'),
        std_X=('X', 'std'),
        mean_Y=('Y', 'mean'),
        std_Y=('Y', 'std'),
        mean_Z=('Z', 'mean'),
        std_Z=('Z', 'std'),
    )
    if 'step' in df.columns:
        # Largest step per id, converted from step units to hours.
        features['total_duration_hours'] = grouped['step'].max() * seconds_per_step / 3600
    # Expose 'id' as a regular column so the result can be merged with on='id'.
    return features.reset_index()

In [None]:
# Show the column labels of both actigraphy frames, train first and test second.
print(actigraphy_train_df.columns, actigraphy_test_df.columns, sep='\n')

In [None]:
# Keep only the rows recorded while the device was worn, when the flag column exists
# (the lookup is case-sensitive).
# The filter used to keep `non_wear_flag == 1`, i.e. presumably the NON-wear rows, while
# the later filtering cells keep `== 0`; run in sequence that left those cells with an
# empty frame. Both now agree on `== 0`.
# NOTE(review): assumes 0 means "worn" - confirm against the data dictionary.
if 'non_wear_flag' in actigraphy_train_df.columns:
    actigraphy_train_df = actigraphy_train_df[actigraphy_train_df['non_wear_flag'] == 0]
else:
    print("Warning: 'non_wear_flag' column not found in actigraphy_train_df. Skipping filtering.")

if 'non_wear_flag' in actigraphy_test_df.columns:
    actigraphy_test_df = actigraphy_test_df[actigraphy_test_df['non_wear_flag'] == 0]
else:
    print("Warning: 'non_wear_flag' column not found in actigraphy_test_df. Skipping filtering.")

In [None]:
# Check if the column exists before accessing it
if 'non_wear_flag' in actigraphy_train_df.columns:
    # Rows with a missing flag cannot be classified, so drop them. The result is
    # re-assigned instead of using inplace=True because actigraphy_train_df may itself
    # be a filtered slice of another frame.
    actigraphy_train_df = actigraphy_train_df.dropna(subset=['non_wear_flag'])

    # Filter out non-wear periods. This used to keep `== 1`, which contradicts the later
    # cells that keep `== 0` and left them with an empty frame.
    # NOTE(review): assumes 0 means "worn" - confirm against the data dictionary.
    actigraphy_train_df = actigraphy_train_df[actigraphy_train_df['non_wear_flag'] == 0]
else:
    print("Warning: 'non_wear_flag' column not found in actigraphy_train_df. Skipping filtering.")

# ... (similar check for actigraphy_test_df)

In [None]:
# Check if the column exists before accessing it
# NOTE(review): this cell repeats the preceding one verbatim; consider deleting it.
if 'non_wear_flag' in actigraphy_train_df.columns:
    # Drop rows whose flag is missing; re-assign rather than mutate in place, since
    # actigraphy_train_df may be a filtered slice of another frame.
    actigraphy_train_df = actigraphy_train_df.dropna(subset=['non_wear_flag'])

    # Filter out non-wear periods: keep `== 0` like the later filtering cells do. The
    # former `== 1` kept the opposite rows and emptied the frame for those cells.
    # NOTE(review): assumes 0 means "worn" - confirm against the data dictionary.
    actigraphy_train_df = actigraphy_train_df[actigraphy_train_df['non_wear_flag'] == 0]
else:
    print("Warning: 'non_wear_flag' column not found in actigraphy_train_df. Skipping filtering.")

# ... (similar check for actigraphy_test_df)

In [None]:
# Check if the column exists before accessing it
# NOTE(review): this cell duplicates the two cells before it; consider deleting it.
if 'non_wear_flag' in actigraphy_train_df.columns:
    # Missing flags are removed first; the frame is rebound instead of edited in place
    # because it may be a filtered slice of another frame.
    actigraphy_train_df = actigraphy_train_df.dropna(subset=['non_wear_flag'])

    # Filter out non-wear periods. Keeping `== 1` (the old condition) selected the
    # opposite rows from the `== 0` filter applied by the cells that follow.
    # NOTE(review): assumes 0 means "worn" - confirm against the data dictionary.
    actigraphy_train_df = actigraphy_train_df[actigraphy_train_df['non_wear_flag'] == 0]
else:
    print("Warning: 'non_wear_flag' column not found in actigraphy_train_df. Skipping filtering.")

# ... (similar check for actigraphy_test_df)

In [None]:
# Keep only rows whose non_wear_flag equals 0; warn instead of failing when the column
# is absent from a frame.
if 'non_wear_flag' not in actigraphy_train_df.columns:
    print("Warning: 'non_wear_flag' column not found in actigraphy_train_df. Skipping filtering.")
else:
    actigraphy_train_df = actigraphy_train_df.loc[actigraphy_train_df['non_wear_flag'].eq(0)]

if 'non_wear_flag' not in actigraphy_test_df.columns:
    print("Warning: 'non_wear_flag' column not found in actigraphy_test_df. Skipping filtering.")
else:
    actigraphy_test_df = actigraphy_test_df.loc[actigraphy_test_df['non_wear_flag'].eq(0)]

In [None]:
# Feature Engineering (from actigraphy data)
# Preprocess Actigraphy Data: retain the rows where non_wear_flag is 0.
# Unlike the guarded cells above, this raises KeyError if the column is missing.
actigraphy_train_df = actigraphy_train_df.loc[lambda frame: frame['non_wear_flag'] == 0]
actigraphy_test_df = actigraphy_test_df.loc[lambda frame: frame['non_wear_flag'] == 0]


In [None]:
# Feature Extraction: run extract_features on each actigraphy frame.
actigraphy_train_features = extract_features(df=actigraphy_train_df)
actigraphy_test_features = extract_features(df=actigraphy_test_df)

In [None]:
# Merge with Tabular Data: left-join on 'id' so every tabular row is kept.
train_df = pd.merge(train_df, actigraphy_train_features, how='left', on='id')
test_df = pd.merge(test_df, actigraphy_test_features, how='left', on='id')

In [None]:
# 4. Merge and Check for NaN Values
# ... (merge actigraphy features)

# Report, per column, the percentage of rows that are missing after the merge.
print("\nMissing Values in Train Data after merging:")
print(
    train_df.isnull().sum().div(len(train_df)).mul(100)
    .to_markdown(numalign="left", stralign="left")
)

print("\nMissing Values in Test Data after merging:")
print(
    test_df.isnull().sum().div(len(test_df)).mul(100)
    .to_markdown(numalign="left", stralign="left")
)

In [None]:
# Impute remaining NaN values in numerical columns with the train_df median
# (the same train statistic is applied to test_df).
for col in train_df.select_dtypes(include=[np.number]).columns:
    train_median = train_df[col].median()
    # Assign back rather than fillna(inplace=True) on train_df[col] / test_df[col]; the
    # chained in-place form is not guaranteed to modify the parent frame.
    train_df[col] = train_df[col].fillna(train_median)
    # Numeric columns that exist only in train_df have no counterpart to fill in test_df.
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(train_median)


In [None]:
# 1. Analyze Numerical Feature Distributions
from scipy.stats import skew, kurtosis

In [None]:
# Per-column skewness and kurtosis of every numeric column in train_df.
# NOTE: `kurtosis` is also the name imported from scipy.stats in the cell above; the
# assignment below rebinds it to a pandas Series.
numerical_columns = list(train_df.select_dtypes(include=[np.number]).columns)
skewness = train_df.loc[:, numerical_columns].skew()
kurtosis = train_df.loc[:, numerical_columns].kurt()


In [None]:
# Flag columns whose |skewness| exceeds 1 or whose |kurtosis| exceeds 3
# (both thresholds are adjustable).
high_skew_columns = skewness.loc[skewness.abs() > 1].index.tolist()
high_kurtosis_columns = kurtosis.loc[kurtosis.abs() > 3].index.tolist()


In [None]:
# Display summary statistics for the flagged columns.
print("Summary statistics for numerical features with high skewness or kurtosis:")
# A column can appear in both lists; dict.fromkeys removes the repeats while keeping
# the original order, so no column is described twice.
flagged_columns = list(dict.fromkeys(high_skew_columns + high_kurtosis_columns))
if flagged_columns:
    print(train_df[flagged_columns].describe().to_markdown(numalign="left", stralign="left"))
else:
    # describe() raises on a frame without columns, so report the empty case instead.
    print("No numerical features exceed the skewness or kurtosis thresholds.")

In [None]:
# Visualize distributions of features with high skewness or kurtosis.
# A column can be flagged by both criteria; dict.fromkeys drops the repeats (keeping
# order) so each chart is built and written only once.
# NOTE(review): column names are used verbatim in the Altair shorthand and in the output
# file name - names containing '.', ':', '[' or '/' may need escaping; confirm.
for col in dict.fromkeys(high_skew_columns + high_kurtosis_columns):
    # Hand Altair only the plotted column: with the default data transformer the data
    # is embedded in the saved spec, so passing all of train_df would serialise every
    # column into every JSON file.
    chart = alt.Chart(train_df[[col]]).mark_bar().encode(
        x=alt.X(col + ':Q', bin=True),
        y='count()',
        tooltip=[col, 'count()']
    ).properties(
        title=f'Distribution of {col}'
    )
    chart.save(f'distribution_{col}.json')

In [None]:
# 2. Assess Categorical Feature Cardinality
# Number of distinct values in each object-dtype column of train_df.
# NOTE: this rebinds `categorical_columns`, which earlier held the data-dictionary fields.
categorical_columns = list(train_df.select_dtypes(include='object').columns)
cardinality = train_df.loc[:, categorical_columns].nunique()

In [None]:
# Identify high-cardinality columns (adjust the threshold if needed)
high_cardinality_columns