# Child Mind Institute — Problematic Internet Use

## Team notes

from Célie, 11/2:

- Hi Anusha! I will label my work with a comment and my initials (CP).
- You can link the Kaggle notebook to your GitHub from within Kaggle.
- If using GitHub, save the data locally in a folder named kaggle_data (the folder is ignored by git).
- If using Kaggle notebook, data is located in: /kaggle/input/child-mind-institute-problematic-internet-use/
- I am using my local directory. If you are using Kaggle, comment out the local-directory lines and uncomment the Kaggle lines in the code cells.


In [1]:
# CP: Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# CP: Print the path of every file found under the data directory
import os

# CP: If using local directory
data_root = 'kaggle_data'

# CP: If using Kaggle
# data_root = '/kaggle/input'

# Walk the directory tree and print one full path per file
for folder, _, names in os.walk(data_root):
    for name in names:
        print(os.path.join(folder, name))


kaggle_data/test.csv
kaggle_data/sample_submission.csv
kaggle_data/data_dictionary.csv
kaggle_data/train.csv
kaggle_data/series_test.parquet/id=00115b9f/part-0.parquet
kaggle_data/series_test.parquet/id=001f3379/part-0.parquet
kaggle_data/series_train.parquet/id=e0ad6550/part-0.parquet
kaggle_data/series_train.parquet/id=764d8b42/part-0.parquet
kaggle_data/series_train.parquet/id=5af79adb/part-0.parquet
kaggle_data/series_train.parquet/id=03a9019b/part-0.parquet
kaggle_data/series_train.parquet/id=a3b146ca/part-0.parquet
kaggle_data/series_train.parquet/id=f3020788/part-0.parquet
kaggle_data/series_train.parquet/id=56cb7161/part-0.parquet
kaggle_data/series_train.parquet/id=ec13a1b9/part-0.parquet
kaggle_data/series_train.parquet/id=f5b2acf0/part-0.parquet
kaggle_data/series_train.parquet/id=7f26f78c/part-0.parquet
kaggle_data/series_train.parquet/id=cd8f3c61/part-0.parquet
kaggle_data/series_train.parquet/id=b7c410a9/part-0.parquet
kaggle_data/series_train.parquet/id=93c72a97/part-0.p

In [3]:
# CP: Load data
# Reads the train and test tables into DataFrames. Use exactly one of the
# two alternatives below (local directory or Kaggle) and keep the other
# commented out.

# CP: If using local directory
train_data = pd.read_csv('kaggle_data/train.csv')
test_data = pd.read_csv('kaggle_data/test.csv')

# CP: If using Kaggle
# train_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
# test_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

# Preview the first rows of the training set
train_data.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [11]:
# CP: Show all rows when displaying data
# Note: this is a global pandas option, so it also applies to every
# DataFrame/Series displayed by cells that run after this one.
pd.set_option('display.max_rows', None)

# CP: Load data dictionary (one row per field: instrument, description, type, values)
# CP: If using local directory
data_dict = pd.read_csv('kaggle_data/data_dictionary.csv')
# CP: If using Kaggle
# data_dict = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv')
data_dict

Unnamed: 0,Instrument,Field,Description,Type,Values,Value Labels
0,Identifier,id,Participant's ID,str,,
1,Demographics,Basic_Demos-Enroll_Season,Season of enrollment,str,"Spring, Summer, Fall, Winter",
2,Demographics,Basic_Demos-Age,Age of participant,float,,
3,Demographics,Basic_Demos-Sex,Sex of participant,categorical int,01,"0=Male, 1=Female"
4,Children's Global Assessment Scale,CGAS-Season,Season of participation,str,"Spring, Summer, Fall, Winter",
5,Children's Global Assessment Scale,CGAS-CGAS_Score,Children's Global Assessment Scale Score,int,,
6,Physical Measures,Physical-Season,Season of participation,str,"Spring, Summer, Fall, Winter",
7,Physical Measures,Physical-BMI,Body Mass Index (kg/m^2),float,,
8,Physical Measures,Physical-Height,Height (in),float,,
9,Physical Measures,Physical-Weight,Weight (lbs),float,,


In [5]:
# CP: Explore data — list every column with its non-null count and dtype
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3960 non-null   object 
 1   Basic_Demos-Enroll_Season               3960 non-null   object 
 2   Basic_Demos-Age                         3960 non-null   int64  
 3   Basic_Demos-Sex                         3960 non-null   int64  
 4   CGAS-Season                             2555 non-null   object 
 5   CGAS-CGAS_Score                         2421 non-null   float64
 6   Physical-Season                         3310 non-null   object 
 7   Physical-BMI                            3022 non-null   float64
 8   Physical-Height                         3027 non-null   float64
 9   Physical-Weight                         3076 non-null   float64
 10  Physical-Waist_Circumference            898 non-null    floa

In [6]:
# CP: Explore data — (number of rows, number of columns) of the training set
train_data.shape

(3960, 82)

In [7]:
# CP: Explore data — summary statistics of the numeric columns, one row per column
# NOTE(review): the displayed output contains suspicious extremes (e.g. a
# CGAS-CGAS_Score max of 999 and minimums of 0 for BMI, weight and blood
# pressure); these look like placeholder/invalid entries — confirm before modeling.
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Basic_Demos-Age,3960.0,10.433586,3.574648,5.0,8.0,10.0,13.0,22.0
Basic_Demos-Sex,3960.0,0.372727,0.483591,0.0,0.0,0.0,1.0,1.0
CGAS-CGAS_Score,2421.0,65.454771,22.341862,25.0,59.0,65.0,75.0,999.0
Physical-BMI,3022.0,19.331929,5.113934,0.0,15.86935,17.937682,21.571244,59.132048
Physical-Height,3027.0,55.946713,7.473764,33.0,50.0,55.0,62.0,78.5
Physical-Weight,3076.0,89.038615,44.56904,0.0,57.2,77.0,113.8,315.0
Physical-Waist_Circumference,898.0,27.278508,5.567287,18.0,23.0,26.0,30.0,50.0
Physical-Diastolic_BP,2954.0,69.648951,13.611226,0.0,61.0,68.0,76.0,179.0
Physical-HeartRate,2967.0,81.597236,13.665196,27.0,72.0,81.0,90.5,138.0
Physical-Systolic_BP,2954.0,116.983074,17.061225,0.0,107.0,114.0,125.0,203.0


In [8]:
# CP: Check target values — number of rows in each sii class
# (value_counts() leaves out missing values by default, so NaN targets are not listed here)
train_data['sii'].value_counts()

sii
0.0    1594
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [9]:
# CP: Check target values — count the rows whose sii target is missing
train_data['sii'].isna().sum()

np.int64(1224)