# Loading The Data Sets

In [1]:
import numpy as np 

import pandas as pd 

import os

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer


#Loading the train and test data sets
# Aligning train dataset columns to match test dataset
# Checking for column differences between train and test datasets

test_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

column_names = list(test_data.columns)

train_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

target = train_data['sii']

train_data = pd.DataFrame(train_data, columns = column_names)

train_data['sii'] = target

print(train_data.columns.difference(test_data.columns))

print(train_data.shape)

print(test_data.shape)


Index(['sii'], dtype='object')
(3960, 60)
(20, 59)


# Data Preprocessing 

In [2]:
# Dropping the 'id' column from both train and test datasets
ids = test_data['id']
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

# Apply one-hot encoding to categorical columns
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Align train and test datasets to ensure they have the same columns
# Missing columns will be added with NaN values
train_data, test_data = train_data.align(test_data, join='outer', axis=1)

# Fill missing values with 0 (useful for alignment step)
train_data.fillna(value=0, inplace=True)
test_data.fillna(value=0, inplace=True)

# Print the shapes of the datasets to verify alignment
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Display dataset info to inspect column data types and memory usage
print("\nTrain data info:")
print(train_data.info())
print("\nTest data info:")
print(test_data.info())

# Check for any column differences
difference = train_data.columns.difference(test_data.columns)
if difference.empty:
    print("No column differences between train and test datasets.")
else:
    print("Column differences found:", difference)

# Remove target column 'sii' from test dataset, as it's not available for predictions
test_data = test_data.drop(columns=['sii'], errors='ignore')

# Display final column details
print("\nFinal train columns:", train_data.columns)
print("\nFinal test columns:", test_data.columns)



Train data shape: (3960, 89)
Test data shape: (20, 89)

Train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Data columns (total 89 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   BIA-BIA_Activity_Level_num              3960 non-null   float64
 1   BIA-BIA_BMC                             3960 non-null   float64
 2   BIA-BIA_BMI                             3960 non-null   float64
 3   BIA-BIA_BMR                             3960 non-null   float64
 4   BIA-BIA_DEE                             3960 non-null   float64
 5   BIA-BIA_ECW                             3960 non-null   float64
 6   BIA-BIA_FFM                             3960 non-null   float64
 7   BIA-BIA_FFMI                            3960 non-null   float64
 8   BIA-BIA_FMI                             3960 non-null   float64
 9   BIA-BIA_Fat                             3960 non-null   

# Random Forest Model Predictions

In [3]:

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report



X = train_data.drop(columns=['sii'])  

y = train_data['sii']  



#Scaling the training data. 

scaler = MinMaxScaler()

X = scaler.fit_transform(X)



#Splitting the training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)



#Initiating the random forrest model 

RFC = RandomForestClassifier(n_estimators=100, random_state=42)



#Fitting the model

RFC.fit(X_train, y_train)





#Predicting test set results

y_pred_test = RFC.predict(X_test)

y_pred_train = RFC.predict(X_train)



#Checking accuracy score 

print('Testing data: Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)*100))

print('Training data: Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)*100))


Testing data: Model accuracy score with 100 decision-trees : 74.4949
Training data: Model accuracy score with 100 decision-trees : 100.0000


# Logistic Regression Model Predictions

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import numpy as np

# Prepare features and target
X = train_data.drop(columns=['sii'])  
y = train_data['sii']  

# Scale features
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)

# Initialize Logistic Regression
LR = LogisticRegression(C=1.0, solver='lbfgs', max_iter=500, random_state=42)

# Fit the model
LR.fit(X_train, y_train)

# Predict on train and test data
y_pred_test = LR.predict(X_test)
y_pred_train = LR.predict(X_train)

# Evaluate model
print('Testing data accuracy: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test) * 100))
print('Training data accuracy: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train) * 100))

# Classification report for detailed metrics
print("\nClassification Report for Testing Data:")
print(classification_report(y_test, y_pred_test))

# Cross-validation scores
cv_scores = cross_val_score(LR, X_resampled, y_resampled, cv=5, scoring='accuracy')
print("\nCross-validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy: {0:0.4f}".format(np.mean(cv_scores)))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Testing data accuracy: 70.0355
Training data accuracy: 69.7555

Classification Report for Testing Data:
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74       276
         1.0       0.57      0.49      0.53       283
         2.0       0.57      0.57      0.57       280
         3.0       0.88      0.99      0.93       289

    accuracy                           0.70      1128
   macro avg       0.69      0.70      0.69      1128
weighted avg       0.69      0.70      0.69      1128



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Cross-validation Accuracy Scores: [0.67006652 0.68736142 0.69565217 0.70230701 0.70629991]
Mean CV Accuracy: 0.6923


# Random Forest Predictions On Test Set

In [5]:
# X = test_data  

# X = scaler.fit_transform(X)

# y_pred = RFC.predict(X)


# submission = pd.DataFrame({

#    'id': ids,  

#    'sii': y_pred.astype(int) 

# })

# print(submission)


# submission.to_csv('submission.csv', index=False)

# print("Submission file created.")

# Logistic Regression Predictions On Test Set 

In [6]:
#Predicting on test data

X = test_data  

X = scaler.fit_transform(X)

y_pred = LR.predict(X)


submission = pd.DataFrame({

    'id': ids,  

    'sii': y_pred.astype(int) 

})



#saving to CSV

submission.to_csv('submission.csv', index=False)

print(submission)

          id  sii
0   00008ff9    0
1   000fd460    2
2   00105258    2
3   00115b9f    2
4   0016bb22    0
5   001f3379    2
6   0038ba98    2
7   0068a485    0
8   0069fbed    0
9   0083e397    0
10  0087dd65    0
11  00abe655    2
12  00ae59c9    2
13  00af6387    0
14  00bd4359    0
15  00c0cd71    1
16  00d56d4b    0
17  00d9913d    0
18  00e6167c    2
19  00ebc35d    0
