# Loading The Data Sets

In [1]:
import numpy as np 

import pandas as pd 

import os

import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer


#Loading the train and test data sets

test_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

column_names = list(test_data.columns)

train_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

target = train_data['sii']

train_data = pd.DataFrame(train_data, columns = column_names)

train_data['sii'] = target

print(train_data.columns.difference(test_data.columns))

print(train_data.shape)

print(test_data.shape)


Index(['sii'], dtype='object')
(3960, 60)
(20, 59)


# Data Preprocessing 

In [2]:
#Dropping ID columns.

ids = test_data['id']
train_data = train_data.drop('id', axis=1)

test_data = test_data.drop('id', axis=1)

#one hot encoding on the categorical data. 

train_data = pd.get_dummies(train_data)

test_data = pd.get_dummies(test_data)

train_data, test_data = train_data.align(test_data, join='outer', axis = 1)

train_data.fillna(value=0, inplace=True)

test_data.fillna(value=0, inplace=True)

#Imputing missing data with SimpleImputer

print(train_data.shape)

print(test_data.shape)

print(train_data.info())

print(test_data.info())

difference = (train_data.columns.difference(test_data.columns))

print(difference)

test_data = test_data.drop(columns=['sii'])  


(3960, 89)
(20, 89)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Data columns (total 89 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   BIA-BIA_Activity_Level_num              3960 non-null   float64
 1   BIA-BIA_BMC                             3960 non-null   float64
 2   BIA-BIA_BMI                             3960 non-null   float64
 3   BIA-BIA_BMR                             3960 non-null   float64
 4   BIA-BIA_DEE                             3960 non-null   float64
 5   BIA-BIA_ECW                             3960 non-null   float64
 6   BIA-BIA_FFM                             3960 non-null   float64
 7   BIA-BIA_FFMI                            3960 non-null   float64
 8   BIA-BIA_FMI                             3960 non-null   float64
 9   BIA-BIA_Fat                             3960 non-null   float64
 10  BIA-BIA_Frame_num                       

# Random Forest Model Predictions

In [3]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report


X = train_data.drop(columns=['sii'])  

y = train_data['sii']  


#Scaling the training data. 

scaler = MinMaxScaler()

X = scaler.fit_transform(X)


#Splitting the training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#Initiating the random forrest model 

RFC = RandomForestClassifier(n_estimators=100, random_state=42)

#Fit the model

RFC.fit(X_train, y_train)

#Predicting test set results

y_pred_test = RFC.predict(X_test)

y_pred_train = RFC.predict(X_train)

print('Testing data: Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)*100))

print('Training data: Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)*100))


Testing data: Model accuracy score with 100 decision-trees : 74.4949
Training data: Model accuracy score with 100 decision-trees : 100.0000


# Logistic Regression Model Predictions

In [4]:
from sklearn.linear_model import LogisticRegression



X = train_data.drop(columns=['sii'])  

y = train_data['sii']  



scaler = MinMaxScaler()

X = scaler.fit_transform(X)



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)



LR = LogisticRegression()



LR.fit(X_train, y_train)



y_pred_test = LR.predict(X_test)

y_pred_train = RFC.predict(X_train)



print('Testing data accuracy: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)*100))

print('Training data accuracy: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)*100))

Testing data accuracy: 72.4747
Training data accuracy: 100.0000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Random Forest Predictions On Test Set

In [5]:
#X = test_data  

#X = scaler.fit_transform(X)

#y_pred = RFC.predict(X)

# Predict on test data

#submission = pd.DataFrame({

 #   'id': ids,  

 #   'sii': y_pred.astype(int) 

#})

#print(submission)

# Logistic Regression Predictions On Test Set 

In [6]:
#Predicting on test data

X = test_data  

X = scaler.fit_transform(X)

y_pred = LR.predict(X)



#Creating the submission file

submission = pd.DataFrame({

    'id': ids,  

    'sii': y_pred.astype(int) 

})

submission.to_csv('submission.csv', index=False)

print(submission)

          id  sii
0   00008ff9    0
1   000fd460    0
2   00105258    2
3   00115b9f    0
4   0016bb22    0
5   001f3379    0
6   0038ba98    2
7   0068a485    0
8   0069fbed    0
9   0083e397    0
10  0087dd65    0
11  00abe655    2
12  00ae59c9    3
13  00af6387    0
14  00bd4359    0
15  00c0cd71    1
16  00d56d4b    0
17  00d9913d    0
18  00e6167c    0
19  00ebc35d    0
