# Loading The Data Sets


### Objective
Describe the purpose of the data loading and its importance in the project.

In [7]:
# Load the libraries

import numpy as np

import pandas as pd

import os

import seaborn as sns

import matplotlib.pyplot as plt

from pandas.plotting import andrews_curves

from sklearn.impute import SimpleImputer

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

In [8]:
# CP: Processes a parquet file

def process_file(filename, dirname):

    df = pd.read_parquet(os.path.join(dirname, filename, "part-0.parquet"))

    df.drop("step", axis=1, inplace=True)

    return df.describe().values.reshape(-1), filename.split("=")[1]





def load_time_series(dirname) -> pd.DataFrame:

    ids = os.listdir(dirname)

    with ThreadPoolExecutor() as executor:

        results = list(

            tqdm(

                executor.map(lambda fname: process_file(fname, dirname), ids),

                total=len(ids),

            )

        )

    stats, indexes = zip(*results)

    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])

    df["id"] = indexes

    return df

In [9]:
# CP: Load data

# CP: Check if you are running in Kaggle or locally



# CP: Running locally

if os.path.exists("kaggle_data"):

    train_data = pd.read_csv("kaggle_data/train.csv")

    test_data = pd.read_csv("kaggle_data/test.csv")

    data_dict = pd.read_csv("kaggle_data/data_dictionary.csv")

    train_ts = load_time_series("kaggle_data/series_train.parquet")

    test_ts = load_time_series("kaggle_data/series_test.parquet")



# CP: Running in Kaggle

else:

    train_data = pd.read_csv(

        "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv"

    )

    test_data = pd.read_csv(

        "/kaggle/input/child-mind-institute-problematic-internet-use/test.csv"

    )

    data_dict = pd.read_csv(

        "/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv"

    )

    train_ts = load_time_series(

        "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"

    )

    test_ts = load_time_series(

        "/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"

    )



column_names = list(test_data.columns)

100%|██████████| 996/996 [00:30<00:00, 32.97it/s]
100%|██████████| 2/2 [00:00<00:00, 18.85it/s]


In [13]:

time_series_cols = train_ts.columns.tolist()

time_series_cols.remove("id")

train_data = pd.merge(train_data, train_ts, how="left", on="id")

test_data = pd.merge(test_data, test_ts, how="left", on="id")

train_data = train_data.drop("id", axis=1)

In [14]:

# Dropping all columns that have less than 2000 examples.


import numpy as np 



import pandas as pd 



import os



import matplotlib.pyplot as plt



from sklearn.impute import SimpleImputer




target = train_data["sii"]

train_data = pd.DataFrame(train_data, columns=column_names)




train_data["sii"] = target


#Loading the train and test data sets



test_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')



column_names = list(test_data.columns)



train_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')



target = train_data['sii']



train_data = pd.DataFrame(train_data, columns = column_names)



train_data['sii'] = target




print(train_data.columns.difference(test_data.columns))



print(train_data.shape)


print(test_data.shape)




print(test_data.shape)


Index(['sii'], dtype='object')
(3960, 60)
(20, 155)


# Data Preprocessing


### Overview
Explain the preprocessing steps performed, why they were chosen, and their expected impact on modeling.

In [15]:

# Dropping ID columns.

ids = test_data["id"]



train_data = train_data.drop("id", axis=1)

test_data = test_data.drop("id", axis=1)



# Using one hot encoding on the categorical data.
#Dropping ID columns.



ids = test_data['id']

train_data = train_data.drop('id', axis=1)



test_data = test_data.drop('id', axis=1)



#one hot encoding on the categorical data. 


train_data = pd.get_dummies(train_data)



test_data = pd.get_dummies(test_data)


train_data, test_data = train_data.align(test_data, join="outer", axis=1)





train_data, test_data = train_data.align(test_data, join='outer', axis = 1)




train_data.fillna(value=0, inplace=True)



test_data.fillna(value=0, inplace=True)




# Imputing missing data with SimpleImputer



#Imputing missing data with SimpleImputer




print(train_data.shape)



print(test_data.shape)



print(train_data.info())



print(test_data.info())




difference = train_data.columns.difference(test_data.columns)

print(difference)



test_data = test_data.drop(columns=["sii"])


difference = (train_data.columns.difference(test_data.columns))



print(difference)



test_data = test_data.drop(columns=['sii'])  


(3960, 185)
(20, 185)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Columns: 185 entries, BIA-BIA_Activity_Level_num to stat_95
dtypes: bool(40), float64(143), int64(2)
memory usage: 4.5 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Columns: 185 entries, BIA-BIA_Activity_Level_num to stat_95
dtypes: bool(35), float64(148), int64(2)
memory usage: 24.2 KB
None
Index([], dtype='object')


# Random Forest Model Predictions


### Evaluation
Summarize the metrics used for evaluating Random Forest predictions.

In [16]:



from sklearn.model_selection import train_test_split



from sklearn.preprocessing import MinMaxScaler



from sklearn.ensemble import RandomForestClassifier



from sklearn.metrics import accuracy_score, classification_report




X = train_data.drop(columns=["sii"])

y = train_data["sii"]



# Scaling the training data.




X = train_data.drop(columns=['sii'])  



y = train_data['sii']  


scaler = MinMaxScaler()



X = scaler.fit_transform(X)




# Splitting the training and testing data

X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.1, random_state=42

)

# Initiating the random forrest model

RFC = RandomForestClassifier(n_estimators=100, random_state=42)



# Fitting the model

RFC.fit(X_train, y_train)

======



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)



#Initiating the random forrest model 



RFC = RandomForestClassifier(n_estimators=100, random_state=42)



#Fit the model




RFC.fit(X_train, y_train)




# Predicting test set results





y_pred_test = RFC.predict(X_test)



y_pred_train = RFC.predict(X_train)




print(

    "Testing data: Model accuracy score with 100 decision-trees : {0:0.4f}".format(

        accuracy_score(y_test, y_pred_test) * 100

    )

)

print(

    "Training data: Model accuracy score with 100 decision-trees : {0:0.4f}".format(

        accuracy_score(y_train, y_pred_train) * 100

    )

)


print('Testing data: Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)*100))



print('Training data: Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)*100))


Testing data: Model accuracy score with 100 decision-trees : 71.9697
Training data: Model accuracy score with 100 decision-trees : 100.0000


# Logistic Regression Model Predictions


### Comparison
Discuss how Logistic Regression compares to Random Forest in terms of performance.

In [17]:
from sklearn.linear_model import LogisticRegression




X = train_data.drop(columns=["sii"])

y = train_data["sii"]






X = train_data.drop(columns=['sii'])  



y = train_data['sii']  








scaler = MinMaxScaler()



X = scaler.fit_transform(X)




X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.1, random_state=42

)






X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)



LR = LogisticRegression()


LR.fit(X_train, y_train)


y_pred_test = LR.predict(X_test)



y_pred_train = RFC.predict(X_train)




print(

    "Testing data accuracy: {0:0.4f}".format(accuracy_score(y_test, y_pred_test) * 100)

)

print(

    "Training data accuracy: {0:0.4f}".format(

        accuracy_score(y_train, y_pred_train) * 100

    )

)






print('Testing data accuracy: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)*100))



print('Training data accuracy: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)*100))


Testing data accuracy: 72.4747
Training data accuracy: 100.0000


# Random Forest Predictions On Test Set


In [18]:

# X = test_data

# X = scaler.fit_transform(X)

# y_pred = RFC.predict(X)

# Predict on test data



# Creating submission file

# submission = pd.DataFrame({

#   'id': ids,

#   'sii': y_pred.astype(int)

# })

# print(submission)


#X = test_data  



#X = scaler.fit_transform(X)



#y_pred = RFC.predict(X)



# Predict on test data



#submission = pd.DataFrame({



 #   'id': ids,  



 #   'sii': y_pred.astype(int) 



#})


# Logistic Regression Predictions On Test Set


In [19]:

# Predicting on test data

X = test_data




X = test_data  




X = scaler.fit_transform(X)



y_pred = LR.predict(X)




submission = pd.DataFrame({"id": ids, "sii": y_pred.astype(int)})

submission.to_csv("submission.csv", index=False)



submission = pd.DataFrame({



    'id': ids,  



    'sii': y_pred.astype(int) 



})

submission.to_csv('submission.csv', index=False)



print(submission)

          id  sii
0   00008ff9    0
1   000fd460    0
2   00105258    2
3   00115b9f    0
4   0016bb22    0
5   001f3379    0
6   0038ba98    2
7   0068a485    0
8   0069fbed    0
9   0083e397    0
10  0087dd65    0
11  00abe655    2
12  00ae59c9    3
13  00af6387    0
14  00bd4359    0
15  00c0cd71    1
16  00d56d4b    0
17  00d9913d    0
18  00e6167c    0
19  00ebc35d    0


<span style="color:red"><b><<<<<<< local</b></span>

<span style="color:red"><b>=======</b></span>

<span style="color:red"><b>>>>>>>> remote</b></span>