In [1]:
import os
import tarfile
import zipfile

DOWNLOAD_ROOT = "/home/gcullen/Downloads/young-people-survey"
PATH_TO_ZIP = "/home/gcullen/Downloads/young-people-survey.zip"

def fetch_child_data(download_path=DOWNLOAD_ROOT, path_to_zip_file=PATH_TO_ZIP): 
    os.makedirs(download_path, exist_ok=True)
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(download_path)
    

In [2]:
fetch_child_data()

import pandas as pd
import numpy as np

def load_child_data(path=DOWNLOAD_ROOT):
    csv_path = os.path.join(path, "responses.csv")
    return pd.read_csv(csv_path)


In [3]:
data = load_child_data()

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Columns: 150 entries, Music to House - block of flats
dtypes: float64(134), int64(5), object(11)
memory usage: 1.2+ MB


In [4]:
data.describe()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Shopping centres,Branded clothing,Entertainment spending,Spending on looks,Spending on gadgets,Spending on healthy eating,Age,Height,Weight,Number of siblings
count,1007.0,1008.0,1006.0,1005.0,1005.0,1003.0,1008.0,1007.0,1004.0,1007.0,...,1008.0,1008.0,1007.0,1007.0,1010.0,1008.0,1003.0,990.0,990.0,1004.0
mean,4.731877,3.328373,3.11332,2.288557,2.123383,2.956132,2.761905,3.471698,3.761952,2.36147,...,3.234127,3.050595,3.201589,3.106256,2.870297,3.55754,20.433699,173.514141,66.405051,1.297809
std,0.664049,0.833931,1.170568,1.138916,1.076136,1.25257,1.260845,1.1614,1.184861,1.372995,...,1.323062,1.306321,1.188947,1.205368,1.28497,1.09375,2.82884,10.024505,13.839561,1.013348
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,15.0,62.0,41.0,0.0
25%,5.0,3.0,2.0,1.0,1.0,2.0,2.0,3.0,3.0,1.0,...,2.0,2.0,2.0,2.0,2.0,3.0,19.0,167.0,55.0,1.0
50%,5.0,3.0,3.0,2.0,2.0,3.0,3.0,4.0,4.0,2.0,...,3.0,3.0,3.0,3.0,3.0,4.0,20.0,173.0,64.0,1.0
75%,5.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,5.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,22.0,180.0,75.0,2.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,30.0,203.0,165.0,10.0


In [5]:
corr_matrix = data.corr()
corr_matrix['Height'].sort_values(ascending=False)

Height            1.000000
Weight            0.697696
PC                0.357822
Cars              0.336885
Action            0.298112
                    ...   
Reading          -0.226990
Theatre          -0.254357
Shopping         -0.254610
Romantic         -0.299142
Life struggles   -0.374265
Name: Height, Length: 139, dtype: float64

In [6]:
data_adj = data[['Shopping','Pop','PC','Dancing','Sci-fi','Life struggles','Gender','Romantic','Theatre']].copy()
dataDrop = data_adj.dropna(subset=['Shopping','Pop','PC','Dancing','Sci-fi','Life struggles','Gender','Romantic','Theatre'])
dataDrop

Unnamed: 0,Shopping,Pop,PC,Dancing,Sci-fi,Life struggles,Gender,Romantic,Theatre
0,4.0,5.0,3.0,3.0,4.0,1.0,female,4.0,2.0
1,3.0,3.0,4.0,1.0,4.0,1.0,female,3.0,2.0
2,4.0,3.0,2.0,5.0,4.0,4.0,female,2.0,5.0
3,4.0,2.0,1.0,1.0,4.0,3.0,female,3.0,1.0
4,3.0,5.0,2.0,1.0,3.0,2.0,female,2.0,2.0
...,...,...,...,...,...,...,...,...,...
1005,5.0,4.0,3.0,2.0,4.0,4.0,female,5.0,5.0
1006,2.0,4.0,5.0,5.0,5.0,1.0,male,1.0,1.0
1007,2.0,3.0,4.0,1.0,2.0,5.0,female,5.0,2.0
1008,5.0,3.0,4.0,4.0,1.0,5.0,female,3.0,5.0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
tr_data, te_data = train_test_split(dataDrop, test_size=0.2, random_state=58)

trainLabels = tr_data[['Gender']].copy()
testLabels = te_data[['Gender']].copy()

tdata = tr_data.drop('Gender', axis=1)
tedata = te_data.drop('Gender', axis=1)

In [8]:
def convert_to_onehot(x):
    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(x)
    return enc.transform(x).toarray()
    

In [9]:
oneHotTrainLabels = convert_to_onehot(trainLabels)
oneHotTestLabels = convert_to_onehot(testLabels)

In [10]:
def convert_to_numpy(arr):
    a = arr.to_numpy()
    return a

train_numpy = convert_to_numpy(tdata)
test_numpy = convert_to_numpy(tedata)

In [11]:
def convert_data(x):
    x = x.reshape(1,8)
    return x

In [12]:
def return_gender(x):
    for val in x:
        if val[0] == 1:
            return 'Female'
        else:
            return 'Male'

In [15]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_numpy, oneHotTrainLabels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
some_data = tedata.iloc[:5]
some_labels = testLabels.iloc[:5]
some_data_prepared = some_data.to_numpy()

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [[ 0.80548691  0.19451309]
 [ 1.36394872 -0.36394872]
 [ 0.26328466  0.73671534]
 [-0.12776751  1.12776751]
 [ 0.51767414  0.48232586]]


In [17]:
one_some_labels = convert_to_onehot(some_labels)
print("Labels:", one_some_labels)

Labels: [[1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [18]:
from sklearn.metrics import mean_squared_error

findMSE = test_numpy

predictions = lin_reg.predict(findMSE)
lin_mse = mean_squared_error(oneHotTestLabels, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.30147954587104947

In [19]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_numpy, oneHotTrainLabels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')

In [20]:
some_data = tedata.iloc[:5]
some_labels = testLabels.iloc[:5]
some_data_prepared = convert_to_numpy(some_data)

print("Predictions:", tree_reg.predict(some_data_prepared))

Predictions: [[1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [21]:
print("labels:", some_labels)

labels:      Gender
625  female
9    female
81     male
393    male
327  female


In [22]:
from sklearn.metrics import mean_squared_error

findMSE = test_numpy

predictions = lin_reg.predict(findMSE)
lin_mse = mean_squared_error(oneHotTestLabels, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.30147954587104947

In [23]:
from numpy import array

'Shopping','Pop','PC','Dancing','Sci-fi','Life struggles','Romantic','Theatre'

my_vals = array([5,5,1,5,1,2,3,2])

x = convert_data(my_vals)

In [24]:
gender = tree_reg.predict(x)

return_gender(gender)

'Female'

In [41]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'max_features': [2, 4, 6, 8]},
    {'max_features': [2, 3, 4]},
    {'max_depth': [10]},
    {'max_leaf_nodes': [600]},
]
tree_reg = DecisionTreeRegressor(random_state=48)

grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

#grid_search.fit(train_numpy, oneHotTrainLabels)

In [42]:
grid_search.fit(train_numpy, oneHotTrainLabels)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=48, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8]},
                         {'max_features': [2, 3, 4]}, {'max_depth': [10]},
                         {'max_leaf_nodes': [600]}],
             pre_

In [43]:
grid_search.best_estimator_

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=48, splitter='best')

In [45]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.5370861555295746 {'max_features': 2}
0.5244044240850758 {'max_features': 4}
0.49516896883014355 {'max_features': 6}
0.5231806081249082 {'max_features': 8}
0.5370861555295746 {'max_features': 2}
0.49516896883014355 {'max_features': 3}
0.5244044240850758 {'max_features': 4}
0.49399254688184646 {'max_depth': 10}
0.5117215789459874 {'max_leaf_nodes': 600}
