In [56]:
# import libraries
import numpy as np 
import pandas  as pd
import matplotlib.pyplot as plt

import seaborn as sns

plt.style.use('classic')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif

In [57]:
# read data
Train_df = pd.read_csv("Train.csv")
Test_df = pd.read_csv("Test (1).csv")
sup = pd.read_csv("SampleSubmission.csv")
Train_df.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.0,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.0,2021-10-13,0.0,,2nd year in programme,114.0,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998


In [58]:
# check the shape of the data
print('Train data shape: ', Train_df.shape)

Train data shape:  (8585, 679)


In [59]:
Train_df.isnull().sum()

child_id                   0
data_year                  0
child_date              1821
child_age                  0
child_enrolment_date    5964
                        ... 
obs_heating_4           6324
obs_heating_5           6324
obs_heating_6           6324
obs_heating_7           6324
target                     0
Length: 679, dtype: int64

In [60]:
# check the null values percentage
Train_df_null = Train_df.isnull().sum()/Train_df.shape[0]*100
Train_df_null

child_id                 0.000000
data_year                0.000000
child_date              21.211415
child_age                0.000000
child_enrolment_date    69.470006
                          ...    
obs_heating_4           73.663366
obs_heating_5           73.663366
obs_heating_6           73.663366
obs_heating_7           73.663366
target                   0.000000
Length: 679, dtype: float64

In [61]:
# Below code gives list of columns having more than 60% null
col_to_drop = Train_df_null[Train_df_null>40].keys()

Train_df = Train_df.drop(col_to_drop, axis=1)

In [62]:
Train_df.shape

(8585, 141)

In [63]:
# label encoding the categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in Train_df.columns:
    if Train_df[col].dtype == 'object':
        Train_df[col] = le.fit_transform(Train_df[col])

In [64]:
#fill the missing 
Train_df.fillna(Train_df.mean(), inplace=True)

In [65]:
Train_df.isnull().sum()

child_id       0
data_year      0
child_date     0
child_age      0
child_grant    0
              ..
id_mn_n        0
id_dc_n        0
id_prov_n      0
ses_cat        0
target         0
Length: 141, dtype: int64

In [66]:
Train_df.corr()['target'].sort_values(ascending=False).head(20)

target                   1.000000
child_observe_total      0.428263
child_age                0.424989
child_age_group          0.367114
pri_holidays             0.287287
pra_groupings_2          0.267436
child_height             0.266732
pri_language_9           0.263660
pri_transport            0.263509
pri_language_3           0.262635
pri_mobile               0.262334
pri_funding_donations    0.261828
pri_funding_97           0.260761
pri_language_2           0.258409
pri_language_11          0.257758
pra_free_play            0.257439
pri_aftercare            0.257294
pri_school               0.257110
pri_toys                 0.256882
pri_language_1           0.256521
Name: target, dtype: float64

In [67]:
col1 = ["child_id"]

col2 = [
        "child_observe_total",
        "child_age",
        "child_age_group",
        "pri_holidays",
        "pra_groupings_2",
        "child_height",
        "pri_language_9",
        "pri_language_3",
        "pri_mobile",
        "pri_funding_donations",
        "pri_funding_97",
        "pri_language_2",
        "pri_language_11",
        "pra_free_play",
        "pri_aftercare",
        "pri_school",
        "pri_toys",
        "pri_language_1"
]

target = ["target"]
feat = col1 + col2
Train_df = Train_df[feat + target]
Test_df = Test_df[feat]

In [68]:
Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8585 entries, 0 to 8584
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   child_id               8585 non-null   int32  
 1   child_observe_total    8585 non-null   float64
 2   child_age              8585 non-null   float64
 3   child_age_group        8585 non-null   int32  
 4   pri_holidays           8585 non-null   int32  
 5   pra_groupings_2        8585 non-null   int32  
 6   child_height           8585 non-null   float64
 7   pri_language_9         8585 non-null   int32  
 8   pri_language_3         8585 non-null   int32  
 9   pri_mobile             8585 non-null   int32  
 10  pri_funding_donations  8585 non-null   int32  
 11  pri_funding_97         8585 non-null   int32  
 12  pri_language_2         8585 non-null   int32  
 13  pri_language_11        8585 non-null   int32  
 14  pra_free_play          8585 non-null   int32  
 15  pri_

In [69]:
y = Train_df['target']

In [70]:
x = Train_df.drop('target',axis=1)

In [71]:
x.head()

Unnamed: 0,child_id,child_observe_total,child_age,child_age_group,pri_holidays,pra_groupings_2,child_height,pri_language_9,pri_language_3,pri_mobile,pri_funding_donations,pri_funding_97,pri_language_2,pri_language_11,pra_free_play,pri_aftercare,pri_school,pri_toys,pri_language_1
0,6911,4.0,59.0,0,2,2,106.949312,2,2,2,2,2,2,2,5,2,2,1,2
1,4546,4.0,60.163933,1,2,2,103.0,2,2,2,2,2,2,2,5,2,2,1,2
2,6402,7.0,69.0,1,2,2,108.400002,2,2,2,2,2,2,2,5,2,2,1,1
3,2835,9.0,53.0,0,1,0,98.099998,0,0,0,0,0,1,0,3,1,0,0,0
4,4512,12.0,57.0,0,0,1,114.0,0,0,0,0,0,0,0,0,0,0,0,0


In [72]:
x = x.drop('child_id',axis=1)

In [73]:
x.head()

Unnamed: 0,child_observe_total,child_age,child_age_group,pri_holidays,pra_groupings_2,child_height,pri_language_9,pri_language_3,pri_mobile,pri_funding_donations,pri_funding_97,pri_language_2,pri_language_11,pra_free_play,pri_aftercare,pri_school,pri_toys,pri_language_1
0,4.0,59.0,0,2,2,106.949312,2,2,2,2,2,2,2,5,2,2,1,2
1,4.0,60.163933,1,2,2,103.0,2,2,2,2,2,2,2,5,2,2,1,2
2,7.0,69.0,1,2,2,108.400002,2,2,2,2,2,2,2,5,2,2,1,1
3,9.0,53.0,0,1,0,98.099998,0,0,0,0,0,1,0,3,1,0,0,0
4,12.0,57.0,0,0,1,114.0,0,0,0,0,0,0,0,0,0,0,0,0


In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)

In [75]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [76]:
lm.fit(X_train,y_train)

LinearRegression()

In [77]:
# The coefficients
print('Coefficients: \n', lm.coef_)

Coefficients: 
 [ 1.89774646  0.81325764  0.94444489  3.240761    1.06656457  0.31338666
  2.52845723  1.18793839  1.07856105  1.78456773 -1.41327627  0.30156851
 -2.58984916  0.3884164  -0.45216616 -0.54706338 -8.72405898 -0.65543159]


In [78]:
predictions = lm.predict( X_test)

In [30]:
#plt.scatter(y_test,predictions)
#plt.xlabel('Y Test')
#plt.ylabel('Predicted Y')

In [79]:
# calculate these metrics
from sklearn import metrics
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

RMSE: 12.22444392404074


In [39]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.1.1-cp39-none-win_amd64.whl (74.0 MB)
     ---------------------------------------- 74.0/74.0 MB 3.1 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     -------------------------------------- 47.0/47.0 kB 112.2 kB/s eta 0:00:00
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1


In [80]:
from catboost import CatBoostRegressor

In [81]:
cb_model = CatBoostRegressor(iterations=30000,
                             learning_rate=0.045,
                             depth=8,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=300)
cb_model.fit(x, y,
             use_best_model=True,
             verbose=50)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 14.9648608	total: 435ms	remaining: 3h 37m 40s
50:	learn: 11.9423832	total: 951ms	remaining: 9m 18s
100:	learn: 11.6265568	total: 1.39s	remaining: 6m 51s
150:	learn: 11.4865590	total: 1.8s	remaining: 5m 55s
200:	learn: 11.3619138	total: 2.21s	remaining: 5m 28s
250:	learn: 11.2221040	total: 2.65s	remaining: 5m 13s
300:	learn: 11.0953228	total: 3.08s	remaining: 5m 3s
350:	learn: 10.9696132	total: 3.51s	remaining: 4m 56s
400:	learn: 10.8577828	total: 3.99s	remaining: 4m 54s
450:	learn: 10.7590138	total: 4.43s	remaining: 4m 50s
500:	learn: 10.6674848	total: 4.86s	remaining: 4m 46s
550:	learn: 10.5769856	total: 5.29s	remaining: 4m 42s
600:	learn: 10.4950299	total: 5.72s	remaining: 4m 39s
650:	learn: 10.4116155	total: 6.15s	remaining: 4m 37s
700:	learn: 10.3379832	total: 6.58s	remaining: 4m 35s
750:	learn: 10.2625356	total: 7.02s	remaining: 4m 33s
800:	learn: 10.1943811	total: 7.45s	remaining: 4m 31s
850:	learn: 10.1110963	total: 7.88s	remaining: 4m 29s
900:	learn: 10.0494981	total:

7650:	learn: 6.5119311	total: 1m 6s	remaining: 3m 14s
7700:	learn: 6.5010249	total: 1m 7s	remaining: 3m 14s
7750:	learn: 6.4895312	total: 1m 7s	remaining: 3m 13s
7800:	learn: 6.4789748	total: 1m 7s	remaining: 3m 13s
7850:	learn: 6.4674004	total: 1m 8s	remaining: 3m 13s
7900:	learn: 6.4554261	total: 1m 8s	remaining: 3m 12s
7950:	learn: 6.4438706	total: 1m 9s	remaining: 3m 12s
8000:	learn: 6.4324405	total: 1m 9s	remaining: 3m 12s
8050:	learn: 6.4220834	total: 1m 10s	remaining: 3m 11s
8100:	learn: 6.4096150	total: 1m 10s	remaining: 3m 11s
8150:	learn: 6.3987849	total: 1m 11s	remaining: 3m 10s
8200:	learn: 6.3871727	total: 1m 11s	remaining: 3m 10s
8250:	learn: 6.3766033	total: 1m 11s	remaining: 3m 9s
8300:	learn: 6.3643902	total: 1m 12s	remaining: 3m 9s
8350:	learn: 6.3519159	total: 1m 12s	remaining: 3m 8s
8400:	learn: 6.3396277	total: 1m 13s	remaining: 3m 8s
8450:	learn: 6.3282610	total: 1m 13s	remaining: 3m 7s
8500:	learn: 6.3180745	total: 1m 14s	remaining: 3m 7s
8550:	learn: 6.3048260	t

15100:	learn: 5.2124267	total: 2m 12s	remaining: 2m 11s
15150:	learn: 5.2063596	total: 2m 13s	remaining: 2m 10s
15200:	learn: 5.2008521	total: 2m 13s	remaining: 2m 10s
15250:	learn: 5.1955700	total: 2m 14s	remaining: 2m 9s
15300:	learn: 5.1896237	total: 2m 14s	remaining: 2m 9s
15350:	learn: 5.1837056	total: 2m 15s	remaining: 2m 8s
15400:	learn: 5.1780807	total: 2m 15s	remaining: 2m 8s
15450:	learn: 5.1726083	total: 2m 15s	remaining: 2m 8s
15500:	learn: 5.1674529	total: 2m 16s	remaining: 2m 7s
15550:	learn: 5.1628017	total: 2m 16s	remaining: 2m 7s
15600:	learn: 5.1583849	total: 2m 17s	remaining: 2m 6s
15650:	learn: 5.1527826	total: 2m 17s	remaining: 2m 6s
15700:	learn: 5.1474251	total: 2m 18s	remaining: 2m 5s
15750:	learn: 5.1416801	total: 2m 18s	remaining: 2m 5s
15800:	learn: 5.1352720	total: 2m 18s	remaining: 2m 4s
15850:	learn: 5.1304779	total: 2m 19s	remaining: 2m 4s
15900:	learn: 5.1262466	total: 2m 19s	remaining: 2m 3s
15950:	learn: 5.1202703	total: 2m 20s	remaining: 2m 3s
16000:	

22500:	learn: 4.5542785	total: 3m 21s	remaining: 1m 7s
22550:	learn: 4.5506792	total: 3m 21s	remaining: 1m 6s
22600:	learn: 4.5468402	total: 3m 21s	remaining: 1m 6s
22650:	learn: 4.5430061	total: 3m 22s	remaining: 1m 5s
22700:	learn: 4.5389568	total: 3m 22s	remaining: 1m 5s
22750:	learn: 4.5347610	total: 3m 23s	remaining: 1m 4s
22800:	learn: 4.5319543	total: 3m 23s	remaining: 1m 4s
22850:	learn: 4.5282717	total: 3m 24s	remaining: 1m 3s
22900:	learn: 4.5255863	total: 3m 24s	remaining: 1m 3s
22950:	learn: 4.5218701	total: 3m 24s	remaining: 1m 2s
23000:	learn: 4.5183629	total: 3m 25s	remaining: 1m 2s
23050:	learn: 4.5153719	total: 3m 25s	remaining: 1m 2s
23100:	learn: 4.5125384	total: 3m 26s	remaining: 1m 1s
23150:	learn: 4.5095349	total: 3m 26s	remaining: 1m 1s
23200:	learn: 4.5063325	total: 3m 27s	remaining: 1m
23250:	learn: 4.5023966	total: 3m 27s	remaining: 1m
23300:	learn: 4.4993445	total: 3m 27s	remaining: 59.8s
23350:	learn: 4.4963063	total: 3m 28s	remaining: 59.3s
23400:	learn: 4.

<catboost.core.CatBoostRegressor at 0x2391e4be310>

In [82]:
Test_df.head()

Unnamed: 0,child_id,child_observe_total,child_age,child_age_group,pri_holidays,pra_groupings_2,child_height,pri_language_9,pri_language_3,pri_mobile,pri_funding_donations,pri_funding_97,pri_language_2,pri_language_11,pra_free_play,pri_aftercare,pri_school,pri_toys,pri_language_1
0,ID_0I0999N6S,11.0,57.0,50-59 months,No,Yes,108.0,No,No,Based at a specific location,No,No,Yes,No,30 minutes or less,No,No,No,No
1,ID_GQ6ONJ4FP,11.0,54.0,50-59 months,No,No,105.0,No,No,Based at a specific location,No,No,Yes,No,30 minutes or less,No,No,No,No
2,ID_YZ76CVRW3,8.0,57.0,50-59 months,Yes,No,101.5,No,No,Based at a specific location,No,No,Yes,No,Up to 2 hours,No,No,No,No
3,ID_BNINCRXH8,11.0,59.334702,50-59 months,Yes,No,,No,No,Based at a specific location,No,No,No,No,30 minutes or less,No,No,No,No
4,ID_1U7GDTLRI,9.0,54.0,50-59 months,Yes,Yes,103.5,No,No,Based at a specific location,No,No,No,No,30 minutes or less,No,No,No,Yes


In [28]:
!pip install eli5 -q

In [83]:
Train_df.head()

Unnamed: 0,child_id,child_observe_total,child_age,child_age_group,pri_holidays,pra_groupings_2,child_height,pri_language_9,pri_language_3,pri_mobile,pri_funding_donations,pri_funding_97,pri_language_2,pri_language_11,pra_free_play,pri_aftercare,pri_school,pri_toys,pri_language_1,target
0,6911,4.0,59.0,0,2,2,106.949312,2,2,2,2,2,2,2,5,2,2,1,2,51.5
1,4546,4.0,60.163933,1,2,2,103.0,2,2,2,2,2,2,2,5,2,2,1,2,55.869999
2,6402,7.0,69.0,1,2,2,108.400002,2,2,2,2,2,2,2,5,2,2,1,1,47.52
3,2835,9.0,53.0,0,1,0,98.099998,0,0,0,0,0,1,0,3,1,0,0,0,58.599998
4,4512,12.0,57.0,0,0,1,114.0,0,0,0,0,0,0,0,0,0,0,0,0,76.599998


In [84]:
import eli5

In [85]:
Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   child_id               3680 non-null   object 
 1   child_observe_total    3410 non-null   float64
 2   child_age              3680 non-null   float64
 3   child_age_group        3680 non-null   object 
 4   pri_holidays           2215 non-null   object 
 5   pra_groupings_2        2217 non-null   object 
 6   child_height           3019 non-null   float64
 7   pri_language_9         2217 non-null   object 
 8   pri_language_3         2217 non-null   object 
 9   pri_mobile             2225 non-null   object 
 10  pri_funding_donations  2216 non-null   object 
 11  pri_funding_97         2216 non-null   object 
 12  pri_language_2         2217 non-null   object 
 13  pri_language_11        2217 non-null   object 
 14  pra_free_play          2217 non-null   object 
 15  pri_

In [86]:
Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8585 entries, 0 to 8584
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   child_id               8585 non-null   int32  
 1   child_observe_total    8585 non-null   float64
 2   child_age              8585 non-null   float64
 3   child_age_group        8585 non-null   int32  
 4   pri_holidays           8585 non-null   int32  
 5   pra_groupings_2        8585 non-null   int32  
 6   child_height           8585 non-null   float64
 7   pri_language_9         8585 non-null   int32  
 8   pri_language_3         8585 non-null   int32  
 9   pri_mobile             8585 non-null   int32  
 10  pri_funding_donations  8585 non-null   int32  
 11  pri_funding_97         8585 non-null   int32  
 12  pri_language_2         8585 non-null   int32  
 13  pri_language_11        8585 non-null   int32  
 14  pra_free_play          8585 non-null   int32  
 15  pri_

In [87]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in Test_df.columns:
    if Test_df[col].dtype == 'object':
        Test_df[col] = le.fit_transform(Test_df[col])

In [88]:
Test_df.head()

Unnamed: 0,child_id,child_observe_total,child_age,child_age_group,pri_holidays,pra_groupings_2,child_height,pri_language_9,pri_language_3,pri_mobile,pri_funding_donations,pri_funding_97,pri_language_2,pri_language_11,pra_free_play,pri_aftercare,pri_school,pri_toys,pri_language_1
0,47,11.0,57.0,0,0,1,108.0,0,0,0,0,0,1,0,0,0,0,0,0
1,1703,11.0,54.0,0,0,0,105.0,0,0,0,0,0,1,0,0,0,0,0,0
2,3564,8.0,57.0,0,1,0,101.5,0,0,0,0,0,1,0,4,0,0,0,0
3,1197,11.0,59.334702,0,1,0,,0,0,0,0,0,0,0,0,0,0,0,0
4,180,9.0,54.0,0,1,1,103.5,0,0,0,0,0,0,0,0,0,0,0,1


In [89]:
Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   child_id               3680 non-null   int32  
 1   child_observe_total    3410 non-null   float64
 2   child_age              3680 non-null   float64
 3   child_age_group        3680 non-null   int32  
 4   pri_holidays           3680 non-null   int32  
 5   pra_groupings_2        3680 non-null   int32  
 6   child_height           3019 non-null   float64
 7   pri_language_9         3680 non-null   int32  
 8   pri_language_3         3680 non-null   int32  
 9   pri_mobile             3680 non-null   int32  
 10  pri_funding_donations  3680 non-null   int32  
 11  pri_funding_97         3680 non-null   int32  
 12  pri_language_2         3680 non-null   int32  
 13  pri_language_11        3680 non-null   int32  
 14  pra_free_play          3680 non-null   int32  
 15  pri_

In [90]:
pred = (cb_model.predict(X_test))

In [91]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print(f'RMSE Score: {mean_squared_error(y_test, pred, squared=False)}')
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

RMSE Score: 4.430203394780903
RMSE: 4.430203394780903


In [92]:
sup.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


In [45]:
# Select a single/specific prediction
sample = X_train.iloc[[1]]
sample

Unnamed: 0,child_observe_total,child_age,child_age_group,pri_holidays,pra_groupings_2,child_height,pri_language_9,pri_language_3,pri_mobile,pri_funding_donations,pri_funding_97,pri_language_2,pri_language_11,pra_free_play,pri_aftercare
5310,6.0,66.0,1,2,2,106.949312,2,2,2,2,2,2,2,5,2


In [103]:
col2 = [
        "child_observe_total",
        "child_age",
        "child_age_group",
        "pri_holidays",
        "pra_groupings_2",
        "child_height",
        "pri_language_9",
        "pri_language_3",
        "pri_mobile",
        "pri_funding_donations",
        "pri_funding_97",
        "pri_language_2",
        "pri_language_11",
        "pra_free_play",
        "pri_aftercare"
]

In [104]:
col2 = np.array(col2)

In [105]:
reshaped_col2 = col2.reshape((1, 15))

In [109]:
# Submission file preparation
predictors = pd.DataFrame(reshaped_col2, columns = ['feature_' + str(i) for i in range(1,16)])

In [108]:
len(predictors)

1

In [110]:
len(pred)

2576