# How to apply predictions to the entire data set

In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(123)
n = 1000

df = pd.DataFrame({
    'id': np.arange(n) + 10001,
    'x1': np.random.uniform(size=n),
    'x2': np.random.uniform(size=n),
    'target': np.random.choice([1, 0], n)
})
df

Unnamed: 0,id,x1,x2,target
0,10001,0.696469,0.081894,0
1,10002,0.286139,0.495040,1
2,10003,0.226851,0.288890,0
3,10004,0.551315,0.639992,0
4,10005,0.719469,0.499936,0
...,...,...,...,...
995,10996,0.264397,0.193012,1
996,10997,0.690915,0.731650,1
997,10998,0.347146,0.274711,0
998,10999,0.004168,0.721818,1


In [9]:
train_and_val, test = train_test_split(df, test_size=.15, random_state=123)
train, validate = train_test_split(train_and_val, test_size=.2, random_state=123)

I have seperate train, validate, and test datasets and I want to:

1. Find our best model
1. Use the best model to make predictions for the entire dataset 

After some work and iteration (omitted in this notebook), our best model is a Decision tree w/ max_depth = 2

In [21]:
# this cell creates the best model
# maybe your x-y split looks different here, that's okay
X_train = train[['x1', 'x2']]
y_train = train.target

clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

`clf` is the variable that holds our best model

next we apply the model to the entire dataframe

In [22]:
X = df[['x1', 'x2']]

# Caveat: X needs to have the same columns as the X_train that was used to fit the model

# model.predict(X) gives us predictions for the entire dataframe
df['prediction'] = clf.predict(X)
df

Unnamed: 0,id,x1,x2,target,prediction
0,10001,0.696469,0.081894,0,1
1,10002,0.286139,0.495040,1,0
2,10003,0.226851,0.288890,0,0
3,10004,0.551315,0.639992,0,0
4,10005,0.719469,0.499936,0,0
...,...,...,...,...,...
995,10996,0.264397,0.193012,1,1
996,10997,0.690915,0.731650,1,0
997,10998,0.347146,0.274711,0,0
998,10999,0.004168,0.721818,1,0


Create a `predictions.csv` file with just an id and prediction column. This file will be created in the same directory as this notebook.

In [20]:
df[['id', 'prediction']].to_csv('predictions.csv')