In [None]:
#Import Data from an external file
#NEEDS: <FILE_STRING>, a string filename (or full filepath if the file is not saved in the same folder as this notebook)
    #Your file should contain your data in CSV format
    #By default read_csv uses the first row of the file as the column names
        #If your file has no header row, pass header=None so the first data row is not consumed as names
    #This approach assumes your features and your outcomes are in the same file
#RESULTS: <YOUR_DATASET>, a DataFrame variable containing your imported information
    #See DataFrame documentation for details: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

from pandas import read_csv
<YOUR_DATASET> = read_csv(<FILE_STRING>)

In [None]:
#Import Data from a Python (SKLearn) Dataset
#NEEDS: <DATASET_FUNCTION>, the name of the SKLearn loader function for the dataset you need
    #See the datasets in the SKLearn documentation 7.1 to 7.4: https://scikit-learn.org/stable/datasets.html
    #This approach assumes you want your output as a Pandas DataFrame
    #NOTE: not every loader accepts as_frame=True - check the documentation of the function you pick
#RESULTS: <YOUR_DATASET>_Bunch, the Bunch object returned by the loader (kept so you can still read its other attributes)
#RESULTS: <YOUR_DATASET>, a DataFrame variable containing your imported information
    #This is the Bunch's .frame attribute, which holds the features together with the target column(s)
    #See DataFrame documentation for details: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

import sklearn.datasets
<YOUR_DATASET>_Bunch = sklearn.datasets.<DATASET_FUNCTION>(as_frame=True)
<YOUR_DATASET> = <YOUR_DATASET>_Bunch.frame

In [None]:
#Normalize numeric features in a dataset
#NEEDS: <YOUR_DATASET>, a DataFrame variable containing your feature information
    #This approach assumes all features are numeric
    #This approach assumes your final column (column -1) is your Outcome variable, and is ignored for Normalization
        #This choice is most appropriate for Classification
        #You may want to adjust this approach for regression (but remember to reverse it upon interpretation)
#RESULTS: This functionality performs the adjustments within the DataFrame.
    

<YOUR_DATASET>.iloc[:,0:-1] = <YOUR_DATASET>.iloc[:,0:-1].apply(lambda x: (x-x.mean())/ x.std(), axis=0)

In [None]:
#An alternate way to normalize features in your dataset
    #This method is useful if you want to normalize new values to a previously set mean and Std. Dev.
#NEEDS: A dataframe with data to normalize
    #This approach assumes all numeric features can be normalized
#RESULTS: This functionality performs the adjustments within the DataFrame.
#RESULTS: myNormalizer - a new Object you can reuse to perform Normalization to the same Mean and Std. Dev.
#RESULTS: <YOUR_DATASET>_dum_cats - a list of the one-hot dummy columns for your data, to be used for later One-Hot Encoding
  
import sklearn.preprocessing
    
<YOUR_DATASET>_num = <YOUR_DATASET>.select_dtypes(include=[numpy.number]) #Pare down your variables to only the numeric ones

#Phase 1: Initialize and fit a Normalizer
    #When complete, you can later use your Normalizer to transform other Datasets by repeating Phase 2
myNormalizer = sklearn.preprocessing.Normalizer().fit(<YOUR_DATASET>_num.to_numpy())

#Phase 2: Transform your numeric data and replace it in your original dataframe
<YOUR_DATASET>_num[:] = myNormalizer.transform(<YOUR_DATASET>_num.to_numpy())
<YOUR_DATASET>[<YOUR_DATASET>_num.columns] = <YOUR_DATASET>_num

In [None]:
#Split your dataset into Train & Test sets, set up separate variables for Features & Outcomes
#NEEDS: <YOUR_DATASET>, a DataFrame variable containing your feature information
#NEEDS: <OUTCOME_NAME> a string column name (or int column number) for where your Outcome variable lives
    #This approach assumes your Outcome variable is in the DataFrame
#RESULTS: train_y, train_X, test_y, test_X, four DataFrames containing portions of your Dataset
    #Those with _X contain features (i.e. everything except the Outcome variable)
    #Those with _y contain only the Outcome variable
    #Those with train_ are intended for training models
    #Those with test_ are intended for testing models

import sklearn.model_selection
    
train, test = sklearn.model_selection.train_test_split(<YOUR_DATASET>)

train_y = train[<OUTCOME_NAME>]
train_X = train.drop(<OUTCOME_NAME>,1)
test_y = test[<OUTCOME_NAME>]
test_X = test.drop(<OUTCOME_NAME>,1)

In [None]:
#Encode categorical data into an ordinal numeric variable
#NEEDS: <YOUR_DATASET>, a DataFrame variable containing your feature information
#NEEDS: train_X, test_X, DataFrames containing portions of your Dataset
    #You could do your ordinal encoding before you do a train-test split if you like
#RESULTS: train_X_ordinal, test_X_ordinal, DataFrames containing portions of your Dataset
    #You could just save into the original variables if you don't otherwise plan to use them

ordinal_encoder = sklearn.preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
ordinal_encoder.fit(<YOUR_DATASET>.drop(<OUTCOME_NAME>,1))
train_X_ordinal = ordinal_encoder.transform(train_X)
test_X_ordinal = ordinal_encoder.transform(test_X)

In [None]:
#Encode categorical data into a series of one-hot numeric variables (dummy variables)
#NEEDS: <YOUR_DATASET>, a DataFrame variable containing your feature information
#ASSUMES: All categorical variables are of type 'object' in your DataFrame
    #This will naturally occur for strings and similar non-numeric data
#RESULTS: <YOUR_DATASET>_dummies DataFrame with all possible category options as dummy variables
    #You could just save into the original variable if you don't otherwise plan to use it

import pandas
<YOUR_DATASET>_dummies = pandas.get_dummies(<YOUR_DATASET>, prefix='', prefix_sep='', 
                            columns=<YOUR_DATASET>.select_dtypes(include=['object']).columns.tolist())

In [None]:
#An alternate way to create one-hot dummy variables
    #This is useful if you want to encode new values to the same one-hot dummy variables
        #even if they contain new, never before seen values for the original variable
#NEEDS: A dataframe to encode as One-Hot dummy variables
#RESULTS: This functionality performs the adjustments within the DataFrame.
#RESULTS: myOneHotEncoder - a new Object you can reuse to perform One-Hot Encoding to the same set of dummy variables
#RESULTS: <YOUR_DATASET>_dum_cats - a list of the one-hot dummy columns for your data, to be used for later One-Hot Encoding
    
import sklearn.preprocessing
    
<YOUR_DATASET>_cat = <YOUR_DATASET>.select_dtypes(include=[numpy.object]) #Isolate categorical variables if necessary
    
#Phase 1: Initialize and fit a OneHotEncoder
    #When complete, you can later use your myOneHotEncoder to transform other Datasets by repeating Phase 2

<YOUR_DATASET>_cat_cols = <YOUR_DATASET>_cat.columns
myOneHotEncoder = sklearn.preprocessing.OneHotEncoder(handle_unknown = "ignore", sparse = False)
myOneHotEncoder.fit(<YOUR_DATASET>_cat.to_numpy())

#Phase 2: Transform your categorical data and replace it in your original dataframe

<YOUR_DATASET>_cat_dum = myOneHotEncoder.transform(<YOUR_DATASET>_cat.to_numpy())
<YOUR_DATASET>_dum_cats = myOneHotEncoder.get_feature_names() #You can reuse this same variable for your categories later
<YOUR_DATASET>_cat_frame = pandas.DataFrame(babyData_train_cat_dum)
<YOUR_DATASET>[<YOUR_DATASET>_dum_cats] = <YOUR_DATASET>_cat_frame
<YOUR_DATASET> = <YOUR_DATASET>.drop(<YOUR_DATASET>_cat_cols,1) #You must remove the old columns before training a classifier

In [None]:
#Initialize, fit (train), and score (test) a classifier of your choice
#NEEDS: <IMPORT_FOR_CLASSIFIER>, the import statement for the SKLearn module that contains your model
#NEEDS: <YOUR_SKLEARN_MODEL_CONSTRUCTOR>, a particular model constructor call from SKLearn
    #The constructor will be within an SKLearn module - that is the module you need to import above
#NEEDS: train_y, train_X, test_y, test_X, the four variables containing portions of your Dataset
#RESULTS: classifier - the model object, trained on train_X / train_y
#RESULTS: Prints the model's score on the held-out test data
    #For SKLearn classifiers, score() reports the mean accuracy (fraction of test rows predicted correctly)

<IMPORT_FOR_CLASSIFIER>
classifier = <YOUR_SKLEARN_MODEL_CONSTRUCTOR>
classifier.fit(train_X, train_y)
print(classifier.score(test_X, test_y))