In [None]:
# Encode labels as categories
# NOTE: the frame is called sample_df everywhere else in this notebook, so the
# same name is used here (the original mixed sample_data and sample_df).
sample_df.label.head()
# astype (not "asdtype") is the pandas method that converts a column's dtype
sample_df['label'] = sample_df['label'].astype('category')
# Dummy variable encoding: one indicator column per category, named label_<category>
dummies = pd.get_dummies(sample_df[['label']], prefix_sep = '_')

In [2]:
# Lambda functions: a one-expression anonymous function bound to a name
square = lambda x: x * x
# Quick check of the lambda on a sample input
square(2)

4

In [None]:
# Converter: cast whatever it receives to the pandas 'category' dtype
categorize_label = lambda x: x.astype('category')
# Run the converter over the label column (axis=0 applies it column by column)
sample_df.label = sample_df[['label']].apply(categorize_label, axis=0)
# Summarize the frame so the resulting dtypes can be inspected
sample_df.info()

In [None]:
# computing log loss with Numpy 
# logloss.py
import numpy as np
def compute_log_loss(predicted, actual, eps = 1e-14):
    """Compute the mean binary log loss between predictions and true labels.

    Parameters
    ----------
    predicted : array-like
        Predicted probabilities.
    actual : array-like
        True labels (0 or 1), same shape as (or broadcastable to) ``predicted``.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` so that ``log(0)`` is
        never evaluated.

    Returns
    -------
    float
        ``-mean(actual * log(p) + (1 - actual) * log(1 - p))`` over all elements.
    """
    # Clip to the open interval (0, 1); the upper bound is 1 - eps
    # (the original "1 = eps" was a syntax error).
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)
    # Convert so plain Python lists work in the "1 - actual" arithmetic below
    actual = np.asarray(actual, dtype=float)
    loss = -1 * np.mean(actual * np.log(predicted)
                        + (1 - actual) * np.log(1 - predicted))
    return loss

In [None]:
# Case 1: predictions that are correct and confident
correct_confident_loss = compute_log_loss(correct_confident, actual_labels)
print("Log loss, correct and confident: {}".format(correct_confident_loss))

# Case 2: predictions that are correct but not confident
correct_not_confident_loss = compute_log_loss(correct_not_confident, actual_labels)
print("Log loss, correct and not confident: {}".format(correct_not_confident_loss))

# Case 3: predictions that are wrong and not confident
wrong_not_confident_loss = compute_log_loss(wrong_not_confident, actual_labels)
print("Log loss, wrong and not confident: {}".format(wrong_not_confident_loss))

# Case 4: predictions that are wrong and confident
wrong_confident_loss = compute_log_loss(wrong_confident, actual_labels)
print("Log loss, wrong and confident: {}".format(wrong_confident_loss))

# Reference: score the true labels against themselves
actual_labels_loss = compute_log_loss(actual_labels, actual_labels)
print("Log loss, actual labels: {}".format(actual_labels_loss))

### Splitting the multi-class datasets
* StratifiedShuffleSplit

In [None]:
# Splitting the data
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Features: numeric columns only, with missing values replaced by -1000
data_to_train = df[NUMERIC_COLUMNS].fillna(-1000)
# Targets: one indicator column per label value
labels_to_use = pd.get_dummies(df[LABELS])

# Hold out 20% of the rows using the multilabel-aware splitter (fixed seed)
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    data_to_train, labels_to_use, size=0.2, seed=123)

# One logistic regression per indicator column
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)

In [None]:
### Making predictions
# Reuse the classifier fitted in the previous cell. Re-instantiating
# OneVsRestClassifier here (as the original did) discards the fit, and
# predict_proba on an unfitted estimator raises NotFittedError.
hold_out = df[NUMERIC_COLUMNS].fillna(-1000)
predictions = clf.predict_proba(hold_out)
# Column names come from the dummy-encoded labels; prefix_sep belongs to
# get_dummies, not to the DataFrame constructor (the original parentheses were
# misplaced and did not parse).
prediction_df = pd.DataFrame(columns = pd.get_dummies(df[LABELS], prefix_sep = '__').columns,
                             index = hold_out.index,
                             data = predictions)
# score_submission is given this path and nothing else in the notebook writes
# it, so persist the predictions first.
prediction_df.to_csv('predictions.csv')
score = score_submission(pred_path = 'predictions.csv')

### A very brief introduction to NLP
* Tokenization: splitting a string into segments
    
### Bag of words representation
* Count the number of times a particular token appears 
* This approach discards information about word order

### n-grams

### Representing text numerically 
* Bag-of-words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Token = a run of non-whitespace characters that is followed by whitespace.
# Two backslashes in the literal give the regex \S+(?=\s+); the original four
# backslashes produced a pattern that looks for a literal backslash instead.
TOKEN_BASIC = '\\S+(?=\\s+)'
# Replace missing descriptions with empty strings before vectorizing. Assigning
# the result back avoids an in-place fillna on an attribute-accessed column,
# which is not guaranteed to modify df.
df['Program_Description'] = df['Program_Description'].fillna('')
vec_basic = CountVectorizer(token_pattern = TOKEN_BASIC)
vec_basic.fit(df.Program_Description)

### Pipeline, feature & text processing

In [None]:
# Build a pipeline whose only step is the classifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

pl = Pipeline([('clf', OneVsRestClassifier(LogisticRegression()))])

# Fit and score on the sample data, using only the 'numeric' column as a feature
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric']],
    pd.get_dummies(sample_df[['label']]),
    random_state=2)
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

# Adding more steps to the pipeline
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and also defaults to mean imputation. It is bound to the
# name Imputer so the cells below that call Imputer() keep working; the
# fallback covers installations that predate sklearn.impute.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
X_train, X_test, y_train, y_test = train_test_split(
                                    sample_df[['numeric', 'with_missing']],
                                    pd.get_dummies( sample_df[['label']]),
                                    random_state = 2)
# Processing numeric features with missing data: impute first, then classify
pl = Pipeline([
    ('imp', Imputer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

### Text features and feature unions 

In [None]:
# CountVectorizer lives in sklearn.feature_extraction.text and train_test_split
# in sklearn.model_selection (the original imports had the modules swapped).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Pass the text column as a Series: CountVectorizer iterates over its input,
# and iterating a DataFrame yields column names rather than the documents.
X_train, X_test, y_train, y_test = train_test_split(
                                    sample_df['text'],
                                    pd.get_dummies( sample_df[['label']]),
                                    random_state = 2)
# Vectorize the text into token counts, then classify
pl = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

### Preprocessing multiple dtypes
* FunctionTransformer: 
    Turns a Python function into an object that an sklearn pipeline understands

In [None]:
from sklearn.preprocessing import FunctionTransformer
X_train, X_test, y_train, y_test = train_test_split(
                                    sample_df[['numeric', 'with_missing', 'text']],
                                    pd.get_dummies( sample_df[['label']]),
                                    random_state = 2)
# Selector for the text column; validate=False passes the DataFrame through
# unchanged instead of having it checked/converted to an array first
get_text_data = FunctionTransformer(lambda x: x['text'],
                                    validate = False)
# Selector for the numeric columns. This must be a FunctionTransformer as well:
# FeatureUnion combines transformers and cannot wrap a plain function.
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']],
                                       validate = False)
# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

In [None]:
from sklearn.pipeline import FeatureUnion
# Combine the outputs of the numeric and text sub-pipelines side by side.
# numeric_pipeline and text_pipeline are built in the "Putting it all together"
# cell, so they must already exist in the kernel when this cell runs.
union = FeatureUnion([
    ('numeric', numeric_pipeline),
    ('text', text_pipeline)
])

In [None]:
# Putting it all together
# Numeric branch: select the numeric columns, then fill in missing values
numeric_pipeline = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', Imputer())
    ])

# Text branch: select the text column, then turn it into token counts
text_pipeline = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer())
    ])

# Full model: both branches joined by a FeatureUnion, followed by the classifier
pl = Pipeline([
    ('union', FeatureUnion([
        ('numeric', numeric_pipeline),
        ('text', text_pipeline)
    ])),
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

In [None]:
# Choosing a classification model
# Columns that hold the targets
LABELS = [
    'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
    'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
]
# Every column of df that is not one of the targets
NON_LABELS = [column for column in df.columns if column not in LABELS]
# Difference between the number of non-label columns and the number of numeric columns
len(NON_LABELS) - len(NUMERIC_COLUMNS)

In [None]:
# Using pipeline with the main dataset
import numpy as np
import pandas as pd
df = pd.read_csv('TrainingSetSample.csv', index_col = 0)
# Targets: dummy-encoded label columns
dummy_labels = pd.get_dummies(df[LABELS])
# Split with 0.2 passed as the size argument; features are all non-label columns
X_train, X_test, y_train, y_test = multilabel_train_test_split(
                                   df[NON_LABELS], dummy_labels, 0.2)
# Selectors that hand each branch of the union the columns it needs
get_text_data = FunctionTransformer(combine_text_columns,
                                    validate = False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS],
                                      validate = False)
pl = Pipeline([
        ('union', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                # CountVectorizer: the original "CountVctorizer" was an undefined name
                ('vectorizer', CountVectorizer())
            ]))
        ])
    ),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
pl.fit(X_train, y_train)

In [None]:
# Easily try new models using pipeline
# The module is sklearn.ensemble (the original "emsemble" does not exist)
from sklearn.ensemble import RandomForestClassifier
# Same preprocessing as before; only the final estimator changes
pl = Pipeline([
        ('union', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', CountVectorizer())
            ]))
        ])
    ),
    # OneVsRestClassifier is the wrapper used throughout this notebook;
    # "OneVsRest" is not a defined name
    ('clf', OneVsRestClassifier(RandomForestClassifier()))
])

In [None]:
# n-grams and tokenization
# Alphanumeric tokens followed by whitespace. Defined here under the name the
# later cells use (TOKENS_ALPHANUMERIC); the original referenced the undefined
# TOKEN_ALPHANUMERIC.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
# ngram_range=(1, 2) counts single tokens and pairs of adjacent tokens
vec = CountVectorizer(token_pattern = TOKENS_ALPHANUMERIC,
                      ngram_range = (1, 2))

In [None]:
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the text vector
text_vector = combine_text_columns(X_train)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate the CountVectorizer: text_features
text_features = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit text_features to the text vector
text_features.fit(text_vector)

# Print the first 10 tokens.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older installations.
if hasattr(text_features, 'get_feature_names_out'):
    first_tokens = text_features.get_feature_names_out()[:10].tolist()
else:
    first_tokens = text_features.get_feature_names()[:10]
print(first_tokens)

In [None]:
# Import pipeline
from sklearn.pipeline import Pipeline

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Import other preprocessing modules
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# replaces it (same default mean strategy) and is bound to the name Imputer so
# the pipeline definitions that call Imputer() keep working. The fallback
# covers installations that predate sklearn.impute.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest

# Select 300 best features
chi_k = 300

# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing: selectors for the text and numeric inputs
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
# Numeric and text features are built in parallel, joined, scaled and classified.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),
                    # k is passed by keyword: recent scikit-learn releases no
                    # longer accept it positionally, and the keyword form
                    # works on older releases too.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

### Interaction terms 

In [None]:
# Implement interaction modeling in scikit-learn
# Instantiate pipeline: pl
# Same feature union as before, with an interaction step inserted before scaling.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),
                    # k is passed by keyword: recent scikit-learn releases no
                    # longer accept it positionally, and the keyword form
                    # works on older releases too.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        # SparseInteractions is not imported in this notebook; it must already
        # be defined in the kernel -- TODO confirm where it comes from
        ('int', SparseInteractions(degree = 2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

### Hash trick 
* Useful on large datasets

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# Hash tokens into a fixed number of columns instead of storing a vocabulary.
# The non_negative argument was removed from HashingVectorizer;
# alternate_sign=False is its replacement and keeps the hashed counts
# non-negative.
vec = HashingVectorizer(norm = None,
                        alternate_sign = False,
                        token_pattern = TOKENS_ALPHANUMERIC,
                        ngram_range = (1, 2))

### About Random Forest 
https://www.youtube.com/watch?v=loNcrMjYh64