_Definition_ **Regression** := when a statistical function returns a floating point value.

_Definition_ **Classification** := when a statistical function returns a category, such as a boolean or a string.

In [2]:
import pandas as pd

# Load the assignment data and keep every column whose name does not contain
# "Unnamed" (e.g. the "Unnamed: 0" column shown in the raw file header).
df = pd.read_csv("assignment_files/data.csv")
columns = list(filter(lambda name: "Unnamed" not in name, df.columns))
df = df.loc[:, columns]
df.head()

Unnamed: 0,a,b,c,d
0,-0.591023,16,90.296527,'molly'
1,0.626541,13,89.357373,'dolly'
2,-0.121086,23,95.636287,'molly'
3,0.691377,19,95.360784,'molly'
4,-0.403012,5,96.244526,'dolly'


In [4]:
from sklearn import linear_model
from sklearn import metrics
# Fit a logistic regression on the three numeric features and score it on the
# very same rows it was trained on (this cell does no train/test split).
X = df[["a", "b", "c"]]
y = df["d"]
logit = linear_model.LogisticRegression()
model = logit.fit(X, y)
predicted = model.predict(X)

# The labels are stored with literal single quotes, hence the quoted keys.
# Encode them as 0/1 so that r2_score, which needs numbers, can be computed.
mapping = {"'dolly'": 0, "'molly'": 1}
predicted = [mapping[elem] for elem in predicted]
y = [mapping[elem] for elem in y]

# R^2 is printed on purpose: the surrounding text contrasts it with accuracy.
print(metrics.r2_score(y, predicted))

# Accuracy by hand: the fraction of rows whose prediction equals the truth.
success_count = sum(1 for actual, guess in zip(y, predicted) if actual == guess)
success_count / len(y)

-0.968053216159


0.508

In [5]:
from sklearn import tree
import random
# Fit a decision tree on the three numeric features and score it on the same
# rows it was trained on (this cell does no train/test split).
X = df[["a", "b", "c"]]
y = df["d"]
decision_tree = tree.DecisionTreeClassifier()
model = decision_tree.fit(X, y)
predicted = model.predict(X)

# The labels are stored with literal single quotes, hence the quoted keys.
mapping = {"'dolly'": 0, "'molly'": 1}
predicted = [mapping[elem] for elem in predicted]
y = [mapping[elem] for elem in y]
print(metrics.accuracy_score(y, predicted))

# Same accuracy computed by hand, as a cross-check of accuracy_score.
success_count = sum(1 for actual, guess in zip(y, predicted) if actual == guess)
print(success_count / len(y))

# Spot-check one random row. randrange excludes its upper bound, whereas
# randint(0, len(y)) could return len(y) and raise IndexError.
random_index = random.randrange(len(y))
y[random_index] == predicted[random_index]

1.0
1.0


True

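Notice that we make use of `accuracy_score` instead of `r2_score` for the decision tree.  That's because R² assumes a linear model with a continuous target.  For binary or multiclass classification, R² may not be able to produce meaningful results - the negative score reported for the logistic regression above is an example - whereas accuracy simply measures the fraction of labels predicted correctly.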

Now that we have a baseline for dealing with numbers, let's look at some more models!

In [27]:
from sklearn.neighbors import KNeighborsClassifier

# Compare k-nearest-neighbour classifiers for k = 1..9, each scored on the
# same rows it was trained on.
X = df[["a", "b", "c"]]
y = df["d"]

# The true labels never change, so encode them once outside the loop.
# (Labels are stored with literal single quotes, hence the quoted keys.)
mapping = {"'dolly'": 0, "'molly'": 1}
y_vals = [mapping[elem] for elem in y]

neighbor_counts = range(1, 10)
clfs = [KNeighborsClassifier(n_neighbors=k) for k in neighbor_counts]
fit_scores = []
for clf in clfs:
    model = clf.fit(X, y)
    predicted = model.predict(X)
    predicted_vals = [mapping[elem] for elem in predicted]
    fit_scores.append(metrics.accuracy_score(y_vals, predicted_vals))

# Pair each k with its score directly instead of indexing back into the list.
for k, score in zip(neighbor_counts, fit_scores):
    print("The accuracy of KNN with a k of {} is {}".format(k, score))

The accuracy of KNN with a k of 1 is 1.0
The accuracy of KNN with a k of 2 is 0.7545
The accuracy of KNN with a k of 3 is 0.7504
The accuracy of KNN with a k of 4 is 0.6857
The accuracy of KNN with a k of 5 is 0.6858
The accuracy of KNN with a k of 6 is 0.6603
The accuracy of KNN with a k of 7 is 0.6601
The accuracy of KNN with a k of 8 is 0.6421
The accuracy of KNN with a k of 9 is 0.6359


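So we see that with a k of one the data is fit the best.  That is expected here: we score on the same data we trained on, so with k = 1 every point is its own nearest neighbor.  Among the remaining values, a k of two fits best.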

In [29]:
from sklearn.svm import SVC
import random
# Fit an RBF-kernel support vector classifier and score it on the same rows
# it was trained on (this cell does no train/test split).
X = df[["a", "b", "c"]]
y = df["d"]
clf = SVC(random_state=0, kernel='rbf', probability=True)
model = clf.fit(X, y)
predicted = model.predict(X)

# The labels are stored with literal single quotes, hence the quoted keys.
mapping = {"'dolly'": 0, "'molly'": 1}
predicted = [mapping[elem] for elem in predicted]
y = [mapping[elem] for elem in y]
print(metrics.accuracy_score(y, predicted))

# Same accuracy computed by hand, as a cross-check of accuracy_score.
success_count = sum(1 for actual, guess in zip(y, predicted) if actual == guess)
print(success_count / len(y))

# Spot-check one random row. randrange excludes its upper bound, whereas
# randint(0, len(y)) could return len(y) and raise IndexError.
random_index = random.randrange(len(y))
y[random_index] == predicted[random_index]

0.6269
0.6269


True

For a wide range of classification tasks, SVM usually performs quite well.  However, it does not here.  We have now seen a set of classification algorithms.  Now we are ready to look at ensemble approaches - combining many weak classifiers to create a strong classifier.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn import metrics, feature_selection
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from itertools import product
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier



X = df[["a", "b", "c"]]
y = df["d"]

# NOTE(review): `labels` is not read anywhere in this cell and does not match
# the estimators below (there is no logistic regression here, and nothing is
# plotted). It looks like a leftover from a plotting example; confirm before
# relying on it.
labels = ['Logistic Regression',
          'Random Forest',
          'RBF kernel SVM',
          'Ensemble']


# Base classifiers that the voting ensemble combines. The SVCs are built with
# probability=True because soft voting needs class probabilities.
clf1 = KNeighborsClassifier(n_neighbors=7) #strong
clf2 = SVC(random_state=0, kernel='rbf', probability=True) #strong
clf3 = SVC(gamma=2, C=1, probability=True) #strong
clf4 = RandomForestClassifier(random_state=0) #strong


estimators = [
    ('knn', clf1), ('svm', clf2),
    ('svm_gamma', clf3), ('rf', clf4),
]

eclf = VotingClassifier(estimators=estimators, voting='soft')

# Grid for the ensemble: keys are "<estimator name>__<parameter>".
params = {
    'rf__n_estimators': [20, 50, 100],
    'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)

# The true labels never change, so encode them once up front.
# (Labels are stored with literal single quotes, hence the quoted keys.)
mapping = {"'dolly'": 0, "'molly'": 1}
y_vals = [mapping[elem] for elem in y]

# Score every base classifier on its own (on the training rows). Only KNN is
# tuned with a grid search. The loop uses its own names so that it does not
# overwrite `params` (the ensemble grid) or the loop variable `clf`.
for label, clf in estimators:
    print(label)
    candidate = clf
    if label == 'knn':
        knn_params = {"n_neighbors": [2,3,4,5,6,7,8,9,10,11,12]}
        candidate = GridSearchCV(estimator=clf, param_grid=knn_params, cv=5)
    model = candidate.fit(X, y)
    predicted = model.predict(X)
    predicted_vals = [mapping[elem] for elem in predicted]
    print(metrics.accuracy_score(y_vals, predicted_vals))


# Finally tune and score the voting ensemble itself.
print("Ensemble")
model = grid.fit(X, y)
predicted = model.predict(X)
predicted_vals = [mapping[elem] for elem in predicted]
print(metrics.accuracy_score(y_vals, predicted_vals))


knn
0.6857
svm
0.6269
svm_gamma
0.785
rf
0.9763
Ensemble


Now let's look at some more ensemble methods.

In [31]:
from sklearn import ensemble
from sklearn import metrics

X = df[["a", "b", "c"]]
y = df["d"]

# Settings shared by every gradient-boosting run below.
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

# The true labels never change, so encode them once outside the loop.
# (Labels are stored with literal single quotes, hence the quoted keys.)
mapping = {"'dolly'": 0, "'molly'": 1}
y_vals = [mapping[elem] for elem in y]

# Each entry is (label, color, overrides). Only the overrides are used here:
# nothing is plotted, so no figure is opened and label/color go unread.
for label, color, setting in [('No shrinkage', 'orange',
                               {'learning_rate': 1.0, 'subsample': 1.0}),
                              ('learning_rate=0.1', 'turquoise',
                               {'learning_rate': 0.1, 'subsample': 1.0}),
                              ('subsample=0.5', 'blue',
                               {'learning_rate': 1.0, 'subsample': 0.5}),
                              ('learning_rate=0.1, subsample=0.5', 'gray',
                               {'learning_rate': 0.1, 'subsample': 0.5}),
                              ('learning_rate=0.1, max_features=2', 'magenta',
                               {'learning_rate': 0.1, 'max_features': 2})]:
    # Copy the shared settings so that one run's overrides do not leak into
    # the next run.
    params = dict(original_params)
    params.update(setting)

    clf = ensemble.GradientBoostingClassifier(**params)
    model = clf.fit(X, y)
    predicted = model.predict(X)
    predicted_vals = [mapping[elem] for elem in predicted]
    print("GBT Setting:", setting)
    print(metrics.accuracy_score(y_vals, predicted_vals))


GBT Setting: {'learning_rate': 1.0, 'subsample': 1.0}
1.0
GBT Setting: {'learning_rate': 0.1, 'subsample': 1.0}
0.9583
GBT Setting: {'learning_rate': 1.0, 'subsample': 0.5}
0.706
GBT Setting: {'learning_rate': 0.1, 'subsample': 0.5}
0.9298
GBT Setting: {'learning_rate': 0.1, 'max_features': 2}
0.9637


Finally, let's look at a neural network!  Our very first :)

In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import math
from sklearn.metrics import confusion_matrix

# larger model
def create_network():
    """Build and compile the feed-forward binary classifier.

    The network takes 3 input features, passes them through ReLU layers of
    35, 17 and 8 units, and ends in a single sigmoid unit. It is compiled
    with binary cross-entropy loss, the Adam optimizer and accuracy as the
    reported metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    # Only the first layer declares the input width.
    network.add(Dense(35, input_dim=3, kernel_initializer='normal', activation='relu'))
    for width in (17, 8):
        network.add(Dense(width, kernel_initializer='normal', activation='relu'))
    network.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

X = df[["a", "b", "c"]]
y = df["d"]

# Standardize the features, then train the Keras network on them.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_network, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
model = pipeline.fit(X, y)

# NOTE(review): with a single sigmoid output, KerasClassifier.predict appears
# to return a column vector of shape (n, 1) - confirm against the installed
# Keras. Flattening is a no-op for 1-D output and keeps each element a
# hashable label for the dict lookup below.
predicted = np.ravel(model.predict(X))

# The labels are stored with literal single quotes, hence the quoted keys.
mapping = {"'dolly'": 0, "'molly'": 1}
predicted_vals = [mapping[elem] for elem in predicted]
y_vals = [mapping[elem] for elem in y]
print(metrics.accuracy_score(y_vals, predicted_vals))

The last thing we'll be looking at is an application of classification called text classification, where we create high level labels for text.  Below is an example - don't worry if it doesn't make sense!  

In [7]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

#http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes every token.

    Instances are meant to be passed as the ``tokenizer`` of a vectorizer.
    """

    def __init__(self):
        # One lemmatizer is created per tokenizer and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens found in ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

def train_classifier(text, labels):
    """Grid-search a bag-of-words / TF-IDF / linear SGD text classifier.

    Args:
        text: sequence of raw document strings.
        labels: sequence of class labels, one per document.

    Returns:
        The fitted ``GridSearchCV`` object; its ``predict`` method uses the
        best parameter combination found.
    """
    #http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#parameter-tuning-using-grid-search
    # Keys are "<pipeline step name>__<parameter>".
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf__alpha': (1e-2, 1e-3),}

    # `n_iter` was removed from SGDClassifier in scikit-learn 0.21.
    # `max_iter=5, tol=None` is the documented equivalent: exactly five
    # passes over the data with no early stopping.
    text_clf = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer())),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, max_iter=5, tol=None,
                                               random_state=42)),])

    # cv=3 pins the 3-fold default in force before scikit-learn 0.22. The
    # newer 5-fold default fails on tiny corpora that have fewer than five
    # documents per class.
    gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=3)
    return gs_clf.fit(text, labels)
        
def classify_text(classifier, input_data):
    """Return the label ``classifier`` predicts for one document.

    ``classifier.predict`` is called with a one-element list holding
    ``input_data``, and the first entry of its result is returned.
    """
    predictions = classifier.predict([input_data])
    return predictions[0]

In [10]:
# Toy corpus: three greetings followed by three farewells, with one label per
# document in the same order.
labels = ["greeting"] * 3 + ["later"] * 3
text = ["hello there", "hi there", "hello", "bye", "goodbye", "seeya"]
clf = train_classifier(text, labels)
classify_text(clf, "hi")

'greeting'