In [1]:
"""
Author      : Caleb Traxler 
Description : Machine Learning Homework Number 3 Coding Questions
"""

from string import punctuation

import numpy as np
import matplotlib.pyplot as plt
# !!! MAKE SURE TO USE LinearSVC.decision_function(X), NOT LinearSVC.predict(X) !!!
# (this makes ''continuous-valued'' predictions)
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix

# Problem 3: Twitter Analysis Using SVM

In [2]:
######################################################################
# functions -- input/output
######################################################################

def read_vector_file(fname):
    """
    Reads and returns a vector from a file.

    Parameters
    --------------------
        fname  -- string, filename

    Returns
    --------------------
        labels -- numpy array of shape (n,)
                    n is the number of non-blank lines in the text file
    """
    return np.genfromtxt(fname)


def write_label_answer(vec, outfile):
    """
    Writes your label vector to the given file.

    Parameters
    --------------------
        vec     -- numpy array of shape (n,) or (n,1), predicted scores
        outfile -- string, output filename
    """

    # for this project, you should predict 70 labels
    if(vec.shape[0] != 70):
        print("Error - output vector should have 70 rows.")
        print("Aborting write.")
        return

    np.savetxt(outfile, vec)
    

In [3]:
######################################################################
# functions -- feature extraction
######################################################################

def extract_words(input_string):
    """
    Processes the input_string, separating it into "words" based on the presence
    of spaces, and separating punctuation marks into their own words.

    Parameters
    --------------------
        input_string -- string of characters

    Returns
    --------------------
        words        -- list of lowercase "words"
    """

    for c in punctuation :
        input_string = input_string.replace(c, ' ' + c + ' ')
    return input_string.lower().split()


def extract_dictionary(infile):
    """
    Given a filename, reads the text file and builds a dictionary of unique
    words/punctuations.

    Parameters
    --------------------
        infile    -- string, filename

    Returns
    --------------------
        word_list -- dictionary, (key, value) pairs are (word, index)
    """

    word_list = {}
    idx = 0
    with open(infile, 'r') as fid :
        # process each line to populate word_list
        for input_string in fid:
            words = extract_words(input_string)
            for word in words:
                if word not in word_list:
                    word_list[word] = idx
                    idx += 1
    return word_list


def extract_feature_vectors(infile, word_list):
    """
    Produces a bag-of-words representation of a text file specified by the
    filename infile based on the dictionary word_list.

    Parameters
    --------------------
        infile         -- string, filename
        word_list      -- dictionary, (key, value) pairs are (word, index)

    Returns
    --------------------
        feature_matrix -- numpy array of shape (n,d)
                          boolean (0,1) array indicating word presence in a string
                            n is the number of non-blank lines in the text file
                            d is the number of unique words in the text file
    """

    num_lines = sum(1 for line in open(infile,'r'))
    num_words = len(word_list)
    feature_matrix = np.zeros((num_lines, num_words))

    with open(infile, 'r') as fid :
        # process each line to populate feature_matrix
        for i, input_string in enumerate(fid):
            words = extract_words(input_string)
            for word in words:
                feature_matrix[i, word_list[word]] = 1.0

    return feature_matrix

In [4]:
######################################################################
# functions -- evaluation
######################################################################

def performance(y_true, y_pred, metric="accuracy"):
    """
    Calculates the performance metric based on the agreement between the
    true labels and the predicted labels.

    Parameters
    --------------------
        y_true -- numpy array of shape (n,), known labels
        y_pred -- numpy array of shape (n,), (continuous-valued) predictions
        metric -- string, option used to select the performance measure
                  options: 'accuracy', 'f1-score', 'auroc', 'precision',
                           'sensitivity', 'specificity'

    Returns
    --------------------
        score  -- float, performance score
    """
    # map continuous-valued predictions to binary labels
    y_label = np.sign(y_pred)
    y_label[y_label==0] = 1

    ### ========== TODO : START ========== ###
    # part 1a: compute classifier performance
    
    if metric == "accuracy":
        return accuracy_score(y_true, y_label)
    elif metric == "f1-score":
        return f1_score(y_true, y_label)
    elif metric == "auroc":
        return roc_auc_score(y_true, y_pred)
    elif metric == "precision":
        return precision_score(y_true, y_label)
    elif metric == "sensitivity":
        return recall_score(y_true, y_label)
    elif metric == "specificity":
        tn, fp, fn, tp = confusion_matrix(y_true, y_label).ravel()
        return tn / float(tn+fp)
    else:
        raise ValueError("Unknown Metric.")
    
    pass
    ### ========== TODO : END ========== ###


def cv_performance(clf, X, y, kf, metric="accuracy"):
    """
    Splits the data, X and y, into k-folds and runs k-fold cross-validation.
    Trains classifier on k-1 folds and tests on the remaining fold.
    Calculates the k-fold cross-validation performance metric for classifier
    by averaging the performance across folds.

    Parameters
    --------------------
        clf    -- classifier (instance of LinearSVC)
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- model_selection.StratifiedKFold
        metric -- string, option used to select performance measure

    Returns
    --------------------
        score   -- float, average cross-validation performance across k folds
    """

    ### ========== TODO : START ========== ###
    # part 1b: compute average cross-validation performance
    
    scores = []
    
    for train_index, test_index in kf.split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train, y_train)
        y_pred = clf.decision_function(X_test)
        score = performance(y_test, y_pred, metric)
        scores.append(score)
    
    return np.mean(scores)
                            
    
    pass
    ### ========== TODO : END ========== ###


def select_param_linear(X, y, kf, metric="accuracy"):
    """
    Sweeps different settings for the hyperparameter of a linear SVM,
    calculating the k-fold CV performance for each setting, then selecting the
    hyperparameter that 'maximize' the average k-fold CV performance.

    Parameters
    --------------------
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- model_selection.StratifiedKFold
        metric -- string, option used to select performance measure

    Returns
    --------------------
        C -- float, optimal parameter value for linear SVM
    """

    print('Linear SVM Hyperparameter Selection based on ' + str(metric) + ':')
    C_range = 10.0 ** np.arange(-3, 3)
    best_score = -np.inf
    best_C = None

    ### ========== TODO : START ========== ###
    # part 1c: select optimal hyperparameter using cross-validation
    
    for C in C_range:
        clf = LinearSVC(loss='hinge', random_state = 0, C=C)
        score = cv_performance(clf, X, y, kf, metric)
        print(f"C: {C}, {metric}, {score}")
        
        if score > best_score:
            best_score = score
            best_C = C
    return best_C
            
    
    pass
    ### ========== TODO : END ========== ###


def performance_test(clf, X, y, metric="accuracy"):
    """
    Estimates the performance of the classifier.

    Parameters
    --------------------
        clf          -- classifier (instance of LinearSVC)
                          [already fit to data]
        X            -- numpy array of shape (n,d), feature vectors of test set
                          n = number of examples
                          d = number of features
        y            -- numpy array of shape (n,), binary labels {1,-1} of test set
        metric       -- string, option used to select performance measure

    Returns
    --------------------
        score        -- float, classifier performance
    """


    ### ========== TODO : START ========== ###
    # part 2b: return performance on test data under a metric.
    
    y_pred = clf.decision_function(X)
    
    score = performance(y, y_pred, metric)
    
    return score
    
    pass
    ### ========== TODO : END ========== ###

In [5]:
######################################################################
# main
######################################################################

def main() :
    np.random.seed(1234)

    # read the tweets and its labels, change the following two lines to your own path.
    ### ========== TODO : START ========== ###
    file_path = 'tweets.txt'
    label_path = 'labelFs.txt'
    ### ========== TODO : END ========== ###
    dictionary = extract_dictionary(file_path)
    print(len(dictionary))
    X = extract_feature_vectors(file_path, dictionary)
    y = read_vector_file(label_path)
    # split data into training (training + cross-validation) and testing set
    X_train, X_test = X[:560], X[560:]
    y_train, y_test = y[:560], y[560:]

    metric_list = ["accuracy", "f1-score", "auroc", "precision", "sensitivity", "specificity"]

    ### ========== TODO : START ========== ###
    # part 1b: create stratified folds (5-fold CV)

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)

    # part 1c: for each metric, select optimal hyperparameter for linear SVM using CV
    
    best_C_values = {}
    for metric in metric_list:
        best_C = select_param_linear(X_train, y_train, kf, metric)
        best_C_values[metric] = best_C
        print(f"Best C for {metric}: {best_C}")
    
    # part 2a: train linear SVMs with selected hyperparameters
    performances = {}
    for metric, best_C in best_C_values.items():
        clf = LinearSVC(loss='hinge', random_state=0, C=best_C)
        clf.fit(X_train, y_train)
    
    # part 2b: test the performance of your classifiers.
        y_pred = clf.decision_function(X_test)
        performance_score = performance(y_test, y_pred, metric)
        performances[metric] = performance_score
        print(f"Test performance for {metric}: {performance_score}")
    
    ### ========== TODO : END ========== ###


if __name__ == "__main__" :
    main()

1811
Linear SVM Hyperparameter Selection based on accuracy:
C: 0.001, accuracy, 0.7089285714285715
C: 0.01, accuracy, 0.7857142857142857
C: 0.1, accuracy, 0.8232142857142858
C: 1.0, accuracy, 0.8553571428571429
C: 10.0, accuracy, 0.8410714285714285
C: 100.0, accuracy, 0.8303571428571427
Best C for accuracy: 1.0
Linear SVM Hyperparameter Selection based on f1-score:
C: 0.001, f1-score, 0.8296684118673647
C: 0.01, f1-score, 0.8706029281089144
C: 0.1, f1-score, 0.8877170585088769
C: 1.0, f1-score, 0.8955839997529085
C: 10.0, f1-score, 0.8845909645909644
C: 100.0, f1-score, 0.8757878679811746
Best C for f1-score: 1.0
Linear SVM Hyperparameter Selection based on auroc:
C: 0.001, auroc, 0.6853714758342924
C: 0.01, auroc, 0.8452170538454162
C: 0.1, auroc, 0.8880539772727272
C: 1.0, auroc, 0.8940758055235903
C: 10.0, auroc, 0.8947513305523589
C: 100.0, auroc, 0.9013808676160338
Best C for auroc: 100.0
Linear SVM Hyperparameter Selection based on precision:
C: 0.001, precision, 0.70892857142857

# Problem 4: Boosting vs. Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split

In [7]:
class Data :
    
    def __init__(self) :
        """
        Data class.
        
        Attributes
        --------------------
            X -- numpy array of shape (n,d), features
            y -- numpy array of shape (n,), targets
        """
                
        # n = number of examples, d = dimensionality
        self.X = None
        self.y = None
        
        self.Xnames = None
        self.yname = None
    
    def load(self, filename, header=0, predict_col=-1) :
        """Load csv file into X array of features and y array of labels."""
        
        # determine filename
        f = filename
        
        # load data
        with open(f, 'r') as fid :
            data = np.loadtxt(fid, delimiter=",", skiprows=header)
        
        # separate features and labels
        if predict_col is None :
            self.X = data[:,:]
            self.y = None
        else :
            if data.ndim > 1 :
                self.X = np.delete(data, predict_col, axis=1)
                self.y = data[:,predict_col]
            else :
                self.X = None
                self.y = data[:]
        
        # load feature and label names
        if header != 0:
            with open(f, 'r') as fid :
                header = fid.readline().rstrip().split(",")
                
            if predict_col is None :
                self.Xnames = header[:]
                self.yname = None
            else :
                if len(header) > 1 :
                    self.Xnames = np.delete(header, predict_col)
                    self.yname = header[predict_col]
                else :
                    self.Xnames = None
                    self.yname = header[0]
        else:
            self.Xnames = None
            self.yname = None


# helper functions
def load_data(filename, header=0, predict_col=-1) :
    """Load csv file into Data class."""
    data = Data()
    data.load(filename, header=header, predict_col=predict_col)
    return data

In [8]:
# Change the path to your own data directory
### ========== TODO : START ========== ###
titanic = load_data("titanic_train.csv", header=1, predict_col=0)
### ========== TODO : END ========== ###
X = titanic.X; Xnames = titanic.Xnames
y = titanic.y; yname = titanic.yname
n,d = X.shape  # n = number of examples, d =  number of features

In [9]:
def error(clf, X, y, ntrials=100, test_size=0.2) :
    """
    Computes the classifier error over a random split of the data,
    averaged over ntrials runs.

    Parameters
    --------------------
        clf         -- classifier
        X           -- numpy array of shape (n,d), features values
        y           -- numpy array of shape (n,), target classes
        ntrials     -- integer, number of trials
        test_size   -- proportion of data used for evaluation

    Returns
    --------------------
        train_error -- float, training error
        test_error  -- float, test error
    """

    train_error = 0
    test_error = 0

    train_scores = []; test_scores = [];
    for i in range(ntrials):
        xtrain, xtest, ytrain, ytest = train_test_split (X,y, test_size = test_size, random_state = i)
        clf.fit (xtrain, ytrain)

        ypred = clf.predict (xtrain)
        err = 1 - metrics.accuracy_score (ytrain, ypred, normalize = True)
        train_scores.append (err)

        ypred = clf.predict (xtest)
        err = 1 - metrics.accuracy_score (ytest, ypred, normalize = True)
        test_scores.append (err)

    train_error =  np.mean (train_scores)
    test_error = np.mean (test_scores)
    return train_error, test_error


In [10]:
### ========== TODO : START ========== ###
# Part 4(a): Implement the decision tree classifier and report the training error.
print('Classifying using Decision Tree...')
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X,y)
y_pred = clf.predict(X)
train_error = 1 - metrics.accuracy_score(y, y_pred)
print(f"Training error: {train_error}")

train_error, test_error = error(clf, X, y)
print(f"Average Test Error (over {100} trials): {test_error}")
### ========== TODO : END ========== ###

Classifying using Decision Tree...
Training error: 0.014044943820224698
Average Test Error (over 100 trials): 0.24104895104895108


In [11]:
### ========== TODO : START ========== ###
# Part 4(b): Implement the random forest classifier and adjust the number of samples used in bootstrap sampling.
print('Classifying using Random Forest...')
n = len(X)
sample_percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

best_test_error = float('inf')
best_max_samples = None

for percentage in sample_percentages:
    max_samples = int(n * percentage)
    
    clf = RandomForestClassifier(criterion='entropy', random_state=0, max_samples=max_samples)
    train_error, test_error = error(clf, X, y)
    
    print(f"Max samples: {max_samples} (Percentage: {percentage}), Training Error: {train_error}, Test Error: {test_error}")
    
    if test_error < best_test_error:
          best_test_error = test_error
          best_max_samples = max_samples 
print(f"Best setting: max_samples = {best_max_samples}, with Test Error = {best_test_error}")
### ========== TODO : END ========== ###

Classifying using Random Forest...
Max samples: 71 (Percentage: 0.1), Training Error: 0.1357293497363796, Test Error: 0.19587412587412587
Max samples: 142 (Percentage: 0.2), Training Error: 0.10314586994727591, Test Error: 0.18797202797202794
Max samples: 213 (Percentage: 0.3), Training Error: 0.0818629173989455, Test Error: 0.18888111888111891
Max samples: 284 (Percentage: 0.4), Training Error: 0.05869947275922671, Test Error: 0.19216783216783218
Max samples: 356 (Percentage: 0.5), Training Error: 0.03388400702987697, Test Error: 0.19888111888111892
Max samples: 427 (Percentage: 0.6), Training Error: 0.017785588752196824, Test Error: 0.20111888111888113
Max samples: 498 (Percentage: 0.7), Training Error: 0.012390158172232001, Test Error: 0.20475524475524473
Max samples: 569 (Percentage: 0.8), Training Error: 0.011528998242530775, Test Error: 0.20671328671328676
Best setting: max_samples = 142, with Test Error = 0.18797202797202794


In [12]:
### ========== TODO : START ========== ###
# Part 4(c): Implement the random forest classifier and adjust the number of features for each decision tree.
print('Classifying using Random Forest...')

best_max_samples = 142

best_test_error = float('inf')
best_max_features = None

for max_features in range(1, 9):
    clf = RandomForestClassifier(criterion='entropy', random_state=0, max_samples=best_max_samples, max_features=max_features)
    train_error, test_error = error(clf, X,y)
    
    print(f"Max features: {max_features}, Training Error: {train_error}, Test Error: {test_error }")
    
    if test_error < best_test_error:
        best_test_error = test_error
        best_max_features = max_features 
print(f"Best setting: max_features = {best_max_features}, with Test Error = {best_test_error}")

### ========== TODO : END ========== ###

Classifying using Random Forest...
Max features: 1, Training Error: 0.10121265377855888, Test Error: 0.18776223776223777
Max features: 2, Training Error: 0.10314586994727591, Test Error: 0.18797202797202794
Max features: 3, Training Error: 0.10244288224956065, Test Error: 0.1872727272727273
Max features: 4, Training Error: 0.10430579964850617, Test Error: 0.1874125874125874
Max features: 5, Training Error: 0.10544815465729351, Test Error: 0.1886013986013986
Max features: 6, Training Error: 0.10581722319859402, Test Error: 0.189020979020979
Max features: 7, Training Error: 0.10776801405975397, Test Error: 0.18895104895104897
Max features: 8, Training Error: 0.10776801405975397, Test Error: 0.18895104895104897
Best setting: max_features = 3, with Test Error = 0.1872727272727273
