# Perceptron and Adaline ML Models Applied to the "superheroes-NLP-dataset"

In [11]:
import pandas as pd
import numpy as np

## Loading and looking at our dataset

I chose this set because I really like superheroes, I grew up reading all of my dad's old comics that we kept in a box under my bed. This dataset is also targeted specifically towards natural language processing, a topic which I'm really interested in learning more about!

Actually scratch all of that I bit off more than I could chew trying to use NLP with machine learning without experience with mapping text data to numeric features that could be used for ML.

Music has always been a passion in my life, and I found an interesting dataset that quantifies qualities of music from spotify and also classifies each song in terms of whether the dataset's creator likes or dislikes the song. I think it would be really interesting to do this myself with songs I like dislike.

In [12]:
music_data = pd.read_csv('spotify.csv')
music_data = music_data.sample(frac=1)

In [13]:
def every_unique_triplet(mylist):
    """returns every positionally unique triplet in a given list: assumes that no elements in the list appear more than once"""
    
    ret_list = []
    list_copy = mylist[1:].copy()
    list_copy_copy = list_copy[1:].copy()
    
    for i in mylist:
        for j in list_copy:
            for k in list_copy_copy:
                if j != k:
                    ret_list.append([i,j, k])
            list_copy_copy = list_copy[1:]
        list_copy = list_copy[1:]
   
    return ret_list

In [14]:
class AdalineSGD(object):
    """ADAptive LInear NEuron classifier."""
    
    # I added the keyword parameter threshold to allow the user to specify the threshold
    def __init__(self, learning_rate=0.01, epochs=10, shuffle=True, random_seed=None, threshold=0):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights_initialized = False
        self.shuffle = shuffle
        self.random_seed = random_seed
        self.threshold = threshold
        
    def fit(self, X, y):
        """"""
        self._initialize_weights(X.shape[1])
        self.cost_ = []
        for i in range(self.epochs):
            if self.shuffle:
                X, y = self._shuffle(X, y)
            cost = []
            for xi, target in zip(X, y):
                cost.append(self._update_weights(xi, target))
            avg_cost = sum(cost) / len(y)
            self.cost_.append(avg_cost)
        return self

    def partial_fit(self, X, y):
        """Fit training data without reinitializing the weights"""
        if not self.weights_initialized:
            self._initialize_weights(X.shape[1])
        if y.ravel().shape[0] > 1:
            for xi, target in zip(X, y):
                self._update_weights(xi, target)
        else:
            self._update_weights(X, y)
        return self

    def _shuffle(self, X, y):
        """Shuffle training data"""
        r = self.rgen.permutation(len(y))
        return X[r], y[r]
    
    def _initialize_weights(self, m):
        """Initialize weights to small random numbers"""
        self.rgen = np.random.RandomState(self.random_seed)
        self.weights = self.rgen.normal(loc=0.0, scale=0.01, size=1 + m)
        self.weights_initialized = True
        
    def _update_weights(self, xi, target):
        """Apply Adaline learning rule to update the weights"""
        output = self.activation(self.net_input(xi))
        error = (target - output)
        self.weights[1:] += self.learning_rate * xi.dot(error)
        self.weights[0] += self.learning_rate * error
        cost = 0.5 * error**2
        return cost
    
    def net_input(self, X):
        """Calculate net input"""
        return np.dot(X, self.weights[1:]) + self.weights[0]

    def activation(self, X):
        """Compute linear activation"""
        return X

    def predict(self, X):
        """Return class label after unit step"""
        return np.where(self.activation(self.net_input(X)) >= self.threshold, 1, -1)

In [15]:
def accuracy_and_misclasses(prediction, labels):
    """Fn to determine accuracy"""
    missclassifications = 0
    correct_predictions = len(labels)
    for a,b in zip(prediction, labels):
        if a != b:
            missclassifications += 1
            correct_predictions -= 1
    return (correct_predictions / len(labels), missclassifications)

In [16]:
def split_fit_test(c1, c2, c3, testtrain_ratio, dataframe=music_data, verbose=False, learning_rate=0.1, epochs=50, threshold=0):
    """split data from feature columns c1 and c2 into train and test sets at tt_ratio proportions and fit/test a perceptron"""
    
    # get the integer indeces corresponding to the column names passed to split_fit_test
    c1_idx = dataframe.columns.get_loc(c1)
    c2_idx = dataframe.columns.get_loc(c2)
    c3_idx = dataframe.columns.get_loc(c2)
    
    # number of rows of dataframe which will belong to the training set (we know the number in the test set from this implicitly)
    num_train = len(dataframe.index) - int(len(dataframe.index) * testtrain_ratio)
    
    # Training set
    y_train = dataframe.iloc[:num_train,14].values # the array of target values: 2 for benign, 4 for malignant
    y_train = np.where(y_train == 1, 1, -1) # change class labels 2 and 4 to -1 and 1 respectively
    X_train = dataframe.iloc[:num_train, [c1_idx,c2_idx,c3_idx]].values
    
    # feature scaling to standardize the distribution of values in our training set
    X_train_std = np.copy(X_train)
    X_train_std[:, 0] = (X_train[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()
    X_train_std[:, 1] = (X_train[:, 1] - X_train[:, 1].mean()) / X_train[:, 1].std()
    X_train_std[:, 2] = (X_train[:, 2] - X_train[:, 2].mean()) / X_train[:, 2].std()
    
    # Testing set
    y_test = dataframe.iloc[num_train:,14].values # analagous to above
    y_test = np.where(y_test == 1, 1, -1)
    X_test = dataframe.iloc[num_train:, [c1_idx, c2_idx, c3_idx]].values
    X_test_std = np.copy(X_train)
    X_test_std[:, 0] = (X_train[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()
    X_test_std[:, 1] = (X_train[:, 1] - X_train[:, 1].mean()) / X_train[:, 1].std()
    X_test_std[:, 2] = (X_train[:, 2] - X_train[:, 2].mean()) / X_train[:, 2].std()
    
    tron = perceptron(learning_rate=learning_rate, epochs=epochs, threshold=threshold)
    tron.fit(X_train_std, y_train)

    prediction = tron.predict(X_test_std)
    accuracy,misclasses = accuracy_and_misclasses(prediction, y_test)
    if verbose:
        print("For features", c1, ",", c2, "and", c3, ", and test/train ratio", testtrain_ratio, "the perceptron had", misclasses, "missclassifications and had an accuracy of", accuracy, "\n")
        
    return (accuracy, misclasses)

I ran a brute-force loop to check every possible set of features for their accuracy (it ran for about 10 minutes)
The results were this:
       The highest accuracy was 0.5553719008264463 for the feature set danceability , mode and key with 269 
       missclassifications.

In [17]:
split_fit_test('danceability', 'duration_ms', 'time_signature', 0.3, verbose=True)

For features danceability , duration_ms and time_signature , and test/train ratio 0.3 the perceptron had 286 missclassifications and had an accuracy of 0.5272727272727272 



(0.5272727272727272, 286)

In [18]:
best_accuracy = 0
misses = 0
best_prop = 0
 
for prop in [0.25, 0.3, 0.35, 0.40, 0.45]:
    acc,miss = split_fit_test('danceability', 'duration_ms', 'time_signature', prop, verbose=True)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_prop = prop
        
        
print("The highest accuracy was", best_accuracy, "for test/train proportion", best_prop, "with", misses, "missclassifications.")

For features danceability , duration_ms and time_signature , and test/train ratio 0.25 the perceptron had 267 missclassifications and had an accuracy of 0.47023809523809523 

For features danceability , duration_ms and time_signature , and test/train ratio 0.3 the perceptron had 286 missclassifications and had an accuracy of 0.5272727272727272 

For features danceability , duration_ms and time_signature , and test/train ratio 0.35 the perceptron had 319 missclassifications and had an accuracy of 0.5475177304964539 

For features danceability , duration_ms and time_signature , and test/train ratio 0.4 the perceptron had 400 missclassifications and had an accuracy of 0.5037220843672456 

For features danceability , duration_ms and time_signature , and test/train ratio 0.45 the perceptron had 465 missclassifications and had an accuracy of 0.48732083792723263 

The highest accuracy was 0.5475177304964539 for test/train proportion 0.35 with 319 missclassifications.


In [20]:
best_accuracy = 0
misses = 0
best_rate = 0

for rate in [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4]:
    acc,miss = split_fit_test('danceability', 'duration_ms', 'time_signature', 0.35, learning_rate=rate)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_rate = rate

print("The highest accuracy was", best_accuracy, "for learning rate", best_rate, "with", misses, "missclassifications.")

The highest accuracy was 0.5531914893617021 for learning rate 0.0001 with 315 missclassifications.


In [22]:
best_accuracy = 0
misses = 0
best_num_epochs = 0

for n in [10, 20, 30, 40, 50, 75, 100, 200]:
    acc,miss = split_fit_test('danceability', 'duration_ms', 'time_signature', 0.35, learning_rate=0.0001, epochs=n)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_num_epochs = n

print("The highest accuracy was", best_accuracy, "for", best_num_epochs, "epochs with", misses, "missclassifications.")

The highest accuracy was 0.5546099290780142 for 75 epochs with 314 missclassifications.


In [23]:
best_accuracy = 0
misses = 0
best_threshold = 0
for theta in [0, 0.1, 0.01, 0.2, 0.5, 1, 2, -1, -2, 3, 4, 6]:
    acc,miss = split_fit_test('danceability', 'duration_ms', 'time_signature', 0.35, learning_rate=0.0001, epochs=75, threshold=theta)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_threshold = theta
        
print("The highest accuracy was", best_accuracy, "for the threshold", best_threshold, "with", misses, "missclassifications.")

The highest accuracy was 0.5560283687943263 for the threshold 0.01 with 313 missclassifications.
