##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets | Model

This notebook contains code focused on selecting features for a best accuracy model.

**Full Name:** `Xavier Travers`

**Student ID:** `1178369`

First, import the data and the necessary python modules for the pipeline

In [1]:
import pandas as pd
from pprint import pprint
from time import time
import logging
from tempfile import mkdtemp
from shutil import rmtree
from scipy import sparse

# import the feature extractors
from features import *

# classifiers
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.dummy import DummyClassifier


# pipeline and parameter optimisation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# cross-validation
from sklearn.model_selection import cross_val_score

# import the data
train_data = pd.read_csv("../datasets/Train.csv", sep=',')
test_data = pd.read_csv("../datasets/Test.csv", sep=',')

# extract the training inputs and outputs
train_input = train_data[['text']].values[:, 0]
train_output = train_data[['sentiment']].values[:, 0]
test_input = test_data[['text']].values[:, 0]

### Classifier Container Object
This will contain classifiers (which can then be compared by `GridSearchCV`).

Reference: https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers

In [2]:
class ClassifierContainer(BaseEstimator):

    def __init__(
        self, 
        estimator = LogisticRegression(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

    def cv_scores(self, X, y, cv=10): # returns a dictionary of average score and scores list
        scores = cross_val_score(self.estimator, X, y, cv = cv)
        return {
            'values': scores,
            'average': np.average(scores)
        }

### Baseline Model
The baseline model for this dataset will be `0R`

In [3]:
# create the baseline model
baseline = ClassifierContainer(DummyClassifier(strategy="most_frequent"))
baseline.fit(train_input, train_output)

# Check the baseline model's accuracy
pprint(baseline.cv_scores(train_input, train_output, cv = 10))

{'average': 0.5806348194793232,
 'values': array([0.58046768, 0.58046768, 0.58073394, 0.58073394, 0.58073394,
       0.58073394, 0.58073394, 0.58073394, 0.58073394, 0.58027523])}


### Features
The features can be aggregated or added one by one.

In [4]:
# The number of top TweetFeatures to filter through
WORDS_TOP_N = 2000
TOP_N = 100

# create the transformers
transformers = [
    TweetBagOfWords(top_n=WORDS_TOP_N), # Bag of words vectorizer
    # TweetTFIDF(top_n=WORDS_TOP_N), # TF-IDF vectorizer
    # TweetMetrics(), # Count metrics for tweets
    # TweetWordLengths(top_n=TOP_N, by_sentiment=True), # Word lengths
    # TweetCharacterFrequencies(top_n=TOP_N, by_sentiment=True, alphabetic=False), # Character distributions
    # TweetLinks(top_n=TOP_N, by_sentiment=True), # Links
    TweetHashtags(top_n=TOP_N), # Hashtags
    TweetReferences(top_n=TOP_N), # References
    TweetEmoticons(), # Emoticons
    # TweetPhonetics(top_n=TOP_N), # Raw phonetics
    # TweetPoetics(), # Poetic Phonetics
]

# perform the transforms and combine them into the input matrix
train_transforms = [tf.fit_transform(train_input, train_output) for tf in transformers]

# print the shapes of the transforms
for t in train_transforms:
    print(t.shape)

train_features = sparse.hstack(train_transforms)

# feature matrix information
print(train_features.shape)
print(type(train_features))

# do the same to test data
test_transforms = [tf.transform(test_input) for tf in transformers]

# print the shapes of the transforms
for t in test_transforms:
    print(t.shape)

test_features = sparse.hstack(test_transforms)

# feature matrix information
print(test_features.shape)
print(type(test_features))

# lets check out those dimensions and the 1st element
print(train_input.shape)

(21802, 2805)
(21802, 164)
(21802, 194)
(21802, 4)
(21802, 3167)
<class 'scipy.sparse._coo.coo_matrix'>
(21802, 2805)
(21802, 164)
(21802, 194)
(21802, 4)
(21802, 3167)
<class 'scipy.sparse._coo.coo_matrix'>
(21802,)


### Model Explorations
Here, different models are cross-validated to find the best one.

References: 
- https://scikit-learn.org/stable/modules/compose.html#pipeline
- https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

In [5]:
# Generate variable choices for the model
MAX_ITER = [1000, 5000]
SVC_C = [10, 1, 1e-3, 1e-6]
KNN_N = [5, 10, 100]
DT_CRITERION = ['gini', 'entropy']
DT_MAX_DEPTH = [None, 1, 10, 20]
NB_ALPHAS = [1, 1e-3, 1e-6]
NB_FIT_PRIOR = [True, False]
LR_SOLVER = ['newton-cg', 'sag', 'saga', 'lbfgs']

# define the pipeline for parameter selection
cachedir = mkdtemp()
pipeline = Pipeline(
    [
        ('norm', 'passthrough'),
        ('clf', ClassifierContainer()), # allows switching between classifiers
    ],
    memory=cachedir
)

# add the parameters possible (to permutate over)
parameters = [
    { # CLF 1: Linear Support Vector Classifiers
        'norm': [StandardScaler(), 'passthrough'],
        'norm__with_mean': [False],
        'clf__estimator': [LinearSVC()],
        'clf__estimator__C': SVC_C,
        'clf__estimator__max_iter': MAX_ITER,
    },
    { # CLF 2: K Nearest-Neighbours
        'clf__estimator': [KNeighborsClassifier()],
        'clf__estimator__n_neighbors': KNN_N,
    },
    { # CLF 3: Decision Trees
        'clf__estimator': [DecisionTreeClassifier()],
        'clf__estimator__criterion': DT_CRITERION,
        'clf__estimator__max_depth': DT_MAX_DEPTH,
    },
    { # CLF 4: Gaussian Naive Bayes
        'clf__estimator': [MultinomialNB(), BernoulliNB()],
        'clf__estimator__alpha': NB_ALPHAS,
        'clf__estimator__fit_prior': NB_FIT_PRIOR,
    },
    { # CLF 5: Logistic Regression
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__max_iter': MAX_ITER,
        'clf__estimator__solver': LR_SOLVER,
    },
]

# this next code needs to occur in a main block
if __name__ == "__main__":    
    # find the best classifier by grid search
    grid_search = GridSearchCV(pipeline, parameters, cv=10, verbose=3, error_score='raise')
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(train_features, train_output)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    pprint(best_parameters)

rmtree(cachedir)

Performing grid search...
pipeline: ['norm', 'clf']
parameters:
[{'clf__estimator': [LinearSVC()],
  'clf__estimator__C': [10, 1, 0.001, 1e-06],
  'clf__estimator__max_iter': [1000, 5000],
  'norm': [StandardScaler()],
  'norm__with_mean': [False]},
 {'clf__estimator': [KNeighborsClassifier()],
  'clf__estimator__n_neighbors': [5, 10, 100]},
 {'clf__estimator': [DecisionTreeClassifier()],
  'clf__estimator__criterion': ['gini', 'entropy'],
  'clf__estimator__max_depth': [None, 1, 10, 20]},
 {'clf__estimator': [MultinomialNB(), BernoulliNB()],
  'clf__estimator__alpha': [1, 0.001, 1e-06],
  'clf__estimator__fit_prior': [True, False]},
 {'clf__estimator': [LogisticRegression()],
  'clf__estimator__max_iter': [1000, 5000],
  'clf__estimator__solver': ['newton-cg', 'sag', 'saga', 'lbfgs']}]
Fitting 10 folds for each of 39 candidates, totalling 390 fits




[CV 1/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.560 total time=   2.1s




[CV 2/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.554 total time=   2.1s




[CV 3/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.566 total time=   2.1s




[CV 4/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.566 total time=   2.3s




[CV 5/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.568 total time=   2.1s




[CV 6/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.583 total time=   2.2s




[CV 7/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.577 total time=   2.1s




[CV 8/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.556 total time=   2.1s




[CV 9/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.566 total time=   2.1s




[CV 10/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.571 total time=   2.1s




[CV 1/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.611 total time=   9.7s




[CV 2/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.594 total time=   9.7s




[CV 3/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.586 total time=   9.8s




[CV 4/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.576 total time=   9.7s




[CV 5/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.574 total time=   9.7s




[CV 6/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.580 total time=   9.7s




[CV 7/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.600 total time=   9.9s




[CV 8/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.578 total time=   9.8s




[CV 9/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.603 total time=   9.6s




[CV 10/10] END clf__estimator=LinearSVC(), clf__estimator__C=10, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.583 total time=   9.7s




[CV 1/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.599 total time=   2.0s




[CV 2/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.610 total time=   2.0s




[CV 3/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.593 total time=   2.0s




[CV 4/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.598 total time=   2.0s




[CV 5/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.609 total time=   2.0s




[CV 6/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.594 total time=   2.0s




[CV 7/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.622 total time=   2.0s




[CV 8/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.594 total time=   2.0s




[CV 9/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.592 total time=   2.0s




[CV 10/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.606 total time=   2.0s




[CV 1/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.639 total time=   9.3s




[CV 2/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.626 total time=   9.4s




[CV 3/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.621 total time=   9.5s




[CV 4/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.612 total time=   9.4s




[CV 5/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.625 total time=   9.5s




[CV 6/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.628 total time=   9.4s




[CV 7/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.636 total time=   9.5s




[CV 8/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.619 total time=   9.3s




[CV 9/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.633 total time=   9.4s




[CV 10/10] END clf__estimator=LinearSVC(), clf__estimator__C=1, clf__estimator__max_iter=5000, norm=StandardScaler(), norm__with_mean=False;, score=0.609 total time=   9.3s
[CV 1/10] END clf__estimator=LinearSVC(), clf__estimator__C=0.001, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.647 total time=   0.4s
[CV 2/10] END clf__estimator=LinearSVC(), clf__estimator__C=0.001, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.636 total time=   0.4s
[CV 3/10] END clf__estimator=LinearSVC(), clf__estimator__C=0.001, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.637 total time=   0.3s
[CV 4/10] END clf__estimator=LinearSVC(), clf__estimator__C=0.001, clf__estimator__max_iter=1000, norm=StandardScaler(), norm__with_mean=False;, score=0.626 total time=   0.3s
[CV 5/10] END clf__estimator=LinearSVC(), clf__estimator__C=0.001, clf__estimator__max_iter=1000, norm=StandardScaler(), no