##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets | Model

This notebook contains code focused on selecting features for a best accuracy model.

**Full Name:** `Xavier Travers`

**Student ID:** `1178369`

First, import the data and the necessary python modules for the pipeline

In [1]:
import pandas as pd
from pprint import pprint
from time import time
import logging
from tempfile import mkdtemp
from shutil import rmtree
from scipy import sparse

# import the feature extractors
from features import *
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# classifiers
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.dummy import DummyClassifier

# pipeline and parameter optimisation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# cross-validation
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score

# import the data
train_data = pd.read_csv("../datasets/Train.csv", sep=',')
test_data = pd.read_csv("../datasets/Test.csv", sep=',')

# extract the training inputs and outputs
train_input = train_data[['text']].values[:, 0]
train_output = train_data[['sentiment']].values[:, 0]
test_input = test_data[['text']].values[:, 0]

[nltk_data] Downloading package stopwords to C:\Users\Xavier
[nltk_data]     Travers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Classifier Container Object
This will contain classifiers (which can then be compared by `GridSearchCV`).

Reference: https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers

In [2]:
class ClassifierContainer(BaseEstimator):

    def __init__(
        self, 
        estimator = LogisticRegression(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

    def cv_scores(self, X, y, cv=10): # returns a dictionary of average score and scores list
        scores = cross_val_score(self.estimator, X, y, cv = cv)
        return {
            'values': scores,
            'average': np.average(scores)
        }

### Baseline Model
The baseline model for this dataset will be `0R`

In [3]:
# create the baseline model
baseline = ClassifierContainer(DummyClassifier(strategy="most_frequent"))
baseline.fit(train_input, train_output)

# Check the baseline model's accuracy
pprint(baseline.cv_scores(train_input, train_output, cv = 10))

{'average': 0.5806348194793232,
 'values': array([0.58046768, 0.58046768, 0.58073394, 0.58073394, 0.58073394,
       0.58073394, 0.58073394, 0.58073394, 0.58073394, 0.58027523])}


### Features
The features can be aggregated or added one by one.

In [4]:
# The number of top TweetFeatures to filter through
BOW_MAX_FEATURES = 5000
TFIDF_MAX_FEATURES = BOW_MAX_FEATURES
MAX_FEATURES = 1000

# create the transformers
transformers = [
    TweetBagOfWords(max_features=BOW_MAX_FEATURES), # Bag of words vectorizer
    # TweetTFIDF(max_features=TFIDF_MAX_FEATURES), # TF-IDF vectorizer
    # TweetMetrics(), # Count metrics for tweets
    # TweetWordLengths(max_features=MAX_FEATURES), # Word lengths
    # TweetCharacterFrequencies(max_features=MAX_FEATURES, alphabetic=False), # Character distributions
    # TweetLinks(max_features=MAX_FEATURES), # Links
    TweetHashtags(max_features=MAX_FEATURES), # Hashtags
    TweetReferences(max_features=MAX_FEATURES), # References
    TweetEmoticons(), # Emoticons
    TweetPhonetics(max_features=MAX_FEATURES), # Raw phonetics
    # TweetPoetics(), # Poetic Phonetics
]

# perform the transforms and combine them into the input matrix
train_transforms = [tf.fit_transform(train_input, train_output) for tf in transformers]

# print the shapes of the transforms
for t in train_transforms:
    print(t.shape)

train_features = None
if len(train_transforms) == 1:
    train_features = train_transforms[0]
else:
    train_features = sparse.hstack(train_transforms)

# feature matrix information
print(train_features.shape)
print(type(train_features))

# do the same to test data
test_transforms = [tf.transform(test_input) for tf in transformers]

# print the shapes of the transforms
for t in test_transforms:
    print(t.shape)

test_features = None
if len(test_transforms) == 1:
    test_features = test_transforms[0]
else:
    test_features = sparse.hstack(test_transforms)

# feature matrix information
print(test_features.shape)
print(type(test_features))

(21802, 5000)
(21802, 1000)
(21802, 1000)
(21802, 4)
(21802, 72)
(21802, 7076)
<class 'scipy.sparse._coo.coo_matrix'>
(6099, 5000)
(6099, 1000)
(6099, 1000)
(6099, 4)
(6099, 72)
(6099, 7076)
<class 'scipy.sparse._coo.coo_matrix'>


In [8]:
gnb_accs = []
mnb_accs = []
bnb_accs = []
gnb = GaussianNB()
mnb = MultinomialNB(fit_prior=True)
bnb = BernoulliNB(fit_prior=True)

nb_classifiers = {'mnb': mnb, 'bnb': bnb}
nb_accs = {}
nb_avgs = {}

for nb in nb_classifiers.keys():
    accs = cross_val_score(nb_classifiers[nb], train_features, train_output, cv = 10)
    avg = np.mean(accs)
    print(f"Average {nb} score: {avg:.3f}")
    print(f"[{accs}]")
    nb_accs[nb] = accs
    nb_avgs[nb] = avg

# print('Avg GNB score: {}'.format(np.mean(gnb_accs)))
# print('Avg MNB score: {}'.format(np.mean(mnb_accs)))
# print('Avg BNB score: {}'.format(np.mean(bnb_accs)))

bnb.fit(train_features, train_output)
y_pred = bnb.predict(test_features)
print(y_pred[:5])
predictions1 = pd.DataFrame({'id': test_data['id'], 'sentiment': y_pred})
predictions1.to_csv('./pred-bnb-fp-1.csv', index = False)

Average mnb score: 0.612
[[0.62952774 0.6258597  0.6353211  0.61055046 0.61651376 0.62155963
 0.61880734 0.56743119 0.58899083 0.60366972]]
Average bnb score: 0.621
[[0.63548831 0.63594681 0.64770642 0.62522936 0.6233945  0.63256881
 0.64724771 0.5587156  0.59678899 0.60917431]]
['negative' 'negative' 'neutral' 'neutral' 'positive']


### Model Explorations
Here, different models are cross-validated to find the best one.

References: 
- https://scikit-learn.org/stable/modules/compose.html#pipeline
- https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

In [7]:
# Generate variable choices for the model
MAX_ITER = [1000]
SVC_C = [10, 1, 1e-3, 1e-6]
SVC_KERNEL = ['rbf', 'sigmoid']
SVC_POLY_DEG = [3, 5, 10]
KNN_N = [5, 10, 100, 1000, 5000]
DT_CRITERION = ['gini', 'entropy']
DT_MAX_DEPTH = [None, 1, 10, 1000]
NB_ALPHAS = [10**pw for pw in range(-8, 6, 2)]
NB_FIT_PRIOR = [True, False]
LR_SOLVER = ['newton-cg', 'sag', 'saga', 'lbfgs']
LR_C = [2**pw for pw in range(0, 4)]

# define the pipeline for parameter selection
cachedir = mkdtemp() 
pipeline = Pipeline(
    [
        ('norm', 'passthrough'),
        ('clf', ClassifierContainer()), # allows switching between classifiers
    ],
    memory=cachedir
)

# add the parameters possible (to permutate over)
parameters = [
    { # CLF: Linear Support Vector Classifiers
        'norm': [StandardScaler(with_mean=False)],
        'clf__estimator': [LinearSVC()],
        'clf__estimator__C': SVC_C,
        'clf__estimator__max_iter': MAX_ITER,
    },
    { # CLF: Support Vector Classifiers (no poly here since it's check below)
        'norm': [StandardScaler(with_mean=False)],
        'clf__estimator': [SVC()],
        'clf__estimator__kernel': SVC_KERNEL,
        'clf__estimator__C': SVC_C,
        'clf__estimator__max_iter': MAX_ITER,
    },
    { # CLF: Support Vector Polynomial Classifiers (iterates over degrees)
        'norm': [StandardScaler(with_mean=False)],
        'clf__estimator': [SVC()],
        'clf__estimator__kernel': ['poly'],
        'clf__estimator__degree': SVC_POLY_DEG,
        'clf__estimator__C': SVC_C,
        'clf__estimator__max_iter': MAX_ITER,
    },
    { # CLF: K Nearest-Neighbours
        'clf__estimator': [KNeighborsClassifier()],
        'clf__estimator__n_neighbors': KNN_N,
    },
    { # CLF: Decision Trees
        'clf__estimator': [DecisionTreeClassifier()],
        'clf__estimator__criterion': DT_CRITERION,
        'clf__estimator__max_depth': DT_MAX_DEPTH,
    },
    { # CLF: Multinomial and Bernoulli Naive Bayes
        'clf__estimator': [MultinomialNB(), BernoulliNB()],
        'clf__estimator__alpha': NB_ALPHAS,
        'clf__estimator__fit_prior': NB_FIT_PRIOR,
    },
    { # CLF: Logistic Regression
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__C': LR_C,
        'clf__estimator__max_iter': MAX_ITER,
        'clf__estimator__solver': LR_SOLVER,
    },
]

# this next code needs to occur in a main block
if __name__ == "__main__":    
    # find the best classifier by grid search
    verbosity = 1
    grid_search = GridSearchCV(pipeline, parameters, 
        cv=StratifiedKFold(random_state=101, shuffle=True), 
        verbose=verbosity,
        n_jobs = -1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(train_features, train_output)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    pprint(best_parameters)

    y_pred = grid_search.predict(test_features)
    print(y_pred[:5])
    predictions1 = pd.DataFrame({'id': test_data['id'], 'sentiment': y_pred})
    predictions1.to_csv('./pred-2.csv', index = False)

rmtree(cachedir)

Performing grid search...
pipeline: ['norm', 'clf']
parameters:
[{'clf__estimator': [LinearSVC()],
  'clf__estimator__C': [10, 1, 0.001, 1e-06],
  'clf__estimator__max_iter': [1000],
  'norm': [StandardScaler(with_mean=False)]},
 {'clf__estimator': [SVC()],
  'clf__estimator__C': [10, 1, 0.001, 1e-06],
  'clf__estimator__kernel': ['rbf', 'sigmoid'],
  'clf__estimator__max_iter': [1000],
  'norm': [StandardScaler(with_mean=False)]},
 {'clf__estimator': [SVC()],
  'clf__estimator__C': [10, 1, 0.001, 1e-06],
  'clf__estimator__degree': [3, 5, 10],
  'clf__estimator__kernel': ['poly'],
  'clf__estimator__max_iter': [1000],
  'norm': [StandardScaler(with_mean=False)]},
 {'clf__estimator': [KNeighborsClassifier()],
  'clf__estimator__n_neighbors': [5, 10, 100, 1000, 5000]},
 {'clf__estimator': [DecisionTreeClassifier()],
  'clf__estimator__criterion': ['gini', 'entropy'],
  'clf__estimator__max_depth': [None, 1, 10, 1000]},
 {'clf__estimator': [MultinomialNB(), BernoulliNB()],
  'clf__estima

