### Feature ablation + Experimental setup

#### Loading datasets

In [1]:
import pandas as pd
import numpy as np

training_df = pd.read_csv('data/train.txt', sep='\t', quotechar='~')
test_df = pd.read_csv('data/test.txt', sep='\t', quotechar='~')

training_true_labels = np.array(training_df['Label'])
training_tweets = np.array(training_df['Tweet text'])

test_true_labels = np.array(test_df['Label'])
test_tweets = np.array(test_df['Tweet text'])

#### Load features for training and test data

In [2]:
import sentiment_features
import semantical_features
import lexical_features
import syntactic_features

prep_training_tweets = lexical_features.preprocess(training_tweets)
prep_test_tweets = lexical_features.preprocess(test_tweets)
freq_list = lexical_features.get_frequencies(prep_training_tweets, 200)

train_lex_features = lexical_features.get_lexical_features(prep_training_tweets, freq_list)
train_syn_features = syntactic_features.get_syntactic_features(training_tweets)
train_sen_features = sentiment_features.get_sentiment_features(training_tweets)
train_sem_features = semantical_features.get_semantic_features(training_tweets)

test_lex_features = lexical_features.get_lexical_features(prep_test_tweets, freq_list)
test_syn_features = syntactic_features.get_syntactic_features(test_tweets)
test_sen_features = sentiment_features.get_sentiment_features(test_tweets)
test_sem_features = semantical_features.get_semantic_features(test_tweets)

train_comb_features = np.hstack((train_lex_features, train_syn_features, train_sen_features, train_sem_features))
test_comb_features = np.hstack((test_lex_features, test_syn_features, test_sen_features, test_sem_features))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pinda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pinda\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Pinda\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Pinda\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Pinda\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pinda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-d

HBox(children=(FloatProgress(value=0.0, max=3834.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3834.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3834.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=784.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=784.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=784.0), HTML(value='')))




#### Building feature combinations (change this if you only want to run a subset of the features)

In [13]:
training_feature_dict = {'Syntactic': train_syn_features, 'Semantic': train_sem_features, 'Sentiment': train_sen_features, 'Lexical': train_lex_features, 'Combined': train_comb_features}
test_feature_dict = {'Syntactic': test_syn_features, 'Semantic': test_sem_features, 'Sentiment': test_sen_features, 'Lexical': test_lex_features, 'Combined': test_comb_features}

#### Generate parameter grid for grid search

In [4]:
power = -15
gamma_values = list()
while power <= 3:
    gamma = 2**power
    gamma_values.append(gamma)
    power += 2 
 
power = -5
c_values = list()
while power <= 15:
    c = 2**power
    c_values.append(c)
    power += 2
    
param_grid = {'C': c_values, 'gamma': gamma_values,'kernel': ['rbf']}

#### Run experiment

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from joblib import dump, load

scoring = {'Accuracy': 'accuracy', 'F1-score': 'f1', 'Recall': 'recall', 'Precision': 'precision'}

for key in training_feature_dict.keys():
    
    training_features = training_feature_dict[key]
    test_features = test_feature_dict[key]

    grid = GridSearchCV(SVC(), param_grid, scoring=scoring, refit='Accuracy', verbose=10)
    grid.fit(training_features, training_true_labels)
    dump(grid, key + '.joblib')

#### Gathering of results + evaluation using the pickled models

In [17]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tabulate import tabulate
from joblib import dump, load

keyset = ['Lexical', 'Syntactic', 'Sentiment', 'Semantic', 'Combined']
metrics = ['Accuracy', 'F1-score', 'Recall', 'Precision']
results = []


for key in keyset:
    model = load(key + '.joblib') 
    test_features = test_feature_dict[key]
    feature_vec_size = len(test_features[0])
    
    optimal_params = model.cv_results_['params'][model.best_index_]
    
    avg_metrics = []
    std_metrics = []
    for metric in metrics:
        avg_metric = model.cv_results_['mean_test_' + metric][model.best_index_]
        std_metric = model.cv_results_['std_test_' + metric][model.best_index_]
        avg_metrics.append(avg_metric)
        std_metrics.append(std_metric)
    
    predictions = model.best_estimator_.predict(test_features)
    test_accuracy = accuracy_score(test_true_labels, predictions)
    test_f1 = f1_score(test_true_labels, predictions)
    test_precision = precision_score(test_true_labels, predictions)
    test_recall = recall_score(test_true_labels, predictions)
    
    results.append([key, feature_vec_size, optimal_params['C'], optimal_params['gamma']] + avg_metrics + std_metrics + [test_accuracy, test_f1, test_precision, test_recall])

print(tabulate(results, headers=['Model', 'Feature vector size', 'Optimal C', 'Optimal gamma', 'Training avg Acc', 'Training avg F1', 'Training avg Prec', 'Training avg Recall', 'Training std Acc', 'Training std F1', 'Training std Prec', 'Training std Recall', 'Test Acc', 'Test F1', 'Test Prec', 'Test Recall'], tablefmt='orgtbl'))

for result in results:
    print(' & '.join(map(str,result)))

| Model     |   Feature vector size |   Optimal C |   Optimal gamma |   Training avg Acc |   Training avg F1 |   Training avg Prec |   Training avg Recall |   Training std Acc |   Training std F1 |   Training std Prec |   Training std Recall |   Test Acc |   Test F1 |   Test Prec |   Test Recall |
|-----------+-----------------------+-------------+-----------------+--------------------+-------------------+---------------------+-----------------------+--------------------+-------------------+---------------------+-----------------------+------------+-----------+-------------+---------------|
| Lexical   |                   808 |       512   |     3.05176e-05 |           0.64397  |          0.649707 |            0.662464 |              0.637703 |         0.0149951  |        0.0142495  |           0.0181897 |            0.0165071  |   0.65051  |  0.592262 |    0.551247 |      0.639871 |
| Syntactic |                    76 |         2   |     0.0078125   |           0.597283 |          0.6