### Syntactic Features

In [None]:
from collections import Counter

def get_syntactic_features(tweets):
    """Build one syntactic feature vector per tweet.

    Every tweet is passed to ``get_pos_and_ner`` and the result is turned
    into a flat list of numbers:

    * four features per coarse POS tag, in tag-set order: presence (0/1),
      frequency capped at 2, raw frequency, and the tag's share of all
      tokens in the tweet;
    * three entity features: whether any entity was found (0/1), the number
      of entities, and the number of tokens that carry an entity type.

    Args:
        tweets: iterable of tweets; each item is handed unchanged to
            ``get_pos_and_ner``.

    Returns:
        A list holding one feature list per tweet
        (4 features * 18 tags + 3 entity features = 75 values each).
    """
    # All coarse POS tags; the 'SPACE' tag is deliberately not included.
    # Built once instead of once per tweet.
    tagset = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
    syn_features = []

    for tweet in tweets:
        # Each pos_list entry is unpacked as a 4-tuple whose second item is
        # the POS tag and whose fourth item is the entity type.
        pos_list, ents_list = get_pos_and_ner(tweet)
        num_tokens = len(pos_list)
        pos_map = Counter(pos for (_, pos, _, _) in pos_list)

        pos_features = []
        for tag in tagset:
            count = pos_map[tag]
            pos_features.append(1 if count > 0 else 0)  # presence
            pos_features.append(min(count, 2))          # frequency capped at 2
            pos_features.append(count)                  # raw frequency
            # Guard against tweets that yield no tokens at all, which would
            # otherwise raise ZeroDivisionError.
            pos_features.append(count / num_tokens if num_tokens else 0.0)

        # TODO: verb-tense features.

        num_ents = len(ents_list)
        bin_ents = 1 if num_ents > 0 else 0
        # Count only tokens that actually carry an entity type; without the
        # filter this was always equal to num_tokens.
        # NOTE(review): assumes non-entity tokens have an empty/falsy entity
        # type -- confirm against get_pos_and_ner.
        num_tokens_ents = sum(1 for (_, _, _, ent_type) in pos_list if ent_type)
        # TODO: frequency of entity tokens.

        ent_features = [bin_ents, num_ents, num_tokens_ents]

        syn_features.append(pos_features + ent_features)

    return syn_features

### Feature ablation + Experimental setup

#### Loading datasets

In [None]:
import pandas as pd

# Both corpora are tab-separated files that use '~' as the quote character.
training_df, test_df = (
    pd.read_csv(path, sep='\t', quotechar='~')
    for path in ('data/training.txt', 'data/test.txt')
)

# Split each frame into an array of labels and an array of tweet texts.
training_true_labels, training_tweets = (
    np.array(training_df[column]) for column in ('Label', 'Tweet text')
)
test_true_labels, test_tweets = (
    np.array(test_df[column]) for column in ('Label', 'Tweet text')
)

#### Load features for training and test data

In [None]:
import Semantical_features

# Feature extraction for the training and test tweets. The lexical and
# sentiment feature sets are still placeholders.

#train_lex_features =
train_syn_features = get_syntactic_features(training_tweets)
#train_sen_features =
# `import Semantical_features` binds only the module name, so the extractor
# has to be reached through the module; the bare name would be a NameError.
# NOTE(review): assumes get_semantical_features is defined in that module --
# confirm.
train_sem_feature = Semantical_features.get_semantical_features(training_tweets)

# Sanity check on the number of training vectors (only displayed when this
# is the last expression of a cell).
len(train_syn_features)

#test_lex_features =
test_syn_features = get_syntactic_features(test_tweets)
#test_sen_features =
test_sem_feature = Semantical_features.get_semantical_features(test_tweets)

# Feature sets taking part in the experiment, keyed by name. The semantic
# features are computed above but are not part of these dictionaries yet.
training_feature_dict = {'Syntactic': train_syn_features}
test_feature_dict = {'Syntactic': test_syn_features}

#### Generate parameter grid for grid search

In [None]:
# Exponentially spaced search grid for the RBF kernel:
# gamma = 2^-15, 2^-13, ..., 2^3 and C = 2^-5, 2^-3, ..., 2^15.
gamma_values = [2**exponent for exponent in range(-15, 4, 2)]
c_values = [2**exponent for exponent in range(-5, 16, 2)]

param_grid = {
    'C': c_values,
    'gamma': gamma_values,
    'kernel': ['rbf'],
}

#### Run experiment

In [None]:
from sklearn.datasets import *
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Metrics recorded for every grid point; the final model is refit using the
# 'Accuracy' entry.
scoring = {
    'Accuracy': 'accuracy',
    'F1-score': 'f1',
    'Recall': 'recall',
    'Precision': 'precision',
}

# One grid search per feature set. After the loop, `grid` and
# `test_features` refer to the last feature set that was processed.
for key, training_features in training_feature_dict.items():
    test_features = test_feature_dict[key]

    grid = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        scoring=scoring,
        refit='Accuracy',
        verbose=10,
    )
    grid.fit(training_features, training_true_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Predict the test set with the refit best model and report its accuracy.
best_model = grid.best_estimator_
predictions = best_model.predict(test_features)
len(predictions)

accuracy_score(y_true=test_true_labels, y_pred=predictions)

In [None]:
from sklearn.metrics import accuracy_score

# Predict the test set with the refit best model and report its accuracy.
best_model = grid.best_estimator_
predictions = best_model.predict(test_features)
len(predictions)

accuracy_score(y_true=test_true_labels, y_pred=predictions)

In [None]:
from collections import Counter

def get_syntactic_features(tweets):
    """Build one syntactic feature vector per tweet.

    Every tweet is passed to ``get_pos_and_ner`` and the result is turned
    into a flat list of numbers:

    * four features per coarse POS tag, in tag-set order: presence (0/1),
      frequency capped at 2, raw frequency, and the tag's share of all
      tokens in the tweet;
    * three entity features: whether any entity was found (0/1), the number
      of entities, and the number of tokens that carry an entity type.

    Args:
        tweets: iterable of tweets; each item is handed unchanged to
            ``get_pos_and_ner``.

    Returns:
        A list holding one feature list per tweet
        (4 features * 18 tags + 3 entity features = 75 values each).
    """
    # All coarse POS tags; the 'SPACE' tag is deliberately not included.
    # Built once instead of once per tweet.
    tagset = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
    syn_features = []

    for tweet in tweets:
        # Each pos_list entry is unpacked as a 4-tuple whose second item is
        # the POS tag and whose fourth item is the entity type.
        pos_list, ents_list = get_pos_and_ner(tweet)
        num_tokens = len(pos_list)
        pos_map = Counter(pos for (_, pos, _, _) in pos_list)

        pos_features = []
        for tag in tagset:
            count = pos_map[tag]
            pos_features.append(1 if count > 0 else 0)  # presence
            pos_features.append(min(count, 2))          # frequency capped at 2
            pos_features.append(count)                  # raw frequency
            # Guard against tweets that yield no tokens at all, which would
            # otherwise raise ZeroDivisionError.
            pos_features.append(count / num_tokens if num_tokens else 0.0)

        # TODO: verb-tense features.

        num_ents = len(ents_list)
        bin_ents = 1 if num_ents > 0 else 0
        # Count only tokens that actually carry an entity type; without the
        # filter this was always equal to num_tokens.
        # NOTE(review): assumes non-entity tokens have an empty/falsy entity
        # type -- confirm against get_pos_and_ner.
        num_tokens_ents = sum(1 for (_, _, _, ent_type) in pos_list if ent_type)
        # TODO: frequency of entity tokens.

        ent_features = [bin_ents, num_ents, num_tokens_ents]

        syn_features.append(pos_features + ent_features)

    return syn_features

### Feature ablation + Experimental setup

#### Loading datasets

In [None]:
import pandas as pd

# Both corpora are tab-separated files that use '~' as the quote character.
training_df, test_df = (
    pd.read_csv(path, sep='\t', quotechar='~')
    for path in ('data/training.txt', 'data/test.txt')
)

# Split each frame into an array of labels and an array of tweet texts.
training_true_labels, training_tweets = (
    np.array(training_df[column]) for column in ('Label', 'Tweet text')
)
test_true_labels, test_tweets = (
    np.array(test_df[column]) for column in ('Label', 'Tweet text')
)

#### Load features for training and test data

In [None]:
import Semantical_features

# Feature extraction for the training and test tweets. The lexical and
# sentiment feature sets are still placeholders.

#train_lex_features =
train_syn_features = get_syntactic_features(training_tweets)
#train_sen_features =
# `import Semantical_features` binds only the module name, so the extractor
# has to be reached through the module; the bare name would be a NameError.
# NOTE(review): assumes get_semantical_features is defined in that module --
# confirm.
train_sem_feature = Semantical_features.get_semantical_features(training_tweets)

# Sanity check on the number of training vectors (only displayed when this
# is the last expression of a cell).
len(train_syn_features)

#test_lex_features =
test_syn_features = get_syntactic_features(test_tweets)
#test_sen_features =
test_sem_feature = Semantical_features.get_semantical_features(test_tweets)

# Feature sets taking part in the experiment, keyed by name. The semantic
# features are computed above but are not part of these dictionaries yet.
training_feature_dict = {'Syntactic': train_syn_features}
test_feature_dict = {'Syntactic': test_syn_features}

#### Generate parameter grid for grid search

In [None]:
# Exponentially spaced search grid for the RBF kernel:
# gamma = 2^-15, 2^-13, ..., 2^3 and C = 2^-5, 2^-3, ..., 2^15.
gamma_values = [2**exponent for exponent in range(-15, 4, 2)]
c_values = [2**exponent for exponent in range(-5, 16, 2)]

param_grid = {
    'C': c_values,
    'gamma': gamma_values,
    'kernel': ['rbf'],
}

#### Run experiment

In [None]:
from sklearn.datasets import *
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Metrics recorded for every grid point; the final model is refit using the
# 'Accuracy' entry.
scoring = {
    'Accuracy': 'accuracy',
    'F1-score': 'f1',
    'Recall': 'recall',
    'Precision': 'precision',
}

# One grid search per feature set. After the loop, `grid` and
# `test_features` refer to the last feature set that was processed.
for key, training_features in training_feature_dict.items():
    test_features = test_feature_dict[key]

    grid = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        scoring=scoring,
        refit='Accuracy',
        verbose=10,
    )
    grid.fit(training_features, training_true_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Predict the test set with the refit best model and report its accuracy.
best_model = grid.best_estimator_
predictions = best_model.predict(test_features)
len(predictions)

accuracy_score(y_true=test_true_labels, y_pred=predictions)

In [7]:
from sklearn.metrics import accuracy_score

# Predict the test set with the refit best model and report its accuracy.
best_model = grid.best_estimator_
predictions = best_model.predict(test_features)
len(predictions)

accuracy_score(y_true=test_true_labels, y_pred=predictions)

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])