### Syntactic Features

In [22]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once at module level; it is the
# callable that get_pos_and_ner applies to each tweet.
nlp = spacy.load("en_core_web_sm")

def get_pos_and_ner(tweet):
    """Run the module-level ``nlp`` pipeline on a tweet and collect tags.

    Args:
        tweet: Raw tweet text passed to ``nlp``.

    Returns:
        A 2-tuple ``(token_info, ents)``:
          * ``token_info`` -- list of ``(orth_, pos_, tag_, ent_type_)``
            tuples, one per token, skipping tokens whose ``pos_`` is
            ``'SPACE'``.
          * ``ents`` -- the ``ents`` attribute of the processed document.
    """
    doc = nlp(tweet)

    token_info = []
    for token in doc:
        # Whitespace tokens carry no syntactic information; leave them out.
        if token.pos_ == 'SPACE':
            continue
        token_info.append(
            (token.orth_, token.pos_, token.tag_, token.ent_type_)
        )

    return (token_info, doc.ents)

In [40]:
from collections import Counter

def get_syntactic_features(tweets):
    """Build a POS-tag / named-entity feature vector for every tweet.

    Each tweet is analysed with ``get_pos_and_ner``. For every tag in the
    fixed tag set (18 tags, in the order listed below) four features are
    emitted, followed by three entity features, giving 18 * 4 + 3 = 75
    values per tweet:

      * per tag: presence flag (0/1), count capped at 2, raw count, and the
        count divided by the number of (non-SPACE) tokens in the tweet;
      * entities: presence flag (0/1), number of entities returned for the
        tweet, and number of tokens whose entity type is non-empty.

    Args:
        tweets: Iterable of raw tweet strings.

    Returns:
        A list with one feature list per tweet, in input order.
    """
    # Coarse POS tags in feature order. 'SPACE' is deliberately absent:
    # get_pos_and_ner already drops those tokens. Defined once, outside the
    # loop, because it never changes between tweets.
    tagset = ('ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ',
              'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ',
              'SYM', 'VERB', 'X')

    syn_features = []

    for tweet in tweets:
        pos_list, ents_list = get_pos_and_ner(tweet)
        num_tokens = len(pos_list)
        pos_map = Counter(pos for (_, pos, _, _) in pos_list)

        pos_features = []
        for tag in tagset:
            count = pos_map[tag]  # Counter yields 0 for tags never seen
            # An empty or whitespace-only tweet has no tokens; report a
            # share of 0.0 instead of raising ZeroDivisionError.
            perc_feat = count / num_tokens if num_tokens else 0.0
            pos_features.extend(
                (int(count > 0), min(count, 2), count, perc_feat)
            )

        num_ents = len(ents_list)
        num_tokens_ents = sum(
            1 for (_, _, _, ent_type) in pos_list if ent_type != ''
        )
        ent_features = [int(num_ents > 0), num_ents, num_tokens_ents]

        syn_features.append(pos_features + ent_features)

    return syn_features