# Sample Authors and Prepare Feeds for Feature Extraction

In [5]:
from feature_extraction_functions import *
team_seed = 13 + 4 + 5
random.seed(team_seed) 

num_native_authors_to_sample = 15
num_nonnat_authors_to_sample = 15

# Space-tokenized, long-chain version of feeds (Arthur's version + some cleaning and wide-to-long conversion)
eng_nonnat = pd.read_pickle('Data/Classified/non_native_english_40feeds')
eng_nat = pd.read_pickle('Data/Classified/native_english_40feeds')
eng_nonnat['proficiency'] = "N" # native
eng_nat['proficiency'] = "L" # learner
# if we want a more uniform timerange, we can keep if timerange < x here
eng_nonnat_sample = eng_nonnat.sample(num_nonnat_authors_to_sample, random_state=team_seed)
eng_native_sample = eng_nat.sample(num_native_authors_to_sample, random_state=team_seed)
eng_feeds = pd.concat([eng_nonnat_sample, eng_native_sample], ignore_index=False, axis=0)
eng_feeds['author'] = eng_feeds.index
eng_feeds = pd.wide_to_long(eng_feeds, ["file", "slices"], i="author", j="intra_author_feed_id").sort_index()
eng_feeds = eng_feeds.rename(columns={"slices": "comment_lengths", "file": "feed_tokens_space"})

# String version of feeds
def feed_string(feed):
    return ' '.join(feed)
eng_feeds['feed_string'] = eng_feeds['feed_tokens_space'].apply(feed_string)

# List-of-comments version of feeds
## Doing this via loop because two columns involved in function instead of one...
def create_comment_word_indices(comment_lengths):
    np_comment_lengths = np.array(comment_lengths)
    return np.cumsum(np_comment_lengths)
eng_feeds['comment_word_indices'] = eng_feeds['comment_lengths'].apply(create_comment_word_indices)

eng_feeds['feed_comment_list'] = ""
for index, row in eng_feeds.iterrows():
    comm_w_indices_temp = row['comment_word_indices']
    feed_tokens_space_temp = row['feed_tokens_space']
    inner_list = []
    for i in range(len(comm_w_indices_temp)):
        if i == 0:
            inner_list.append(feed_tokens_space_temp[0:comm_w_indices_temp[i]])
        else:
            inner_list.append(feed_tokens_space_temp[comm_w_indices_temp[i-1]:comm_w_indices_temp[i]])
    eng_feeds.at[index,'feed_comment_list'] = inner_list
eng_feeds = eng_feeds.drop('comment_word_indices', axis=1)

# List-of-comments w/ punctuation stripped and lowercase applied version of feeds
def strip_punc_and_lower_nested_list(feed_comment_list):
    feed_comment_list_nopunc_lower = []
    for comment in feed_comment_list:
        feed_comment_list_nopunc_lower.append(re.sub(r'[^A-Za-z0-9 ]+', '', ' '.join(comment)).lower().split()) 
    return feed_comment_list_nopunc_lower
eng_feeds['feed_comment_list_nopunc_lower'] = eng_feeds['feed_comment_list'].apply(strip_punc_and_lower_nested_list)

# List-of-comments Spacy-tokenized version of feeds
tokenizer_wrapper(eng_feeds, 'feed_comment_list')

eng_feeds.to_pickle("eng_feeds_pre_split.pkl")

eng_feeds.head(5)

Tokenizing feeds...


  0%|          | 0/1200 [00:00<?, ?it/s]

Feeds tokenized in 273.94717597961426 seconds


Unnamed: 0_level_0,Unnamed: 1_level_0,timerange,proficiency,feed_tokens_space,comment_lengths,feed_string,feed_comment_list,feed_comment_list_nopunc_lower,feed_comment_list_spacy
author,intra_author_feed_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Brad_Ethan,1,270,N,"[917, right, swipes, and, 119, matches, while,...","[12, 17, 34, 15, 13, 15, 8, 96, 27, 19, 26, 12...",917 right swipes and 119 matches while being t...,"[[917, right, swipes, and, 119, matches, while...","[[917, right, swipes, and, 119, matches, while...","[(917, right, swipes, and, 119, matches, while..."
Brad_Ethan,2,270,N,"[I, don't, get, why, move, on, from, Wentz...,...","[62, 24, 80, 85, 15, 18, 25, 112, 17, 6, 22, 36]",I don't get why move on from Wentz... Replace ...,"[[I, don't, get, why, move, on, from, Wentz......","[[i, dont, get, why, move, on, from, wentz, re...","[(I, do, n't, get, why, move, on, from, Wentz,..."
Brad_Ethan,3,270,N,"[It's, because, we, are, wasting, roster, spot...","[11, 44, 11, 5, 24, 12, 12, 44, 37, 18, 39, 7,...",It's because we are wasting roster spots with ...,"[[It's, because, we, are, wasting, roster, spo...","[[its, because, we, are, wasting, roster, spot...","[(It, 's, because, we, are, wasting, roster, s..."
Brad_Ethan,4,270,N,"[I, mean, we, could, keep, Fischer, and, roll,...","[89, 38, 48, 66, 61, 22, 19, 30, 14, 17, 10, 4...",I mean we could keep Fischer and roll with Fis...,"[[I, mean, we, could, keep, Fischer, and, roll...","[[i, mean, we, could, keep, fischer, and, roll...","[(I, mean, we, could, keep, Fischer, and, roll..."
Brad_Ethan,5,270,N,"[I, don't, think, Skip, actually, means, it., ...","[43, 4, 17, 39, 13, 11, 30, 41, 14, 46, 11, 23...",I don't think Skip actually means it. Isn't hi...,"[[I, don't, think, Skip, actually, means, it.,...","[[i, dont, think, skip, actually, means, it, i...","[(I, do, n't, think, Skip, actually, means, it..."


# Train/Test Split

In [6]:
from feature_extraction_functions import *
team_seed = 13 + 4 + 5
random.seed(team_seed) 

eng_feeds = pd.read_pickle("eng_feeds_pre_split.pkl")

labelencoder = LabelEncoder()

eng_feeds = eng_feeds.reset_index()

t = eng_feeds['author']
t = labelencoder.fit_transform(t)
eng_feeds['author_id'] = t.tolist()

t = eng_feeds['proficiency']
t = labelencoder.fit_transform(t)
eng_feeds['proficiency_id'] = t.tolist()

from sklearn.model_selection import train_test_split
y = eng_feeds[['author_id', 'intra_author_feed_id']]
X_train, X_test, y_train, y_test = train_test_split(eng_feeds, y, test_size=0.10, random_state=team_seed)

# Extract Features

In [7]:
warnings.filterwarnings("ignore")

def extract_features(feeds_aug, train_or_test):
    
    # Number of Comments, Median comment length

    feeds_aug['num_comments'] = feeds_aug['comment_lengths'].apply(len)
    feeds_aug['comment_length_median'] = feeds_aug['comment_lengths'].apply(statistics.median)
    feeds_aug = feeds_aug.drop('comment_lengths', axis=1)

    # Character Count, Alphabet Count & Proportion, Digit Count & Proportion, Punctuation Count & Proportion

    feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'letter')
    feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'digit')
    feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'punctuation')
    feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'whitespace')
    feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'character')
    
    # Word Count, Average Word Length, Word Length Distribution (Freq of words length 1-20 letters), Word Case Distribution (All lowercase / First-upper-rest-lowercase / All uppercase / Other), Character case distribution (lowercase / uppercase)

    word_count_wrapper(feeds_aug, 'feed_string')
    word_length_avg_wrapper(feeds_aug, 'feed_string')
    word_length_distribution_wrapper(feeds_aug, 'feed_string')
    word_short_prop_wrapper(feeds_aug, 'word_length_distribution', 'word_count')
    letter_case_distribution_wrapper(feeds_aug, 'feed_string')
    word_case_distribution_wrapper(feeds_aug, 'feed_string')

    # Misspellings Prop

    misspelled_prop_wrapper(feeds_aug, 'feed_string', 'misspelled_prop')
    
    #Stop Word proportion of Tokens
    stop_words_proportion_wrapper(feeds_aug, 'feed_comment_list_spacy', 'stop_words_proportion')
    
    # Vocabulary Richness: Hapax Legomena Proportion of Total Tokens, Hapax Legomena Proportion of Unique Tokens, Unique Tokens over Total Tokens
    # # https://eprints.qut.edu.au/8019/1/8019.pdf
    
    hapax_legomena_proportion_wrapper(feeds_aug, 'feed_comment_list_spacy', 'hapax_legomena_prop_tot_tokens', 'total') # Note: ignores stop words
    hapax_legomena_proportion_wrapper(feeds_aug, 'feed_comment_list_spacy', 'hapax_legomena_prop_unique_tokens', 'unique') # Note: ignores stop words
    token_type_ratio_wrapper(feeds_aug, 'feed_comment_list_spacy', 'token_type_ratio') # Note: ignores stop words

    if train_or_test == "train":
        
        global letter_1gram_collection_fromtrain
        global letter_2gram_collection_fromtrain
        global letter_3gram_collection_fromtrain
        global letter_4gram_collection_fromtrain
        global digit_1gram_collection_fromtrain
        global digit_2gram_collection_fromtrain
        global digit_3gram_collection_fromtrain
        global punctuation_1gram_collection_fromtrain
        global punctuation_2gram_collection_fromtrain
        global punctuation_3gram_collection_fromtrain
        global word_1gram_collection_fromtrain
        global word_2gram_collection_fromtrain
        global POS_tags_1gram_collection_fromtrain
        global POS_tags_2gram_collection_fromtrain
        global POS_tags_3gram_collection_fromtrain
        
        # Letter, Digit, and Punctuation n-grams
        letter_1gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_1gram', 1, 'letter')
        letter_2gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_2gram', 2, 'letter')
        letter_3gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_3gram', 3, 'letter')
        #letter_4gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_4gram', 4, 'letter')

        digit_1gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_1gram', 1, 'digit')
        digit_2gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_2gram', 2, 'digit')
        digit_3gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_3gram', 3, 'digit')

        punctuation_1gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_1gram', 1, 'punctuation')
        punctuation_2gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_2gram', 2, 'punctuation')
        #punctuation_3gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_3gram', 3, 'punctuation')

        # Word ngrams
        word_1gram_collection_fromtrain = word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 1)
        #word_2gram_collection_fromtrain = word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 2)

        # POS n-grams
        POS_tags_1gram_collection_fromtrain = POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_1gram', 1)
        POS_tags_2gram_collection_fromtrain = POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_2gram', 2)
        #POS_tags_3gram_collection_fromtrain = POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_3gram', 3)

    elif train_or_test == "test":
        
        # Letter, Digit, and Punctuation n-grams
        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_1gram', 1, 'letter', letter_1gram_collection_fromtrain)
        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_2gram', 2, 'letter', letter_2gram_collection_fromtrain)
        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_3gram', 3, 'letter', letter_3gram_collection_fromtrain)
        #character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_4gram', 4, 'letter', letter_4gram_collection_fromtrain)

        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_1gram', 1, 'digit', digit_1gram_collection_fromtrain)
        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_2gram', 2, 'digit', digit_2gram_collection_fromtrain)
        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_3gram', 3, 'digit', digit_3gram_collection_fromtrain)

        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_1gram', 1, 'punctuation', punctuation_1gram_collection_fromtrain)
        character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_2gram', 2, 'punctuation', punctuation_2gram_collection_fromtrain)
        #character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_3gram', 3, 'punctuation', punctuation_3gram_collection_fromtrain)

        # Word ngrams
        word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 1, word_1gram_collection_fromtrain)
        #word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 2, word_2gram_collection_fromtrain)

        # POS n-grams
        POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_1gram', 1, POS_tags_1gram_collection_fromtrain)
        POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_2gram', 2, POS_tags_2gram_collection_fromtrain)
        #POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_3gram', 3, POS_tags_3gram_collection_fromtrain)

    return feeds_aug

X_train = extract_features(X_train, "train")
X_test = extract_features(X_test, "test")

warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

def expand_feature_set(dataframe):
    dataframe = dataframe[['proficiency', 'comment_length_median', 'letter_prop', 'digit_prop', 'punctuation_prop', 'whitespace_prop', 'word_length_avg', 'word_length_distribution', 'word_short_prop', 'letter_case_distribution', 'word_case_distribution', 'misspelled_prop', 'stop_words_proportion', 'hapax_legomena_prop_tot_tokens', 'hapax_legomena_prop_unique_tokens', 'token_type_ratio', 'letter_1gram', 'letter_2gram', 'letter_3gram', 'digit_1gram', 'digit_2gram', 'digit_3gram', 'punctuation_1gram', 'punctuation_2gram', 'word_1gram', 'POS_tag_1gram', 'POS_tag_2gram']]
    for col in dataframe.columns:
        if type(dataframe[col].iloc[0]) == list:
            newcols = [col + "_" + str(i) for i in range(1,len(dataframe[col].iloc[0]) + 1)]
            dataframe[newcols] = pd.DataFrame(dataframe[col].tolist(), index= dataframe.index)
            dataframe = dataframe.drop([col], axis = 1)
        elif type(dataframe[col].iloc[0]) == str and col != 'proficiency':
            dataframe[col] = pd.to_numeric(dataframe[col], downcast="float")
            
    return dataframe

X_train = expand_feature_set(X_train)
X_test = expand_feature_set(X_test)

X_train.to_pickle("X_train_w_features.pkl")
X_test.to_pickle("X_test_w_features.pkl")
y_train.to_pickle("y_train.pkl")
y_test.to_pickle("y_test.pkl")

Performing letter count & proportion...
Performed letter count & proportion in 3.086843490600586 seconds
Performing digit count & proportion...
Performed digit count & proportion in 1.0787100791931152 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 2.15299391746521 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.8084206581115723 seconds
Performing character count & proportion...
Performed character count & proportion in 0.038818359375 seconds
Performing word count...
Performed word count in 0.19438815116882324 seconds
Performing word length avg...
Performed word length avg in 0.26510119438171387 seconds
Performing word length distribution...
Performed word length distribution in 0.3770272731781006 seconds
Performing word count short...
Performed word count short in 0.0142059326171875 seconds
Performing letter case distribution...
Performed letter case distribution in 0.5064380168914795 sec

  0%|          | 0/1080 [00:00<?, ?it/s]

Performed misspellings proportion in 1216.4242560863495 seconds
Performing stop words ratio...
Performed stop words ratio in 3.233867883682251 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 2.8483874797821045 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 2.943195104598999 seconds
Performing token type ratio...
Performed token type ratio in 3.6032187938690186 seconds
Performing train letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train letter 1-gram in 7.650003910064697 seconds
Returned up to 50 most common letter 1-grams for feature extraction on test set.
Performing train letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'el', 'en', 'er', 'es', 'ha', 'he', 'hi', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'ri', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train letter 2-gram in 16.159202575683594 seconds
Returned up to 50 most common letter 2-grams for feature extraction on test set.
Performing train letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'con', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'lly', 'nce', 'not', 'ome', 'one', 'ore', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tio', 'uld', 'use', 'ust', 'ver', 'wit', 'you']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train letter 3-gram in 325.5478582382202 seconds
Returned up to 50 most common letter 3-grams for feature extraction on test set.
Performing train digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train digit 1-gram in 3.48246693611145 seconds
Returned up to 50 most common digit 1-grams for feature extraction on test set.
Performing train digit 2-gram...
['00', '01', '02', '04', '05', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '35', '40', '45', '47', '50', '52', '60', '65', '70', '75', '78', '79', '80', '90', '93', '95', '97', '98', '99']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train digit 2-gram in 60.71894383430481 seconds
Returned up to 50 most common digit 2-grams for feature extraction on test set.
Performing train digit 3-gram...
['000', '001', '010', '013', '014', '015', '016', '017', '018', '019', '020', '021', '022', '080', '090', '100', '102', '112', '120', '125', '130', '147', '150', '160', '166', '170', '180', '190', '195', '197', '198', '199', '200', '201', '202', '210', '220', '250', '300', '320', '350', '400', '500', '600', '700', '707', '716', '800', '900', '950']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train digit 3-gram in 29.81190252304077 seconds
Returned up to 50 most common digit 3-grams for feature extraction on test set.
Performing train punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '’', '“', '”']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train punctuation 1-gram in 4.676625490188599 seconds
Returned up to 50 most common punctuation 1-grams for feature extraction on test set.
Performing train punctuation 2-gram...
['!!', '!"', '!)', '!<', '""', '")', '",', '".', '"?', '%.', "'.", '("', '(/', '))', '),', ').', '):', '**', '*,', '*.', ',"', '--', '->', '."', '.)', '.*', '.,', '..', '.]', '.”', '/)', '//', ':(', ':)', ':/', ';)', '>!', '?!', '?"', '?)', '??', '?]', '?”', '\\*', '\\_', '](', '__', '~~', '”,', '”.']


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train punctuation 2-gram in 47.17766237258911 seconds
Returned up to 50 most common punctuation 2-grams for feature extraction on test set.
Performing train word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('have',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('people',), ('so',), ('some',), ('that',), ('the',), ('there',), ('they',), ('this',), ('to',), ('was',), ('we',), ('what',), ('when',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train word 1-gram in 123.85393977165222 seconds
Returned up to 50 most common word1-grams for feature extraction on test set.
Performing train POS tags 1-grams...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train POS tags 1-gram in 3.667933464050293 seconds
Returned up to 50 most common POS tags 1-grams for feature extraction on test set.
Performing train POS tags 2-grams...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('.', 'RB'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'RB'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'RB'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/1080 [00:00<?, ?it/s]

Performed train POS tags 2-gram in 20.16659665107727 seconds
Returned up to 50 most common POS tags 2-grams for feature extraction on test set.
Performing letter count & proportion...
Performed letter count & proportion in 0.7388501167297363 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.11131525039672852 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 0.2633216381072998 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.07697796821594238 seconds
Performing character count & proportion...
Performed character count & proportion in 0.0057752132415771484 seconds
Performing word count...
Performed word count in 0.01598215103149414 seconds
Performing word length avg...
Performed word length avg in 0.021021604537963867 seconds
Performing word length distribution...
Performed word length distribution in 0.03286552429199219 seconds
Performing word count short..

  0%|          | 0/120 [00:00<?, ?it/s]

Performed misspellings proportion in 134.4947702884674 seconds
Performing stop words ratio...
Performed stop words ratio in 0.41547203063964844 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 0.3055686950683594 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 0.3615293502807617 seconds
Performing token type ratio...
Performed token type ratio in 1.0016300678253174 seconds
Performing test letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test letter 1-gram in 0.30677103996276855 seconds
Performing test letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'el', 'en', 'er', 'es', 'ha', 'he', 'hi', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'ri', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test letter 2-gram in 0.20581889152526855 seconds
Performing test letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'con', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'lly', 'nce', 'not', 'ome', 'one', 'ore', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tio', 'uld', 'use', 'ust', 'ver', 'wit', 'you']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test letter 3-gram in 0.21407389640808105 seconds
Performing test digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test digit 1-gram in 0.33611416816711426 seconds
Performing test digit 2-gram...
['00', '01', '02', '04', '05', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '35', '40', '45', '47', '50', '52', '60', '65', '70', '75', '78', '79', '80', '90', '93', '95', '97', '98', '99']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test digit 2-gram in 0.2661018371582031 seconds
Performing test digit 3-gram...
['000', '001', '010', '013', '014', '015', '016', '017', '018', '019', '020', '021', '022', '080', '090', '100', '102', '112', '120', '125', '130', '147', '150', '160', '166', '170', '180', '190', '195', '197', '198', '199', '200', '201', '202', '210', '220', '250', '300', '320', '350', '400', '500', '600', '700', '707', '716', '800', '900', '950']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test digit 3-gram in 0.2402050495147705 seconds
Performing test punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '’', '“', '”']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test punctuation 1-gram in 0.2797088623046875 seconds
Performing test punctuation 2-gram...
['!!', '!"', '!)', '!<', '""', '")', '",', '".', '"?', '%.', "'.", '("', '(/', '))', '),', ').', '):', '**', '*,', '*.', ',"', '--', '->', '."', '.)', '.*', '.,', '..', '.]', '.”', '/)', '//', ':(', ':)', ':/', ';)', '>!', '?!', '?"', '?)', '??', '?]', '?”', '\\*', '\\_', '](', '__', '~~', '”,', '”.']


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test punctuation 2-gram in 0.24885940551757812 seconds
Performing test word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('have',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('people',), ('so',), ('some',), ('that',), ('the',), ('there',), ('they',), ('this',), ('to',), ('was',), ('we',), ('what',), ('when',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test word 1-gram in 0.2599010467529297 seconds
Performing test POS tags 1-gram...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test POS tags 1-gram in 0.2746999263763428 seconds
Performing test POS tags 2-gram...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('.', 'RB'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'RB'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'RB'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/120 [00:00<?, ?it/s]

Performed test POS tags 2-gram in 0.3152785301208496 seconds


# Train Models

In [8]:
# https://www.baeldung.com/cs/svm-multiclass-classification

In [34]:
# Import train and test data w features. Split out proficiency.
X_train = pd.read_pickle('X_train_w_features.pkl')
train_proficiency = X_train['proficiency']
X_train = X_train.drop(['proficiency'], axis = 1)

X_test = pd.read_pickle('X_test_w_features.pkl')
test_proficiency = X_test['proficiency']
X_test = X_test.drop(['proficiency'], axis = 1)

# Train SVM models
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_train, y_train['author_id'])
poly = svm.SVC(kernel='poly', degree=3, C=1).fit(X_train, y_train['author_id'])
linear = svm.SVC(kernel='linear', degree=3, C=1).fit(X_train, y_train['author_id'])

# Split test into native and nonnative feeds
X_test_all_proficiencies = X_test.merge(test_proficiency, left_index=True, right_index=True, how='inner')

X_test_native = X_test_all_proficiencies.loc[X_test_all_proficiencies['proficiency'] == "N"]
y_test_native = y_test['author_id'].loc[X_test_all_proficiencies['proficiency'] == "N"]
X_test_native = X_test_native.drop(['proficiency'], axis = 1)

X_test_nonnat = X_test_all_proficiencies.loc[X_test_all_proficiencies['proficiency'] == "L"]
y_test_nonnat = y_test['author_id'].loc[X_test_all_proficiencies['proficiency'] == "L"]
X_test_nonnat = X_test_nonnat.drop(['proficiency'], axis = 1)

# Predict native and non-native mode performance
def predict_and_evaluate_svm(model):
    # Predict native and non-native
    pred_all = model.predict(X_test)
    pred_native = model.predict(X_test_native)
    pred_nonnat = model.predict(X_test_nonnat)

    accuracy_all = accuracy_score(y_test['author_id'], pred_all)
    accuracy_native = accuracy_score(y_test_native, pred_native)
    accuracy_nonnat = accuracy_score(y_test_nonnat, pred_nonnat)
    f1_all = f1_score(y_test['author_id'], pred_all, average='weighted')
    f1_native = f1_score(y_test_native, pred_native, average='weighted')
    f1_nonnat = f1_score(y_test_nonnat, pred_nonnat, average='weighted')
    
    print('Results for SVM with ' + model.kernel + ' Kernel:')
    print(' ')
    print('- Accuracy')
    print('-- Overall: ', "%.2f" % (accuracy_all*100))
    print('-- Native:', "%.2f" % (accuracy_native*100))
    print('-- Non-Native:', "%.2f" % (accuracy_nonnat*100))
    print(' ')
    print('- F1 Score')
    print('-- Overall:', "%.2f" % (f1_all*100))
    print('-- Native:', "%.2f" % (f1_native*100))
    print('-- Non-Native:', "%.2f" % (f1_nonnat*100))
    print(' ')
    print(' ')
    print('Results are for 15 native + 15 non-native english authors. Model trained on 1,080 feeds and tested on 120 feeds.') 
    print(' ')
    print(' ')
    print(' ')
    print(' ')
    
predict_and_evaluate_svm(linear)
predict_and_evaluate_svm(poly)
predict_and_evaluate_svm(rbf)

Results for SVM with linear Kernel:
 
- Accuracy
-- Overall:  60.00
-- Native: 66.67
-- Non-Native: 51.85
 
- F1 Score
-- Overall: 59.03
-- Native: 70.25
-- Non-Native: 56.49
 
 
Results are for 15 native + 15 non-native english authors. Model trained on 1,080 feeds and tested on 120 feeds.
 
 
 
 
Results for SVM with poly Kernel:
 
- Accuracy
-- Overall:  2.50
-- Native: 4.55
-- Non-Native: 0.00
 
- F1 Score
-- Overall: 1.45
-- Native: 2.84
-- Non-Native: 0.00
 
 
Results are for 15 native + 15 non-native english authors. Model trained on 1,080 feeds and tested on 120 feeds.
 
 
 
 
Results for SVM with rbf Kernel:
 
- Accuracy
-- Overall:  1.67
-- Native: 0.00
-- Non-Native: 3.70
 
- F1 Score
-- Overall: 0.35
-- Native: 0.00
-- Non-Native: 1.30
 
 
Results are for 15 native + 15 non-native english authors. Model trained on 1,080 feeds and tested on 120 feeds.
 
 
 
 
