# Sample Authors and Prepare Feeds for Feature Extraction

In [1]:
# May need to install packages seen in feature_extraction_functions
from feature_extraction_functions import *
team_seed = 13 + 4 + 5
random.seed(team_seed) 

# Space-split, long-format version of feeds (Arthur's version + some cleaning and wide-to-long conversion)
eng_native = pd.read_pickle('Data/Classified/native_english_40feeds')
eng_nonnat = pd.read_pickle('Data/Classified/non_native_english_40feeds')
eng_native['proficiency'] = "N" # native
eng_nonnat['proficiency'] = "L" # learner

# Three cohorts will be created based on development split below.
# Cohort 1: 15 Native + 15 Non-Native
# Cohort 2: 15 Native from Cohort 1 + another random 15 native
# Cohort 3: 15 Non-Native from Cohort 2 + another random 15 non-native
# Therefore, we need 30 native and 30 non-native authors for the development cohorts
num_native_authors_to_sample = 30
num_nonnat_authors_to_sample = 30
eng_native_sample = eng_native.sample(num_native_authors_to_sample, random_state=team_seed)
eng_nonnat_sample = eng_nonnat.sample(num_nonnat_authors_to_sample, random_state=team_seed)

### ARTHUR REMOVE THIS ONCE YOU LIMIT TO 20 FEEDS PER AUTHOR ###
eng_native_sample = eng_native_sample[['timerange', 'file1', 'slices1', 'file2', 'slices2', 'file3', 'slices3',
       'file4', 'slices4', 'file5', 'slices5', 'file6', 'slices6', 'file7',
       'slices7', 'file8', 'slices8', 'file9', 'slices9', 'file10', 'slices10',
       'file11', 'slices11', 'file12', 'slices12', 'file13', 'slices13',
       'file14', 'slices14', 'file15', 'slices15', 'file16', 'slices16',
       'file17', 'slices17', 'file18', 'slices18', 'file19', 'slices19',
       'file20', 'slices20', 'proficiency']]

eng_nonnat_sample = eng_nonnat_sample[['timerange', 'file1', 'slices1', 'file2', 'slices2', 'file3', 'slices3',
       'file4', 'slices4', 'file5', 'slices5', 'file6', 'slices6', 'file7',
       'slices7', 'file8', 'slices8', 'file9', 'slices9', 'file10', 'slices10',
       'file11', 'slices11', 'file12', 'slices12', 'file13', 'slices13',
       'file14', 'slices14', 'file15', 'slices15', 'file16', 'slices16',
       'file17', 'slices17', 'file18', 'slices18', 'file19', 'slices19',
       'file20', 'slices20', 'proficiency']]

# Before splitting into cohorts, perform all pre-feature-extraction processing
eng_feeds = pd.concat([eng_nonnat_sample, eng_native_sample], ignore_index=False, axis=0) # 
eng_feeds['author'] = eng_feeds.index
eng_feeds = pd.wide_to_long(eng_feeds, ["file", "slices"], i="author", j="intra_author_feed_id").sort_index()
eng_feeds = eng_feeds.rename(columns={"slices": "comment_lengths", "file": "feed_tokens_space"})

# Raw string version of feeds
def feed_string(feed):
    return ' '.join(feed)
eng_feeds['feed_string'] = eng_feeds['feed_tokens_space'].apply(feed_string)

# List-of-comments version of feeds
## Doing this via loop because two columns involved in function instead of one...
def create_comment_word_indices(comment_lengths):
    np_comment_lengths = np.array(comment_lengths)
    return np.cumsum(np_comment_lengths)
eng_feeds['comment_word_indices'] = eng_feeds['comment_lengths'].apply(create_comment_word_indices)

eng_feeds['feed_comment_list'] = ""
for index, row in eng_feeds.iterrows():
    comm_w_indices_temp = row['comment_word_indices']
    feed_tokens_space_temp = row['feed_tokens_space']
    inner_list = []
    for i in range(len(comm_w_indices_temp)):
        if i == 0:
            inner_list.append(feed_tokens_space_temp[0:comm_w_indices_temp[i]])
        else:
            inner_list.append(feed_tokens_space_temp[comm_w_indices_temp[i-1]:comm_w_indices_temp[i]])
    eng_feeds.at[index,'feed_comment_list'] = inner_list
eng_feeds = eng_feeds.drop('comment_word_indices', axis=1)

# List-of-comments w/ punctuation stripped and lowercase applied version of feeds
def strip_punc_and_lower_nested_list(feed_comment_list):
    feed_comment_list_nopunc_lower = []
    for comment in feed_comment_list:
        feed_comment_list_nopunc_lower.append(re.sub(r'[^A-Za-z0-9 ]+', '', ' '.join(comment)).lower().split()) 
    return feed_comment_list_nopunc_lower
eng_feeds['feed_comment_list_nopunc_lower'] = eng_feeds['feed_comment_list'].apply(strip_punc_and_lower_nested_list)

# List-of-comments Spacy-tokenized version of feeds
tokenizer_wrapper(eng_feeds, 'feed_comment_list')

eng_feeds.to_pickle("eng_development_feeds_pre_split.pkl")

eng_feeds.head(5)

ModuleNotFoundError: No module named 'nltk'

# Encode author and proficiency levels, Split into Three Development Cohorts

In [None]:
from feature_extraction_functions import *
from random import sample
team_seed = 13 + 4 + 5
random.seed(team_seed) 

eng_feeds = pd.read_pickle("eng_development_feeds_pre_split.pkl")

# Encode author and proficiency as numbers
labelencoder = LabelEncoder()
eng_feeds = eng_feeds.reset_index()
t = eng_feeds['author']
t = labelencoder.fit_transform(t)
eng_feeds['author_id'] = t.tolist()
t = eng_feeds['proficiency']
t = labelencoder.fit_transform(t)
eng_feeds['proficiency_id'] = t.tolist()

[nltk_data] Downloading package wordnet to /home/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2022-12-08 13:05:52.707667: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-08 13:05:59.534712: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-08 13:05:59.534753: I tensorflow/compiler/xla/stream_execu

In [None]:
# Split 60 authors into three cohorts
# Cohort 1: 15 native + 15 non-native
native_authors = list(set(eng_feeds[eng_feeds['proficiency'] == "N"]['author_id'].values))
nonnat_authors = list(set(eng_feeds[eng_feeds['proficiency'] == "L"]['author_id'].values))
all_cohort_native_subset = sample(native_authors, int(num_native_authors_to_sample / 2))
all_cohort_nonnat_subset = sample(nonnat_authors, int(num_nonnat_authors_to_sample / 2))
cohort_all = pd.concat([eng_feeds[eng_feeds['author_id'].isin(all_cohort_native_subset)], eng_feeds[eng_feeds['author_id'].isin(all_cohort_nonnat_subset)]], ignore_index=False, axis=0)
# Cohort 2: 30 native
cohort_native = eng_feeds[eng_feeds['proficiency'] == "N"]
# Cohort 3: 30 non-native
cohort_nonnat = eng_feeds[eng_feeds['proficiency'] == "L"]

# Extract Features

In [None]:
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None

team_seed = 13 + 4 + 5

def extract_features(cohort, filetag):
    
    y = cohort[['author_id', 'intra_author_feed_id']]
    X_train, X_test, y_train, y_test = train_test_split(cohort, y, test_size=0.10, stratify = y['author_id'], random_state=team_seed)

    for stage in ["train", "test"]:
        if stage == "train":
            feeds_aug = X_train
        elif stage == "test":
            feeds_aug = X_test
            
        # Number of Comments, Median comment length

        feeds_aug['num_comments'] = feeds_aug['comment_lengths'].apply(len)
        feeds_aug['comment_length_median'] = feeds_aug['comment_lengths'].apply(statistics.median)
        feeds_aug = feeds_aug.drop('comment_lengths', axis=1)

        # Character Count, Alphabet Count & Proportion, Digit Count & Proportion, Punctuation Count & Proportion

        feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'letter')
        feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'digit')
        feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'punctuation')
        feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'whitespace')
        feeds_aug = character_count_proportion_wrapper(feeds_aug, 'feed_string', 'character')
    
        # Word Count, Average Word Length, Word Length Distribution (Freq of words length 1-20 letters), Word Case Distribution (All lowercase / First-upper-rest-lowercase / All uppercase / Other), Character case distribution (lowercase / uppercase)

        word_count_wrapper(feeds_aug, 'feed_string')
        word_length_avg_wrapper(feeds_aug, 'feed_string')
        word_length_distribution_wrapper(feeds_aug, 'feed_string')
        word_short_prop_wrapper(feeds_aug, 'word_length_distribution', 'word_count')
        letter_case_distribution_wrapper(feeds_aug, 'feed_string')
        word_case_distribution_wrapper(feeds_aug, 'feed_string')

        # Misspellings Prop

        misspelled_prop_wrapper(feeds_aug, 'feed_string', 'misspelled_prop')
    
        #Stop Word proportion of Tokens
        stop_words_proportion_wrapper(feeds_aug, 'feed_comment_list_spacy', 'stop_words_proportion')
    
        # Vocabulary Richness: Hapax Legomena Proportion of Total Tokens, Hapax Legomena Proportion of Unique Tokens, Unique Tokens over Total Tokens
        # # https://eprints.qut.edu.au/8019/1/8019.pdf
    
        hapax_legomena_proportion_wrapper(feeds_aug, 'feed_comment_list_spacy', 'hapax_legomena_prop_tot_tokens', 'total') # Note: ignores stop words
        hapax_legomena_proportion_wrapper(feeds_aug, 'feed_comment_list_spacy', 'hapax_legomena_prop_unique_tokens', 'unique') # Note: ignores stop words
        token_type_ratio_wrapper(feeds_aug, 'feed_comment_list_spacy', 'token_type_ratio') # Note: ignores stop words

        if stage == "train":
        
            global letter_1gram_collection_fromtrain
            global letter_2gram_collection_fromtrain
            global letter_3gram_collection_fromtrain
            global letter_4gram_collection_fromtrain
            global digit_1gram_collection_fromtrain
            global digit_2gram_collection_fromtrain
            global digit_3gram_collection_fromtrain
            global punctuation_1gram_collection_fromtrain
            global punctuation_2gram_collection_fromtrain
            global punctuation_3gram_collection_fromtrain
            global word_1gram_collection_fromtrain
            global word_2gram_collection_fromtrain
            global POS_tags_1gram_collection_fromtrain
            global POS_tags_2gram_collection_fromtrain
            global POS_tags_3gram_collection_fromtrain
        
            # Letter, Digit, and Punctuation n-grams
            letter_1gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_1gram', 1, 50, 'letter')
            letter_2gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_2gram', 2, 50, 'letter')
            letter_3gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_3gram', 3, 50, 'letter')
            letter_4gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_4gram', 4, 50, 'letter')

            digit_1gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_1gram', 1, 50, 'digit')
            #digit_2gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_2gram', 2, 50, 'digit')
            #digit_3gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_3gram', 3, 50, 'digit')

            punctuation_1gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_1gram', 1, 50, 'punctuation')
            punctuation_2gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_2gram', 2, 50, 'punctuation')
            #punctuation_3gram_collection_fromtrain = character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_3gram', 3, 50, 'punctuation')

            # Word ngrams
            word_1gram_collection_fromtrain = word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 1, 50)
            word_2gram_collection_fromtrain = word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 2, 50)

            # POS n-grams
            POS_tags_1gram_collection_fromtrain = POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_1gram', 1, 50)
            POS_tags_2gram_collection_fromtrain = POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_2gram', 2, 50)
            #POS_tags_3gram_collection_fromtrain = POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_3gram', 3, 50)

        elif stage == "test":
        
            # Letter, Digit, and Punctuation n-grams
            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_1gram', 1, 50, 'letter', letter_1gram_collection_fromtrain)
            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_2gram', 2, 50, 'letter', letter_2gram_collection_fromtrain)
            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_3gram', 3, 50, 'letter', letter_3gram_collection_fromtrain)
            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'letter_4gram', 4, 50, 'letter', letter_4gram_collection_fromtrain)

            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_1gram', 1, 50, 'digit', digit_1gram_collection_fromtrain)
            #character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_2gram', 2, 50, 'digit', digit_2gram_collection_fromtrain)
            #character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'digit_3gram', 3, 50, 'digit', digit_3gram_collection_fromtrain)

            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_1gram', 1, 50, 'punctuation', punctuation_1gram_collection_fromtrain)
            character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_2gram', 2, 50, 'punctuation', punctuation_2gram_collection_fromtrain)
            #character_ngrams_wrapper(feeds_aug, 'feed_tokens_space', 'punctuation_3gram', 3, 50, 'punctuation', punctuation_3gram_collection_fromtrain)

            # Word ngrams
            word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 1, 50, word_1gram_collection_fromtrain)
            word_ngrams_wrapper(feeds_aug, 'feed_comment_list_nopunc_lower', 'word_1gram', 2, 50, word_2gram_collection_fromtrain)

            # POS n-grams
            POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_1gram', 1, 50, POS_tags_1gram_collection_fromtrain)
            POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_2gram', 2, 50, POS_tags_2gram_collection_fromtrain)
            #POS_tags_ngram_wrapper(feeds_aug, 'feed_comment_list_spacy', 'POS_tag_3gram', 3, 50, POS_tags_3gram_collection_fromtrain)

        # IMPORTANT: If any features are commented-in above, they must be added to the feature list in this next line
        feeds_aug = feeds_aug[['proficiency', 'comment_length_median', 'letter_prop', 'digit_prop', 'punctuation_prop', 'whitespace_prop', 'word_length_avg', 'word_length_distribution', 'word_short_prop', 'letter_case_distribution', 'word_case_distribution', 'misspelled_prop', 'stop_words_proportion', 'hapax_legomena_prop_tot_tokens', 'hapax_legomena_prop_unique_tokens', 'token_type_ratio', 'letter_1gram', 'letter_2gram', 'letter_3gram', 'letter_4gram', 'digit_1gram', 'punctuation_1gram', 'punctuation_2gram', 'word_1gram', 'word_2gram', 'POS_tag_1gram', 'POS_tag_2gram']]
        for col in feeds_aug.columns:
            if type(feeds_aug[col].iloc[0]) == list:
                newcols = [col + "_" + str(i) for i in range(1,len(feeds_aug[col].iloc[0]) + 1)]
                feeds_aug[newcols] = pd.DataFrame(feeds_aug[col].tolist(), index= feeds_aug.index)
                feeds_aug = feeds_aug.drop([col], axis = 1)
            elif type(feeds_aug[col].iloc[0]) == str and col != 'proficiency':
                feeds_aug[col] = pd.to_numeric(feeds_aug[col], downcast="float")
            
        if stage == "train":
            X_train = feeds_aug
        elif stage == "test":
            X_test = feeds_aug
        
    X_train.to_pickle("dev_" + filetag + "_X_train.pkl")
    X_test.to_pickle("dev_" + filetag + "_X_test.pkl")
    y_train.to_pickle("dev_" + filetag + "_y_train.pkl")
    y_test.to_pickle("dev_" + filetag + "_y_test.pkl")

extract_features(cohort_all, "cohort_all")
extract_features(cohort_native, "cohort_native")
extract_features(cohort_nonnat, "cohort_nonnat")

Performing letter count & proportion...
Performed letter count & proportion in 0.9313466548919678 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.7103791236877441 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 1.6114833354949951 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.6121082305908203 seconds
Performing character count & proportion...
Performed character count & proportion in 0.03362536430358887 seconds
Performing word count...
Performed word count in 0.1242973804473877 seconds
Performing word length avg...
Performed word length avg in 0.1710972785949707 seconds
Performing word length distribution...
Performed word length distribution in 0.27648448944091797 seconds
Performing word count short...
Performed word count short in 0.0027611255645751953 seconds
Performing letter case distribution...
Performed letter case distribution in 0.3502538204

  0%|          | 0/540 [00:00<?, ?it/s]

Performed misspellings proportion in 1195.3415365219116 seconds
Performing stop words ratio...
Performed stop words ratio in 1.826256275177002 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 1.915389060974121 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 1.815678358078003 seconds
Performing token type ratio...
Performed token type ratio in 2.9435532093048096 seconds
Performing train letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 1-gram in 3.4877285957336426 seconds
Returned up to 50 most common letter 1-grams for feature extraction on test set.
Performing train letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'el', 'en', 'er', 'es', 'ha', 'he', 'hi', 'ho', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 2-gram in 10.574246406555176 seconds
Returned up to 50 most common letter 2-grams for feature extraction on test set.
Performing train letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'con', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'lly', 'nce', 'not', 'ome', 'one', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tin', 'tio', 'uld', 'use', 'ust', 'ver', 'wit', 'you']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 3-gram in 177.62541508674622 seconds
Returned up to 50 most common letter 3-grams for feature extraction on test set.
Performing train digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 1-gram in 2.6545698642730713 seconds
Returned up to 50 most common digit 1-grams for feature extraction on test set.
Performing train digit 2-gram...
['00', '01', '02', '03', '04', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '30', '31', '32', '33', '35', '36', '40', '41', '45', '49', '50', '52', '55', '60', '61', '65', '66', '68', '70', '71', '79', '80', '88', '90', '95', '96', '99']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 2-gram in 4.833482980728149 seconds
Returned up to 50 most common digit 2-grams for feature extraction on test set.
Performing train digit 3-gram...
['000', '001', '008', '013', '014', '016', '018', '019', '020', '021', '022', '045', '090', '100', '112', '120', '121', '122', '124', '125', '130', '150', '156', '166', '168', '180', '195', '199', '200', '201', '202', '212', '220', '241', '250', '255', '300', '330', '400', '415', '500', '600', '613', '647', '680', '700', '716', '800', '804', '900']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 3-gram in 20.73089075088501 seconds
Returned up to 50 most common digit 3-grams for feature extraction on test set.
Performing train punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '}', '~', '’', '“', '”']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train punctuation 1-gram in 3.4987194538116455 seconds
Returned up to 50 most common punctuation 1-grams for feature extraction on test set.
Performing train punctuation 2-gram...
['!!', '!"', '!)', '")', '",', '".', '"?', '%.', '&#', "',", "'.", '(*', '(+', '(/', '))', '),', ').', ');', '*)', '**', '*,', '*.', ',"', '-)', '--', '."', '.)', '.*', '.,', '..', '.]', '.”', '/)', '//', '/?', '/]', ':(', ':)', ':-', ':/', '?!', '?"', '?)', '??', '\\*', '\\-', '\\[', '\\]', '\\_', '](']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train punctuation 2-gram in 32.301445722579956 seconds
Returned up to 50 most common punctuation 2-grams for feature extraction on test set.
Performing train word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('get',), ('have',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('people',), ('so',), ('that',), ('the',), ('there',), ('they',), ('think',), ('this',), ('to',), ('was',), ('what',), ('when',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train word 1-gram in 69.33535146713257 seconds
Returned up to 50 most common word1-grams for feature extraction on test set.
Performing train POS tags 1-grams...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train POS tags 1-gram in 1.348522424697876 seconds
Returned up to 50 most common POS tags 1-grams for feature extraction on test set.
Performing train POS tags 2-grams...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('IN', 'VBG'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'RB'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'RB'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train POS tags 2-gram in 9.31908917427063 seconds
Returned up to 50 most common POS tags 2-grams for feature extraction on test set.
Performing letter count & proportion...
Performed letter count & proportion in 0.11255812644958496 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.08694267272949219 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 0.16901087760925293 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.06548333168029785 seconds
Performing character count & proportion...
Performed character count & proportion in 0.008797883987426758 seconds
Performing word count...
Performed word count in 0.011553764343261719 seconds
Performing word length avg...
Performed word length avg in 0.018317461013793945 seconds
Performing word length distribution...
Performed word length distribution in 0.029773712158203125 seconds
Performing word count short

  0%|          | 0/60 [00:00<?, ?it/s]

Performed misspellings proportion in 130.78419303894043 seconds
Performing stop words ratio...
Performed stop words ratio in 0.21990680694580078 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 0.18541932106018066 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 0.22136473655700684 seconds
Performing token type ratio...
Performed token type ratio in 1.2757995128631592 seconds
Performing test letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 1-gram in 0.19907712936401367 seconds
Performing test letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'el', 'en', 'er', 'es', 'ha', 'he', 'hi', 'ho', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 2-gram in 0.2587268352508545 seconds
Performing test letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'con', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'lly', 'nce', 'not', 'ome', 'one', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tin', 'tio', 'uld', 'use', 'ust', 'ver', 'wit', 'you']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 3-gram in 0.22792768478393555 seconds
Performing test digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 1-gram in 0.26302099227905273 seconds
Performing test digit 2-gram...
['00', '01', '02', '03', '04', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '30', '31', '32', '33', '35', '36', '40', '41', '45', '49', '50', '52', '55', '60', '61', '65', '66', '68', '70', '71', '79', '80', '88', '90', '95', '96', '99']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 2-gram in 0.22809767723083496 seconds
Performing test digit 3-gram...
['000', '001', '008', '013', '014', '016', '018', '019', '020', '021', '022', '045', '090', '100', '112', '120', '121', '122', '124', '125', '130', '150', '156', '166', '168', '180', '195', '199', '200', '201', '202', '212', '220', '241', '250', '255', '300', '330', '400', '415', '500', '600', '613', '647', '680', '700', '716', '800', '804', '900']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 3-gram in 0.20425820350646973 seconds
Performing test punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '}', '~', '’', '“', '”']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test punctuation 1-gram in 0.2701423168182373 seconds
Performing test punctuation 2-gram...
['!!', '!"', '!)', '")', '",', '".', '"?', '%.', '&#', "',", "'.", '(*', '(+', '(/', '))', '),', ').', ');', '*)', '**', '*,', '*.', ',"', '-)', '--', '."', '.)', '.*', '.,', '..', '.]', '.”', '/)', '//', '/?', '/]', ':(', ':)', ':-', ':/', '?!', '?"', '?)', '??', '\\*', '\\-', '\\[', '\\]', '\\_', '](']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test punctuation 2-gram in 0.22701215744018555 seconds
Performing test word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('get',), ('have',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('people',), ('so',), ('that',), ('the',), ('there',), ('they',), ('think',), ('this',), ('to',), ('was',), ('what',), ('when',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test word 1-gram in 0.27701401710510254 seconds
Performing test POS tags 1-gram...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test POS tags 1-gram in 0.18973064422607422 seconds
Performing test POS tags 2-gram...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('IN', 'VBG'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'RB'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'RB'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test POS tags 2-gram in 0.3223552703857422 seconds
Performing letter count & proportion...
Performed letter count & proportion in 0.8379025459289551 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.7659454345703125 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 1.5017914772033691 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.5957372188568115 seconds
Performing character count & proportion...
Performed character count & proportion in 0.023203372955322266 seconds
Performing word count...
Performed word count in 0.10275959968566895 seconds
Performing word length avg...
Performed word length avg in 0.12980198860168457 seconds
Performing word length distribution...
Performed word length distribution in 0.26637887954711914 seconds
Performing word count short...
Performed word count short in 0.0039446353912353516 seconds
Performing letter case di

  0%|          | 0/540 [00:00<?, ?it/s]

Performed misspellings proportion in 1100.2819290161133 seconds
Performing stop words ratio...
Performed stop words ratio in 1.966287612915039 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 1.745077133178711 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 1.9414589405059814 seconds
Performing token type ratio...
Performed token type ratio in 2.955264091491699 seconds
Performing train letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 1-gram in 3.518671989440918 seconds
Returned up to 50 most common letter 1-grams for feature extraction on test set.
Performing train letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'en', 'er', 'es', 'ha', 'he', 'hi', 'ho', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'ri', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 2-gram in 10.98355507850647 seconds
Returned up to 50 most common letter 2-grams for feature extraction on test set.
Performing train letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'con', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'lik', 'lly', 'not', 'ome', 'one', 'ore', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tin', 'tio', 'uld', 'use', 'ver', 'wit', 'you']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 3-gram in 182.03345775604248 seconds
Returned up to 50 most common letter 3-grams for feature extraction on test set.
Performing train digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 1-gram in 2.52227520942688 seconds
Returned up to 50 most common digit 1-grams for feature extraction on test set.
Performing train digit 2-gram...
['00', '01', '02', '03', '04', '05', '06', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '30', '31', '32', '33', '40', '42', '45', '49', '50', '52', '60', '65', '70', '72', '75', '79', '80', '90', '95', '96', '97', '98', '99']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 2-gram in 4.991602897644043 seconds
Returned up to 50 most common digit 2-grams for feature extraction on test set.
Performing train digit 3-gram...
['000', '001', '006', '010', '013', '014', '015', '016', '017', '018', '019', '020', '021', '022', '045', '080', '090', '100', '112', '120', '125', '147', '149', '150', '168', '180', '190', '195', '197', '198', '199', '200', '201', '202', '250', '300', '320', '400', '471', '500', '512', '600', '680', '700', '716', '731', '800', '804', '900', '979']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 3-gram in 20.148253202438354 seconds
Returned up to 50 most common digit 3-grams for feature extraction on test set.
Performing train punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '’', '“', '”']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train punctuation 1-gram in 4.43937349319458 seconds
Returned up to 50 most common punctuation 1-grams for feature extraction on test set.
Performing train punctuation 2-gram...
['!!', '!"', '!)', '!<', '!?', '!”', '")', '",', '".', '"?', '%.', "'.", '(*', '(/', '))', '),', ').', '*)', '**', '*,', '*.', ',"', ',”', '-)', '--', '->', '."', '.)', '.*', '.,', '..', '.]', '.”', '/)', '//', '/?', '/]', ':)', ':-', ':/', '?!', '?"', '?)', '??', '?”', '\\*', '\\-', '\\_', '](', '~~']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train punctuation 2-gram in 32.49608492851257 seconds
Returned up to 50 most common punctuation 2-grams for feature extraction on test set.
Performing train word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('get',), ('have',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('out',), ('people',), ('so',), ('some',), ('that',), ('the',), ('they',), ('this',), ('to',), ('was',), ('we',), ('what',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train word 1-gram in 64.47618269920349 seconds
Returned up to 50 most common word1-grams for feature extraction on test set.
Performing train POS tags 1-grams...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train POS tags 1-gram in 1.3292150497436523 seconds
Returned up to 50 most common POS tags 1-grams for feature extraction on test set.
Performing train POS tags 2-grams...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('.', 'RB'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'NNS'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'DT'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train POS tags 2-gram in 8.291099071502686 seconds
Returned up to 50 most common POS tags 2-grams for feature extraction on test set.
Performing letter count & proportion...
Performed letter count & proportion in 0.10777640342712402 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.07855224609375 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 0.19692683219909668 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.05801582336425781 seconds
Performing character count & proportion...
Performed character count & proportion in 0.006568431854248047 seconds
Performing word count...
Performed word count in 0.012842416763305664 seconds
Performing word length avg...
Performed word length avg in 0.015816688537597656 seconds
Performing word length distribution...
Performed word length distribution in 0.024096965789794922 seconds
Performing word count short..

  0%|          | 0/60 [00:00<?, ?it/s]

Performed misspellings proportion in 110.7400336265564 seconds
Performing stop words ratio...
Performed stop words ratio in 0.23448944091796875 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 0.24635744094848633 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 0.22994637489318848 seconds
Performing token type ratio...
Performed token type ratio in 1.3524479866027832 seconds
Performing test letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 1-gram in 0.23977947235107422 seconds
Performing test letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'en', 'er', 'es', 'ha', 'he', 'hi', 'ho', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'ri', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 2-gram in 0.24193811416625977 seconds
Performing test letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'con', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'lik', 'lly', 'not', 'ome', 'one', 'ore', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tin', 'tio', 'uld', 'use', 'ver', 'wit', 'you']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 3-gram in 0.1956470012664795 seconds
Performing test digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 1-gram in 0.26711606979370117 seconds
Performing test digit 2-gram...
['00', '01', '02', '03', '04', '05', '06', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '30', '31', '32', '33', '40', '42', '45', '49', '50', '52', '60', '65', '70', '72', '75', '79', '80', '90', '95', '96', '97', '98', '99']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 2-gram in 0.23191046714782715 seconds
Performing test digit 3-gram...
['000', '001', '006', '010', '013', '014', '015', '016', '017', '018', '019', '020', '021', '022', '045', '080', '090', '100', '112', '120', '125', '147', '149', '150', '168', '180', '190', '195', '197', '198', '199', '200', '201', '202', '250', '300', '320', '400', '471', '500', '512', '600', '680', '700', '716', '731', '800', '804', '900', '979']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 3-gram in 0.21552014350891113 seconds
Performing test punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '’', '“', '”']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test punctuation 1-gram in 0.2709927558898926 seconds
Performing test punctuation 2-gram...
['!!', '!"', '!)', '!<', '!?', '!”', '")', '",', '".', '"?', '%.', "'.", '(*', '(/', '))', '),', ').', '*)', '**', '*,', '*.', ',"', ',”', '-)', '--', '->', '."', '.)', '.*', '.,', '..', '.]', '.”', '/)', '//', '/?', '/]', ':)', ':-', ':/', '?!', '?"', '?)', '??', '?”', '\\*', '\\-', '\\_', '](', '~~']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test punctuation 2-gram in 0.23576927185058594 seconds
Performing test word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('get',), ('have',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('out',), ('people',), ('so',), ('some',), ('that',), ('the',), ('they',), ('this',), ('to',), ('was',), ('we',), ('what',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test word 1-gram in 0.2945990562438965 seconds
Performing test POS tags 1-gram...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test POS tags 1-gram in 0.24094891548156738 seconds
Performing test POS tags 2-gram...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('.', 'RB'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'NNS'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'DT'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test POS tags 2-gram in 0.3113884925842285 seconds
Performing letter count & proportion...
Performed letter count & proportion in 0.9183580875396729 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.6570332050323486 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 1.5225162506103516 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.5180864334106445 seconds
Performing character count & proportion...
Performed character count & proportion in 0.026520252227783203 seconds
Performing word count...
Performed word count in 0.09534502029418945 seconds
Performing word length avg...
Performed word length avg in 0.1456148624420166 seconds
Performing word length distribution...
Performed word length distribution in 0.26849365234375 seconds
Performing word count short...
Performed word count short in 0.0021338462829589844 seconds
Performing letter case distri

  0%|          | 0/540 [00:00<?, ?it/s]

Performed misspellings proportion in 1091.658107995987 seconds
Performing stop words ratio...
Performed stop words ratio in 1.8346738815307617 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 1.6661875247955322 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 1.8396039009094238 seconds
Performing token type ratio...
Performed token type ratio in 2.887749671936035 seconds
Performing train letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 1-gram in 3.389281988143921 seconds
Returned up to 50 most common letter 1-grams for feature extraction on test set.
Performing train letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'el', 'en', 'er', 'es', 'ha', 'he', 'hi', 'ho', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 2-gram in 10.321114540100098 seconds
Returned up to 50 most common letter 2-grams for feature extraction on test set.
Performing train letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'jus', 'lik', 'lly', 'nce', 'not', 'ome', 'one', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tio', 'uld', 'use', 'ust', 'ver', 'wit', 'you']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train letter 3-gram in 171.58079552650452 seconds
Returned up to 50 most common letter 3-grams for feature extraction on test set.
Performing train digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 1-gram in 2.673332691192627 seconds
Returned up to 50 most common digit 1-grams for feature extraction on test set.
Performing train digit 2-gram...
['00', '01', '02', '03', '07', '08', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '27', '28', '30', '31', '32', '33', '35', '36', '38', '40', '41', '44', '45', '50', '55', '60', '65', '66', '68', '70', '72', '75', '77', '80', '90', '93', '96', '99']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 2-gram in 4.678945779800415 seconds
Returned up to 50 most common digit 2-grams for feature extraction on test set.
Performing train digit 3-gram...
['000', '001', '008', '010', '013', '016', '019', '020', '021', '022', '096', '100', '101', '110', '120', '121', '122', '124', '130', '144', '150', '155', '156', '160', '166', '169', '180', '190', '200', '201', '202', '209', '212', '241', '255', '300', '364', '400', '415', '444', '477', '500', '550', '600', '613', '620', '647', '700', '800', '900']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train digit 3-gram in 21.024398803710938 seconds
Returned up to 50 most common digit 3-grams for feature extraction on test set.
Performing train punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '’', '“', '”']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train punctuation 1-gram in 3.2264230251312256 seconds
Returned up to 50 most common punctuation 1-grams for feature extraction on test set.
Performing train punctuation 2-gram...
['!!', '!"', '!)', '!<', '!”', '")', '"*', '",', '".', '"?', '%.', '&#', "',", "'.", '))', '),', ').', ');', '*"', '*)', '**', '*,', '*.', ',"', '."', '.)', '.*', '..', '.]', '.”', '/)', '//', '/?', ':(', ':)', ':/', '>!', '?!', '?"', '?)', '??', '\\*', '\\-', '\\[', '\\]', '\\_', '](', '__', '”,', '”.']


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train punctuation 2-gram in 31.60771608352661 seconds
Returned up to 50 most common punctuation 2-grams for feature extraction on test set.
Performing train word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('because',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('have',), ('he',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('people',), ('so',), ('that',), ('the',), ('they',), ('think',), ('this',), ('to',), ('was',), ('what',), ('when',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train word 1-gram in 78.48701810836792 seconds
Returned up to 50 most common word1-grams for feature extraction on test set.
Performing train POS tags 1-grams...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train POS tags 1-gram in 1.565347671508789 seconds
Returned up to 50 most common POS tags 1-grams for feature extraction on test set.
Performing train POS tags 2-grams...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'RB'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'RB'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'DT'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/540 [00:00<?, ?it/s]

Performed train POS tags 2-gram in 9.770082473754883 seconds
Returned up to 50 most common POS tags 2-grams for feature extraction on test set.
Performing letter count & proportion...
Performed letter count & proportion in 0.12450480461120605 seconds
Performing digit count & proportion...
Performed digit count & proportion in 0.07952022552490234 seconds
Performing punctuation count & proportion...
Performed punctuation count & proportion in 0.15365338325500488 seconds
Performing whitespace count & proportion...
Performed whitespace count & proportion in 0.05900430679321289 seconds
Performing character count & proportion...
Performed character count & proportion in 0.006491661071777344 seconds
Performing word count...
Performed word count in 0.010852575302124023 seconds
Performing word length avg...
Performed word length avg in 0.01452183723449707 seconds
Performing word length distribution...
Performed word length distribution in 0.028422832489013672 seconds
Performing word count short

  0%|          | 0/60 [00:00<?, ?it/s]

Performed misspellings proportion in 144.77538108825684 seconds
Performing stop words ratio...
Performed stop words ratio in 0.2167818546295166 seconds
Performing hapax legomena proportion of total tokens...
Performed hapax legomena proportion of total tokens in 0.20307278633117676 seconds
Performing hapax legomena proportion of unique tokens...
Performed hapax legomena proportion of unique tokens in 0.17252731323242188 seconds
Performing token type ratio...
Performed token type ratio in 1.1249175071716309 seconds
Performing test letter 1-gram...
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 1-gram in 0.2111964225769043 seconds
Performing test letter 2-gram...
['al', 'an', 'ar', 'as', 'at', 'be', 'ca', 'co', 'de', 'ea', 'ed', 'el', 'en', 'er', 'es', 'ha', 'he', 'hi', 'ho', 'in', 'is', 'it', 'le', 'li', 'll', 'ly', 'me', 'nd', 'ne', 'ng', 'no', 'nt', 'of', 'om', 'on', 'or', 'ot', 'ou', 're', 'se', 'so', 'st', 'te', 'th', 'ti', 'to', 'us', 'ut', 've', 'yo']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 2-gram in 0.23292851448059082 seconds
Performing test letter 3-gram...
['all', 'and', 'are', 'ate', 'ati', 'ave', 'but', 'can', 'com', 'ear', 'ent', 'ere', 'ers', 'eve', 'for', 'hat', 'hav', 'her', 'hey', 'hin', 'his', 'ike', 'ing', 'ion', 'ith', 'ive', 'jus', 'lik', 'lly', 'nce', 'not', 'ome', 'one', 'oul', 'our', 'out', 'ple', 'rea', 'som', 'ter', 'tha', 'the', 'thi', 'tio', 'uld', 'use', 'ust', 'ver', 'wit', 'you']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test letter 3-gram in 0.20315051078796387 seconds
Performing test digit 1-gram...
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 1-gram in 0.2618598937988281 seconds
Performing test digit 2-gram...
['00', '01', '02', '03', '07', '08', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '27', '28', '30', '31', '32', '33', '35', '36', '38', '40', '41', '44', '45', '50', '55', '60', '65', '66', '68', '70', '72', '75', '77', '80', '90', '93', '96', '99']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 2-gram in 0.28728270530700684 seconds
Performing test digit 3-gram...
['000', '001', '008', '010', '013', '016', '019', '020', '021', '022', '096', '100', '101', '110', '120', '121', '122', '124', '130', '144', '150', '155', '156', '160', '166', '169', '180', '190', '200', '201', '202', '209', '212', '241', '255', '300', '364', '400', '415', '444', '477', '500', '550', '600', '613', '620', '647', '700', '800', '900']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test digit 3-gram in 0.20129919052124023 seconds
Performing test punctuation 1-gram...
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '’', '“', '”']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test punctuation 1-gram in 0.23924636840820312 seconds
Performing test punctuation 2-gram...
['!!', '!"', '!)', '!<', '!”', '")', '"*', '",', '".', '"?', '%.', '&#', "',", "'.", '))', '),', ').', ');', '*"', '*)', '**', '*,', '*.', ',"', '."', '.)', '.*', '..', '.]', '.”', '/)', '//', '/?', ':(', ':)', ':/', '>!', '?!', '?"', '?)', '??', '\\*', '\\-', '\\[', '\\]', '\\_', '](', '__', '”,', '”.']


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test punctuation 2-gram in 0.21889901161193848 seconds
Performing test word 1-gram...
[('a',), ('about',), ('all',), ('an',), ('and',), ('are',), ('as',), ('at',), ('be',), ('because',), ('but',), ('can',), ('do',), ('dont',), ('for',), ('from',), ('have',), ('he',), ('i',), ('if',), ('im',), ('in',), ('is',), ('it',), ('its',), ('just',), ('like',), ('me',), ('more',), ('my',), ('not',), ('of',), ('on',), ('one',), ('or',), ('people',), ('so',), ('that',), ('the',), ('they',), ('think',), ('this',), ('to',), ('was',), ('what',), ('when',), ('with',), ('would',), ('you',), ('your',)]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test word 1-gram in 0.26030802726745605 seconds
Performing test POS tags 1-gram...
[('$',), ("''",), (',',), ('-LRB-',), ('-RRB-',), ('.',), (':',), ('ADD',), ('AFX',), ('CC',), ('CD',), ('DT',), ('EX',), ('FW',), ('HYPH',), ('IN',), ('JJ',), ('JJR',), ('JJS',), ('LS',), ('MD',), ('NFP',), ('NN',), ('NNP',), ('NNPS',), ('NNS',), ('PDT',), ('POS',), ('PRP',), ('PRP$',), ('RB',), ('RBR',), ('RBS',), ('RP',), ('SYM',), ('TO',), ('UH',), ('VB',), ('VBD',), ('VBG',), ('VBN',), ('VBP',), ('VBZ',), ('WDT',), ('WP',), ('WP$',), ('WRB',), ('XX',), ('_SP',), ('``',)]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test POS tags 1-gram in 0.20082640647888184 seconds
Performing test POS tags 2-gram...
[(',', 'CC'), (',', 'PRP'), ('.', 'PRP'), ('CC', 'PRP'), ('DT', 'JJ'), ('DT', 'NN'), ('DT', 'NNS'), ('IN', 'DT'), ('IN', 'JJ'), ('IN', 'NN'), ('IN', 'NNP'), ('IN', 'NNS'), ('IN', 'PRP'), ('IN', 'PRP$'), ('JJ', '.'), ('JJ', 'IN'), ('JJ', 'NN'), ('JJ', 'NNS'), ('MD', 'RB'), ('MD', 'VB'), ('NN', ','), ('NN', '.'), ('NN', 'CC'), ('NN', 'IN'), ('NN', 'NN'), ('NN', 'RB'), ('NN', 'VBZ'), ('NNP', 'NNP'), ('NNS', '.'), ('NNS', 'IN'), ('PRP', 'MD'), ('PRP', 'RB'), ('PRP', 'VBD'), ('PRP', 'VBP'), ('PRP', 'VBZ'), ('PRP$', 'NN'), ('RB', '.'), ('RB', 'IN'), ('RB', 'JJ'), ('RB', 'RB'), ('RB', 'VB'), ('TO', 'VB'), ('VB', 'DT'), ('VB', 'IN'), ('VB', 'PRP'), ('VBN', 'IN'), ('VBP', 'DT'), ('VBP', 'RB'), ('VBZ', 'DT'), ('VBZ', 'RB')]


  0%|          | 0/60 [00:00<?, ?it/s]

Performed test POS tags 2-gram in 0.32909536361694336 seconds


# Train Models

In [None]:
# https://www.baeldung.com/cs/svm-multiclass-classification

In [None]:
def classify(filetag, kernel):
    # Import train and test data w features. Split out proficiency.
    X_train = pd.read_pickle('dev_' + filetag + '_X_train.pkl')
    train_proficiency = X_train['proficiency']
    X_train = X_train.drop(['proficiency'], axis = 1)

    X_test = pd.read_pickle('dev_' + filetag + '_X_test.pkl')
    test_proficiency = X_test['proficiency']
    X_test = X_test.drop(['proficiency'], axis = 1)
    
    y_train = pd.read_pickle('dev_' + filetag + '_y_train.pkl')
    y_test = pd.read_pickle('dev_' + filetag + '_y_test.pkl')

    # Train SVM models - separate if statements because gamma and C can differ. These numbers simply taken from article; could be tuned.
    if kernel == "rbf":
        model = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_train, y_train['author_id'])
    if kernel == "poly":
        model = svm.SVC(kernel='poly', degree=3, C=1).fit(X_train, y_train['author_id'])
    if kernel == "linear":
        model = svm.SVC(kernel='linear', degree=3, C=1).fit(X_train, y_train['author_id'])
    
    # Split test into native and nonnative feeds (can handle all-native or all-non-native set)
    X_test_all_proficiencies = X_test.merge(test_proficiency, left_index=True, right_index=True, how='inner')
    
    X_test_native = X_test_all_proficiencies.loc[X_test_all_proficiencies['proficiency'] == "N"]
    y_test_native = y_test['author_id'].loc[X_test_all_proficiencies['proficiency'] == "N"]
    X_test_native = X_test_native.drop(['proficiency'], axis = 1)
    
    X_test_nonnat = X_test_all_proficiencies.loc[X_test_all_proficiencies['proficiency'] == "L"]
    y_test_nonnat = y_test['author_id'].loc[X_test_all_proficiencies['proficiency'] == "L"]
    X_test_nonnat = X_test_nonnat.drop(['proficiency'], axis = 1)
    
    # Predict native and non-native
    pred_all = model.predict(X_test)
    accuracy_all = accuracy_score(y_test['author_id'], pred_all)
    f1_all = f1_score(y_test['author_id'], pred_all, average='weighted')
        
    if X_test_native.shape[0] > 0:
        pred_native = model.predict(X_test_native)
        accuracy_native = accuracy_score(y_test_native, pred_native)
        f1_native = f1_score(y_test_native, pred_native, average='weighted')
    else:
        accuracy_native = 0
        f1_native = 0
        
    if X_test_nonnat.shape[0] > 0:
        pred_nonnat = model.predict(X_test_nonnat)
        accuracy_nonnat = accuracy_score(y_test_nonnat, pred_nonnat)
        f1_nonnat = f1_score(y_test_nonnat, pred_nonnat, average='weighted')
    else:
        accuracy_nonnat = 0
        f1_nonnat = 0
            
    print('SVM ' + model.kernel + ' kernel results for ' + filetag + ':')
    print(' ')
    print('- Accuracy')
    print('-- Overall: ', "%.2f" % (accuracy_all*100))
    print('-- Native:', "%.2f" % (accuracy_native*100))
    print('-- Non-Native:', "%.2f" % (accuracy_nonnat*100))
    print(' ')
    print('- F1 Score')
    print('-- Overall:', "%.2f" % (f1_all*100))
    print('-- Native:', "%.2f" % (f1_native*100))
    print('-- Non-Native:', "%.2f" % (f1_nonnat*100))
    print(' ')
    print(' ')
    print('Results are for ' + str(len(set(y_test_native))) + ' native ' + str(len(set(y_test_nonnat))) + ' non-native english authors. Model trained on ' + str(train_proficiency.shape[0]) + ' feeds and tested on ' + str(test_proficiency.shape[0]) + ' feeds.') 
    print(' ')
    print(' ')
    print(' ')
    print(' ')
    
classify("cohort_all", "linear")
classify("cohort_native", "linear")
classify("cohort_nonnat", "linear")

#classify("cohort_all", "rbf")
#classify("cohort_native", "rbf")
#classify("cohort_nonnat", "rbf")

#classify("cohort_all", "poly")
#classify("cohort_native", "poly")
#classify("cohort_nonnat", "poly")

SVM linear kernel results for cohort_all:
 
- Accuracy
-- Overall:  50.00
-- Native: 46.67
-- Non-Native: 53.33
 
- F1 Score
-- Overall: 47.73
-- Native: 51.33
-- Non-Native: 58.89
 
 
Results are for 15 native 15 non-native english authors. Model trained on 540 feeds and tested on 60 feeds.
 
 
 
 
SVM linear kernel results for cohort_native:
 
- Accuracy
-- Overall:  55.00
-- Native: 55.00
-- Non-Native: 0.00
 
- F1 Score
-- Overall: 49.02
-- Native: 49.02
-- Non-Native: 0.00
 
 
Results are for 30 native 0 non-native english authors. Model trained on 540 feeds and tested on 60 feeds.
 
 
 
 
SVM linear kernel results for cohort_nonnat:
 
- Accuracy
-- Overall:  48.33
-- Native: 0.00
-- Non-Native: 48.33
 
- F1 Score
-- Overall: 46.56
-- Native: 0.00
-- Non-Native: 46.56
 
 
Results are for 0 native 30 non-native english authors. Model trained on 540 feeds and tested on 60 feeds.
 
 
 
 
