In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk import Tree
from nltk.tag import pos_tag
from nltk.tree.prettyprinter import TreePrettyPrinter
import stanza
import re
from tqdm import tqdm
from collections import Counter
import spacy
import lftk
from nltk.corpus import brown
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import ngrams
from language_tool_python import LanguageTool
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.probability import FreqDist
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords
# Fetch the NLTK data packages by name; nltk.download is called once per package.
# NOTE(review): presumably these back the tokenizers, POS tagger, lemmatizer and
# Brown corpus reader imported above — confirm which packages are actually needed
# ('stopwords', 'tagsets' and 'universal_tagset' are not visibly used in this cell).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('tagsets')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [5]:
def load_data(path):
    """Read the twelve prompt CSV files and return them as a 12-tuple.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly in front of each file name
        (``prompt_1.csv`` ... ``prompt_12.csv``). No separator is inserted,
        so a directory prefix must end with one; ``""`` reads from the
        current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        The frames for prompt 1 through prompt 12, in that order.
    """
    prompt_numbers = range(1, 13)
    return tuple(
        pd.read_csv(path + 'prompt_' + str(number) + '.csv')
        for number in prompt_numbers
    )

def tree_height(root):
    """Return the number of nodes on the longest root-to-leaf path of a tree.

    ``root`` only needs a ``children`` attribute that can be iterated; a node
    without children has height 1.
    """
    child_heights = [tree_height(child) for child in root.children]
    # default=0 covers the leaf case, where there are no child heights to compare.
    return 1 + max(child_heights, default=0)

# avg length of words, avg length of sentences, type-token-ratio, formality
def get_features_one(data):
    """Compute surface lexical features and a formality score for each essay.

    Every text in ``data['Text']`` is parsed with spaCy's ``en_core_web_sm``
    model and handed to LFTK, which returns the word/sentence totals and the
    part-of-speech counts used below.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Text'`` column with one essay per row.

    Returns
    -------
    pandas.DataFrame
        One row per essay containing LFTK's ``a_word_ps``, ``a_char_pw`` and
        ``simp_ttr`` features plus the derived ``formality`` column. The raw
        counts that are only needed to derive ``formality`` are dropped.
    """
    nlp = spacy.load("en_core_web_sm")
    docs = [nlp(text) for text in tqdm(data['Text'].tolist())]

    LFTK = lftk.Extractor(docs=docs)
    extracted_features = LFTK.extract(features = ['t_word', 't_sent', 'a_word_ps', 'a_char_pw', 'simp_ttr', 'n_noun', 'n_adj', 'n_pron', 'n_det', 'n_adp', 'n_verb', 'n_adv', 'n_intj'])
    features = pd.DataFrame(extracted_features)

    # Formality F-score (Heylighen & Dewaele):
    #   F = (noun% + adjective% + preposition% + article%
    #        - pronoun% - verb% - adverb% - interjection% + 100) / 2
    # where each term is the category's share of all words, in percent.
    # Adpositions stand in for prepositions and determiners for articles.
    # The previous expression added n_adj twice, subtracted n_adp and never
    # used n_pron; it also used fractions instead of percentages, which left
    # the score squeezed into a narrow band around 50.
    formal_count = features['n_noun'] + features['n_adj'] + features['n_adp'] + features['n_det']
    deictic_count = features['n_pron'] + features['n_verb'] + features['n_adv'] + features['n_intj']
    features['formality'] = ((formal_count - deictic_count) / features['t_word'] * 100 + 100) / 2

    return features.drop(columns=['n_noun', 'n_adj', 'n_pron', 'n_det', 'n_adp', 'n_verb', 'n_adv', 'n_intj', 't_word', 't_sent'])

# # of commas, # of apostrophe, # of period marks, # of exclamation marks, # of question marks
def get_features_two(data):
    """Count five punctuation marks in every essay.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Text'`` column with one essay (``str``) per row.

    Returns
    -------
    pandas.DataFrame
        One row per essay with the columns ``n_comma``, ``n_apostrophe``
        (straight ``'`` only), ``n_period``, ``n_exclamation`` and
        ``n_question``.
    """
    # Each mark is a single character, so str.count yields exactly the counts
    # the former one-character-per-token regex tokenizer produced, without
    # rebuilding a tokenizer and tokenizing the whole text for every essay.
    marks = (',', "'", '.', '!', '?')
    result = [
        [text.count(mark) for mark in marks]
        for text in tqdm(data['Text'].tolist())
    ]
    return pd.DataFrame(result, columns=['n_comma', 'n_apostrophe', 'n_period', 'n_exclamation', 'n_question'])


def get_features_three(data):
    """Compute per-essay averages of constituency-parse statistics.

    Each essay is parsed with a Stanza pipeline (tokenize, pos, constituency).
    For every sentence the parse-tree height and the number of NP, VP and
    SBAR constituents are measured and then averaged over the essay's
    sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Text'`` column with one essay per row.

    Returns
    -------
    pandas.DataFrame
        One row per essay with the columns ``avg_tree_height``,
        ``avg_num_NPs``, ``avg_num_VPs`` and ``avg_num_SBARs``. An essay for
        which the pipeline returns no sentences gets ``0.0`` in every column
        instead of raising ``ZeroDivisionError``.
    """
    # Count constituent labels, i.e. the label directly after an opening
    # bracket in the bracketed tree string. A bare substring search for "NP"
    # also hits the POS tags NNP/NNPS (and "SBAR" hits SBARQ), as well as
    # upper-case leaf words containing those letters.
    np_label = re.compile(r'\(NP\b')
    vp_label = re.compile(r'\(VP\b')
    sbar_label = re.compile(r'\(SBAR\b')

    result = []
    nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, constituency', use_gpu=True, pos_batch_size=3000)
    for text in tqdm(data['Text'].tolist()):
        sentences = nlp(text).sentences
        if not sentences:
            # Nothing to average over (e.g. empty or whitespace-only text).
            result.append([0.0, 0.0, 0.0, 0.0])
            continue

        height_total = 0
        np_total = 0
        vp_total = 0
        sbar_total = 0
        for sentence in sentences:
            tree = sentence.constituency
            bracketed = str(tree)  # build the string form once per sentence
            height_total += tree_height(tree)
            np_total += len(np_label.findall(bracketed))
            vp_total += len(vp_label.findall(bracketed))
            sbar_total += len(sbar_label.findall(bracketed))

        num_sentences = len(sentences)
        result.append([
            height_total / num_sentences,
            np_total / num_sentences,
            vp_total / num_sentences,
            sbar_total / num_sentences,
        ])
    return pd.DataFrame(result, columns=['avg_tree_height', 'avg_num_NPs', 'avg_num_VPs', 'avg_num_SBARs'])

# average word frequency in brown corpus, cohesion
def get_features_four(data):
    """Compute Brown-corpus word frequency and connective-based cohesion.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Text'`` column with one essay per row.

    Returns
    -------
    pandas.DataFrame
        One row per essay with two columns:

        ``frequency``
            Sum of the Brown-corpus counts of the essay's (lower-cased,
            Porter-stemmed) tokens divided by the number of tokens.
        ``cohesion``
            Number of words covered by connective matches (each match
            contributes the connective's word count; overlapping connectives
            such as ``'and'`` / ``'and yet'`` are each counted) divided by the
            number of tokens.

        An essay that tokenizes to nothing gets ``0.0`` for both values
        instead of raising ``ZeroDivisionError``.
    """
    connectives_list = [
        'accordingly', 'conversely', 'to the right', 'soon', 'presently', 'after', 'also',
        'because of this', 'gradually', 'hence', 'to the left', 'then', 'but', 'and', 'of equal importance',
        'afterward', 'still', 'briefly', 'on the other side', 'across the hall', 'adjacent to',
        'as soon as', 'for this purpose', 'yet', 'and yet', 'in spite of this', 'as a result',
        'as a consequence', 'here', 'at last', 'directly ahead', 'before', 'beyond', 'on the other hand',
        'to repeat', 'at length', 'in the same way', 'such as', 'the next months', 'with this in mind',
        'below', 'just as important', 'as you turn right', 'in short', 'second', 'when', 'last of all',
        'in contrast', 'equally important', 'subsequently', 'consequently', 'from here on', 'furthermore',
        'thus', 'on the following day', 'next', 'ultimately', 'as you can see', 'further', 'behind',
        'besides', 'to be specific', 'finally', 'on the whole', 'to illustrate', 'in the meantime',
        'nearby', 'similarly', 'as I have said', 'nonetheless', 'at this point', 'to this end',
        'in the end', 'at the top', 'in addition', 'for example', 'in the background', 'thereafter',
        'the next week', 'lastly', 'for instance', 'or', 'in conclusion', 'after a short time', 'like',
        'the next day', 'since', 'along the wall', 'first', 'there', 'nevertheless', 'too', 'opposite',
        'above', 'as so', 'moreover', 'in fact', 'in the same manner', 'last', 'therefore', 'on the contrary',
        'however', 'so', 'now', 'to begin with', 'another', 'a minute later', 'meanwhile', 'to sum up',
        'actually', 'for this reason', 'later', 'in summary'
    ]

    # Compile every connective once, paired with its word count. The essay is
    # lower-cased before matching, so the connective must be lower-cased too;
    # otherwise 'as I have said' could never match.
    connective_patterns = [
        (len(connective.split(' ')), re.compile(r'\b{}\b'.format(re.escape(connective.lower()))))
        for connective in connectives_list
    ]

    stemmer = PorterStemmer()
    # Count lower-cased surface forms first, then stem each distinct form once
    # and add up its occurrences. This yields the same stem counts as stemming
    # every corpus token individually.
    surface_counts = Counter(
        word.lower()
        for category in brown.categories()
        for word in brown.words(categories=category)
    )
    word_counts = Counter()
    for surface, count in surface_counts.items():
        word_counts[stemmer.stem(surface)] += count

    result = []
    for text in tqdm(data['Text'].tolist()):
        words = word_tokenize(text)
        if not words:
            # No tokens: both ratios are undefined, report zeros.
            result.append([0.0, 0.0])
            continue

        # Counter returns 0 for stems that never occur in Brown.
        total_frequency = sum(word_counts[stemmer.stem(word.lower())] for word in words)

        lowered_text = text.lower()
        total_connectives = sum(
            num_words * len(pattern.findall(lowered_text))
            for num_words, pattern in connective_patterns
        )

        cohesion = total_connectives/len(words)
        frequency = total_frequency/len(words)
        result.append([frequency, cohesion])

    return pd.DataFrame(result, columns=['frequency', 'cohesion'])

# n-gram overlap coherence, redundancy of nouns coherence
def get_features_five(data):
    """Compute coherence scores from overlap between adjacent sentences.

    The lower-cased essay is split into sentences; each sentence is tokenized,
    lemmatized and POS-tagged. For every pair of consecutive sentences two
    values are computed and then averaged over all pairs of the essay:

    * n-gram overlap: for n = 1, 2, 3 the number of shared n-grams (multiset
      intersection) divided by the total number of n-grams of that order in
      both sentences; the three ratios are summed.
    * noun overlap: shared distinct nouns (tags starting with ``NN``) divided
      by the distinct nouns of both sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Text'`` column with one essay per row.

    Returns
    -------
    pandas.DataFrame
        One row per essay with the columns ``ngram_overlap_score`` and
        ``nouns_overlap_score``; both are ``0.0`` for essays with fewer than
        two sentences.
    """
    result = []
    lemmatizer = WordNetLemmatizer()
    for text in tqdm(data['Text'].tolist()):
        sentences = sent_tokenize(text.lower())

        # Analyse every sentence once; each inner sentence takes part in two
        # adjacent pairs and was previously lemmatized and tagged twice.
        sentence_tokens = [
            [lemmatizer.lemmatize(word) for word in word_tokenize(sentence)]
            for sentence in sentences
        ]
        sentence_nouns = [
            {token for token, pos in pos_tag(tokens) if pos.startswith("NN")}
            for tokens in sentence_tokens
        ]

        overlaps = []
        nouns_overlaps = []
        for i in range(len(sentences)-1):
            tokens1, tokens2 = sentence_tokens[i], sentence_tokens[i+1]
            nouns1, nouns2 = sentence_nouns[i], sentence_nouns[i+1]

            nouns_redundancy = 0.0
            total_nouns = nouns1 | nouns2
            if len(total_nouns) > 0:
                nouns_redundancy = len(nouns1 & nouns2)/len(total_nouns)

            redundancy = 0.0
            for order in (1, 2, 3):
                grams1 = list(ngrams(tokens1, order))
                grams2 = list(ngrams(tokens2, order))
                # Normalise by the number of n-grams of this same order. The
                # bigram and trigram overlaps used to be divided by the
                # unigram total.
                total_ngrams = len(grams1) + len(grams2)
                if total_ngrams > 0:
                    overlap_count = sum((Counter(grams1) & Counter(grams2)).values())
                    redundancy += overlap_count / total_ngrams

            overlaps.append(redundancy)
            nouns_overlaps.append(nouns_redundancy)

        ngram_overlap_score = 0.0
        nouns_overlap_score = 0.0
        if len(overlaps) > 0:
            ngram_overlap_score = sum(overlaps)/len(overlaps)
        if len(nouns_overlaps) > 0:
            nouns_overlap_score = sum(nouns_overlaps)/len(nouns_overlaps)
        result.append([ngram_overlap_score, nouns_overlap_score])
    return pd.DataFrame(result, columns=['ngram_overlap_score', 'nouns_overlap_score'])

# # of errors, readability (Flesch, Coleman-Liau, ARI, Kincaid, FOG, Lix, and SMOG), corpus similarity
def get_features_six(data):
    """Compute error count, readability indices and Brown-corpus divergence.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Text'`` column with one essay per row.

    Returns
    -------
    pandas.DataFrame
        One row per essay with the columns ``num_errors`` (number of matches
        returned by LanguageTool for ``en-US``), the seven ``textstat``
        readability values, and ``corpus_similarity``: the sum over the
        essay's distinct tokens of ``p_doc * log2(p_doc / p_brown)``, where a
        token missing from the Brown corpus is given probability ``1e-10``.
    """
    column_names = ['num_errors', 'flesch_reading_ease', 'coleman_liau_index', 'automated_readability_index', 'flesch_kincaid_grade', 'gunning_fog', 'lix', 'smog_index', 'corpus_similarity']

    checker = LanguageTool('en-US')

    # Relative frequency of every (case-sensitive) token in the Brown corpus.
    reference_tokens = brown.words()
    reference_counts = FreqDist(reference_tokens)
    reference_size = len(reference_tokens)
    reference_probs = {}
    for token, occurrences in reference_counts.items():
        reference_probs[token] = occurrences / reference_size

    rows = []
    for essay in tqdm(data['Text'].tolist()):
        row = [
            len(checker.check(essay)),
            textstat.flesch_reading_ease(essay),
            textstat.coleman_liau_index(essay),
            textstat.automated_readability_index(essay),
            textstat.flesch_kincaid_grade(essay),
            textstat.gunning_fog(essay),
            textstat.lix(essay),
            textstat.smog_index(essay),
        ]

        essay_tokens = nltk.word_tokenize(essay)
        essay_size = len(essay_tokens)
        divergence_terms = []
        for token, occurrences in FreqDist(essay_tokens).items():
            essay_prob = occurrences / essay_size
            # Tokens unseen in Brown fall back to a tiny probability.
            reference_prob = reference_probs.get(token, 1e-10)
            divergence_terms.append(essay_prob * np.log2(essay_prob / reference_prob))

        row.append(sum(divergence_terms))
        rows.append(row)

    return pd.DataFrame(rows, columns=column_names)

def generate_independent_features(data):
    """Run all six feature extractors on ``data`` and join their columns.

    The extractors are called in order (one through six) and their result
    frames are concatenated side by side (``axis=1``).
    """
    extractors = (
        get_features_one,
        get_features_two,
        get_features_three,
        get_features_four,
        get_features_five,
        get_features_six,
    )
    feature_frames = [extract(data) for extract in extractors]
    return pd.concat(feature_frames, axis=1)

In [6]:
# Load the twelve prompt CSV files. The empty prefix makes load_data read
# 'prompt_1.csv' ... 'prompt_12.csv' relative to the current working directory.
prompt_1, prompt_2, prompt_3, prompt_4, prompt_5, prompt_6, prompt_7, prompt_8, prompt_9, prompt_10, prompt_11, prompt_12 = load_data("")

In [9]:
# Collect the twelve prompt frames in prompt order so they can be processed in a loop.
prompts = [prompt_1, prompt_2, prompt_3, prompt_4, prompt_5, prompt_6, prompt_7, prompt_8, prompt_9, prompt_10, prompt_11, prompt_12]

In [None]:
# Extract the prompt-independent features of every prompt and write them to
# 'prompt_<N>_features_independent.csv', where N is the 1-based prompt number.
for counter, prompt in enumerate(prompts, start=1):
    features_independent = generate_independent_features(prompt)
    features_independent.to_csv('prompt_'+str(counter)+'_features_independent.csv', index=False)