In [1]:
# !pip -q install python-crfsuite
# !pip -q install category_encoders
# !pip -q install flair
# !pip -q install textstat
# !pip -q install pyspellchecker
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('words')

In [2]:
import pycrfsuite
import string
import nltk, re, math
from nltk import tag
import pandas as pd
import numpy as np
import textstat
import collections, itertools
from spellchecker import SpellChecker
from flair.data import Sentence
from flair.models import SequenceTagger
# Shorthand for flattening one level of nesting; used by tokenNormalizeText.
flatten = itertools.chain.from_iterable
# flair sequence tagger loaded with the 'pos' model name; used by flairPOSTag.
TAGGER = SequenceTagger.load('pos')
# CRF tagger opened on a model file relative to the working directory;
# used further down to produce the 'diag_act' column.
tagger = pycrfsuite.Tagger()
tagger.open('model/model.crf.tagger')
# Cache filled by flairPOSTag: raw input -> list of POS tag strings.
# It is rebound later in the notebook to a dictionary loaded from disk.
POS_DICTIONARY = {}
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.corpus import wordnet
# Set of ASCII punctuation characters.
punct = set(string.punctuation)
lemmatizer = nltk.WordNetLemmatizer()
# NOTE: this rebinds `stopwords` from the NLTK corpus reader imported above
# to a plain list of English stop words.
stopwords = stopwords.words('english')
porter = PorterStemmer()
SPELL = SpellChecker()
# Register contraction suffixes with the spell checker's word list
# (presumably so tokens such as "'s" are not reported as misspellings).
SPELL.word_frequency.load_words(["'s", "'m", "'re", "'ll" , "'ve", "'t", "'d"])

I1211 11:40:04.252305 139852714387264 file_utils.py:40] PyTorch version 1.3.1 available.


2019-12-11 11:40:04,285 loading file /home/nehas/.flair/models/en-pos-ontonotes-v0.4.pt


In [3]:
def lemmatize(token, tag):
    """Lemmatize ``token`` with WordNet, guided by its part-of-speech tag.

    Args:
        token: the word to lemmatize.
        tag: a Penn Treebank style POS tag (e.g. from ``nltk.pos_tag``).
            Only its first character is inspected.

    Returns:
        The lemma produced by the module-level ``lemmatizer``. Tags that are
        empty or do not start with N/V/R/J are treated as nouns.
    """
    # Penn Treebank verb tags are VB, VBD, VBG, ... so verbs must be keyed on
    # 'V'. The previous key 'B' matched no tag, so every verb fell through to
    # the noun default and was left un-lemmatized ("running" stayed "running").
    wordnet_pos = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ
    }.get(tag[:1], wordnet.NOUN)  # tag[:1] tolerates an empty tag string

    return lemmatizer.lemmatize(token, wordnet_pos)

def tokenNormalizeText(text):
    """Reduce ``text`` to a space-joined string of lowercase, lemmatized words.

    Steps: strip quote characters, collapse whitespace, separate punctuation
    from adjacent word characters, cap character repetitions at three, split
    into tokens, keep only ASCII letters, drop English stop words, then
    lemmatize each remaining token using its NLTK POS tag.

    Args:
        text: the input string.

    Returns:
        A single string of lemmas separated by one space ('' if nothing is left).
    """
    # Remove distracting single quotes
    text = re.sub(r"\'", "", text)
    # Remove distracting double quotes
    text = re.sub(r'\"', "", text)
    # Collapse whitespace runs (including new lines) into a single space
    text = re.sub(r'\s+', ' ', text)
    # Insert a space between a word character and neighbouring punctuation
    text = re.sub(r"(\w)([.,;:!?'/\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'/\"“\(])(\w)", r"\1 \2", text)
    # Cap any non-space character repeated 3+ times at exactly three
    text = re.sub(r"(\S)\1\1+", r"\1\1\1", text)

    # Split around digit runs first, then on whitespace. The pattern is a raw
    # string: '\d' in a plain literal is an invalid escape sequence.
    pieces = flatten(re.split(r"\s+", part) for part in re.split(r'(\d+)', text))
    # Strip everything except ASCII letters and lowercase the remainder
    tokens = [re.sub(r'[^A-Za-z]+', '', piece).lower() for piece in pieces]
    # After stripping, a token is either empty or letters only, so a plain
    # truthiness test replaces the former substring check against ' '.
    tokens = [t for t in tokens if t and t not in stopwords]

    tagged = nltk.pos_tag(nltk.wordpunct_tokenize(' '.join(tokens)))
    return ' '.join(lemmatize(token, pos) for token, pos in tagged)

def normalizeText(text):
    """Lowercase ``text`` and re-join its NLTK word tokens with single spaces.

    ``text`` is coerced with ``str`` first, so non-string values are accepted.
    """
    lowered = str(text).lower()
    return ' '.join(nltk.word_tokenize(lowered))

In [4]:
def createDiagActFeatures(sentence):
    """Build the feature sequence handed to the CRF tagger for one sentence.

    For sentences of at least two whitespace-separated words the feature list
    holds, in order: 'TOKEN_<tok>' per NLTK token, 'POS_<tag>' per NLTK token,
    every whitespace-separated word, and a final '/'. Shorter sentences yield
    an empty feature list.

    Returns:
        ``[[features]]`` - the feature list wrapped in two outer lists.
    """
    features = []
    if len(sentence.split()) < 2:
        return [[features]]

    tagged = tag.pos_tag(nltk.word_tokenize(sentence))
    features.extend('TOKEN_' + pair[0] for pair in tagged)
    features.extend('POS_' + pair[1] for pair in tagged)
    features.extend(sentence.split())
    features.append('/')
    return [[features]]

In [5]:
def flairPOSTag(words):
    """Return the list of flair POS tags for ``words``, memoised in POS_DICTIONARY.

    The cache is keyed by the raw ``words`` value. On a miss the text is
    coerced to ``str``, a space is added after every ',' and '.', and the
    result is tagged with the module-level ``TAGGER``.
    """
    if words in POS_DICTIONARY:
        return POS_DICTIONARY[words]

    spaced = str(words).replace(',', ', ').replace('.', '. ')
    sentence = Sentence(spaced, use_tokenizer=True)
    TAGGER.predict(sentence)
    postags = [token.get_tag('pos').value for token in sentence]

    POS_DICTIONARY[words] = postags
    return postags

In [6]:
def calcFMeasure(text):
    """Compute the F-measure score of ``text`` from its POS tags.

    The score is
        0.5 * ((noun% + adj% + prep% + art%) - (pron% + verb% + adv% + int%) + 100)
    where each percentage is the share of that word class among all tokens
    whose tag is not in the punctuation/symbol exclusion list below.

    Args:
        text: the input passed on to ``flairPOSTag``.

    Returns:
        The score as a float. When the text has no countable tokens (empty or
        punctuation only) every percentage is taken as 0, which gives 50.0;
        previously this case raised ZeroDivisionError.
    """
    tag_groups = {
        'noun': ('NN', 'NNS', 'NNP', 'NNPS'),
        'adj': ('JJ', 'JJR', 'JJS'),
        'prep': ('IN',),
        'art': ('DET', 'DT', 'PDT', 'WDT'),
        'pron': ('PRP', 'PRP$', 'WP', 'WP$'),
        'verb': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adv': ('RB', 'RBR', 'RBS', 'WRB'),
        'int': ('UH',),
    }
    # Reverse lookup: POS tag -> word class (the tag sets are disjoint).
    group_of = {pos: group for group, tags in tag_groups.items() for pos in tags}
    # Tags that do not count as words when computing percentages.
    not_counted = {'$', "'", '(', ')', ',', '-', '.', ':', 'SYM', "''", '``'}

    freq = dict.fromkeys(tag_groups, 0)
    count = 0
    for pos in flairPOSTag(text):
        group = group_of.get(pos)
        if group is not None:
            freq[group] += 1
        if pos not in not_counted:
            count += 1

    if count == 0:
        # No countable tokens: all frequencies are zero, so the formula
        # collapses to 0.5 * 100.
        return 50.0

    for key in freq:
        freq[key] = (freq[key] / count) * 100

    fmeasure = 0.5 * ( (freq['noun'] + freq['adj'] + freq['prep'] + freq['art']) - (freq['pron'] + freq['verb'] + freq['adv'] + freq['int']) + 100 )

    return fmeasure

In [7]:
def POSFeatures(text):
    """Return the POS tags of ``text`` (via flairPOSTag) as one space-joined string."""
    return ' '.join(flairPOSTag(text))

def POSTaggedFeatures(text):
    """Return ``text`` as space-joined '<token>_<POS>' pairs.

    Tags come from flairPOSTag (possibly from its cache); tokens come from a
    fresh flair ``Sentence`` built with the same ',' / '.' spacing. Tokens are
    looked up by position, one per tag.
    """
    tags = flairPOSTag(text)
    spaced = str(text).replace(',', ', ').replace('.', '. ')
    sentence = Sentence(spaced, use_tokenizer=True)

    pairs = [sentence[idx].text + '_' + pos for idx, pos in enumerate(tags)]
    return ' '.join(pairs)

In [8]:
def textTokenizer(text):
    """Strip punctuation from ``text`` and split it into NLTK word tokens.

    '/', '%', '-', '.' and '_' become spaces; ',()!;$?:~*' and double quotes
    are deleted; a single quote next to a space (a quotation mark rather than
    an in-word apostrophe) becomes a space. In-word apostrophes are kept so
    that ``nltk.word_tokenize`` can split contractions.

    Args:
        text: the input string.

    Returns:
        A list of token strings.
    """
    # The hyphen is escaped so it is a literal '-'. Unescaped, '%-.' is the
    # character range '%'..'.', which also blanked & ' ( ) * + , and made the
    # quote handling below unreachable.
    text = re.sub(r"[/%\-._]", " ", text)
    text = re.sub(r"[,()!;$?:~*]", "", text)
    text = text.replace('"', '')
    # Replace with a space (not ''): removing the quote together with its
    # neighbouring space would glue the surrounding words together.
    text = text.replace(" '", ' ')
    text = text.replace("' ", ' ')
    return nltk.word_tokenize(text)

In [9]:
def GetGenderPreferentialFeatures(text):
    """Count suffix-based and apology-word features in ``text``.

    Args:
        text: the input string; it is tokenized with ``textTokenizer`` and
            each token is lowercased.

    Returns:
        A list of 10 ints. Positions 0-8 count tokens ending in 'able', 'al',
        'ful', 'ible', 'ic', 'ive', 'less', 'ly', 'ous' (a token is counted
        for the first matching suffix only). Position 9 counts tokens found in
        the apology/sorrow word list, independently of the suffix counts.
    """
    suffixes = ('able', 'al', 'ful', 'ible', 'ic', 'ive', 'less', 'ly', 'ous')
    # A missing comma used to merge 'softened' and 'sad' into 'softenedsad',
    # so neither word was ever matched. 'grieved' is added alongside the
    # original misspelling 'greived', which is kept for compatibility.
    sorry_words = {'sorry', 'penitent', 'contrite', 'repentant', 'remorseful', 'regretful',
                   'compunctious', 'touched', 'melted', 'sorrowful', 'apologetic', 'softened',
                   'sad', 'greived', 'grieved', 'mournful'}

    f = [0] * 10
    for word in textTokenizer(text):
        word = word.lower()
        for idx, suffix in enumerate(suffixes):
            if word.endswith(suffix):
                f[idx] += 1
                break  # first matching suffix wins, as in an if/elif chain
        if word in sorry_words:
            f[9] += 1
    return f

In [10]:
def GetFactorAnalysis(text):
    """Count, for each of 23 word lists, how many tokens of ``text`` it contains.

    ``text`` is tokenized with ``textTokenizer`` and lowercased. A token that
    appears in several lists is counted once in each of them.

    Returns:
        A list of 23 ints, one per word list, in the order given below.
    """
    words_in_factor = [
        # 0: Conversation
        ['know', 'people', 'think', 'person', 'tell', 'feel', 'friends', 'talk', 'new', 'talking', 'mean', 'ask',
         'understand', 'feelings', 'care', 'thinking', 'friend', 'relationship', 'realize', 'question', 'answer',
         'saying'],
        # 1: AtHome
        ['woke', 'home', 'sleep', 'today', 'eat', 'tired', 'wake', 'watch', 'watched', 'dinner', 'ate', 'bed', 'day',
         'house', 'tv', 'early', 'boring', 'yesterday', 'watching', 'sit'],
        # 2: Family
        ['years', 'family', 'mother', 'children', 'father', 'kids', 'parents', 'old', 'year', 'child', 'son',
         'married', 'sister', 'dad', 'brother', 'moved', 'age', 'young', 'months', 'three', 'wife', 'living',
         'college', 'four', 'high', 'five', 'died', 'six', 'baby', 'boy', 'spend', 'christmas'],
        # 3: Time
        ['friday', 'saturday', 'weekend', 'week', 'sunday', 'night', 'monday', 'tuesday', 'thursday', 'wednesday',
         'morning', 'tomorrow', 'tonight', 'evening', 'days', 'afternoon', 'weeks', 'hours', 'july', 'busy',
         'meeting', 'hour', 'month', 'june'],
        # 4: Work
        ['work', 'working', 'job', 'trying', 'right', 'met', 'figure', 'meet', 'start', 'better', 'starting', 'try',
         'worked', 'idea'],
        # 5: PastActions
        ['said', 'asked', 'told', 'looked', 'walked', 'called', 'talked', 'wanted', 'kept', 'took', 'sat', 'gave',
         'knew', 'felt', 'turned', 'stopped', 'saw', 'ran', 'tried', 'picked', 'left', 'ended'],
        # 6: Games
        ['game', 'games', 'team', 'win', 'play', 'played', 'playing', 'won', 'season', 'beat', 'final', 'two', 'hit',
         'first', 'video', 'second', 'run', 'star', 'third', 'shot', 'table', 'round', 'ten', 'chance', 'club',
         'big', 'straight'],
        # 7: Internet
        ['site', 'email', 'page', 'please', 'website', 'web', 'post', 'link', 'check', 'blog', 'mail', 'information',
         'free', 'send', 'comments', 'comment', 'using', 'internet', 'online', 'name', 'service', 'list',
         'computer', 'add', 'thanks', 'update', 'message'],
        # 8: Location
        ['street', 'place', 'town', 'road', 'city', 'walking', 'trip', 'headed', 'front', 'car', 'beer', 'apartment',
         'bus', 'area', 'park', 'building', 'walk', 'small', 'places', 'ride', 'driving', 'looking', 'local',
         'sitting', 'drive', 'bar', 'bad', 'standing', 'floor', 'weather', 'beach', 'view'],
        # 9: Fun
        ['fun', 'im', 'cool', 'mom', 'summer', 'awesome', 'lol', 'stuff', 'pretty', 'ill', 'mad', 'funny', 'weird'],
        # 10: Food/Clothes
        ['food', 'eating', 'weight', 'lunch', 'water', 'hair', 'life', 'white', 'wearing', 'color', 'ice', 'red',
         'fat', 'body', 'black', 'clothes', 'hot', 'drink', 'wear', 'blue', 'minutes', 'shirt', 'green', 'coffee',
         'total', 'store', 'shopping'],
        # 11: Poetic
        ['eyes', 'heart', 'soul', 'pain', 'light', 'deep', 'smile', 'dreams', 'dark', 'hold', 'hands', 'head', 'hand',
         'alone', 'sun', 'dream', 'mind', 'cold', 'fall', 'air', 'voice', 'touch', 'blood', 'feet', 'words', 'hear',
         'rain', 'mouth'],
        # 12: Books/Movies
        ['book', 'read', 'reading', 'books', 'story', 'writing', 'written', 'movie', 'stories', 'movies', 'film',
         'write', 'character', 'fact', 'thoughts', 'title', 'short', 'take', 'wrote'],
        # 13: Religion
        ['god', 'jesus', 'lord', 'church', 'earth', 'world', 'word', 'lives', 'power', 'human', 'believe', 'given',
         'truth', 'thank', 'death', 'evil', 'own', 'peace', 'speak', 'bring', 'truly'],
        # 14: Romance
        ['forget', 'forever', 'remember', 'gone', 'true', 'face', 'spent', 'times', 'love', 'cry', 'hurt', 'wish',
         'loved'],
        # 15: Swearing
        ['shit', 'fuck', 'fucking', 'ass', 'bitch', 'damn', 'hell', 'sucks', 'stupid', 'hate', 'drunk', 'crap',
         'kill', 'guy', 'gay', 'kid', 'sex', 'crazy', 'cunt', 'nigger', 'nigga', 'asshole', 'pussy', 'dick',
         'dickhead', 'faggot', 'fag'],
        # 16: Politics
        ['bush', 'president', 'iraq', 'kerry', 'war', 'american', 'political', 'states', 'america', 'country',
         'government', 'john', 'national', 'news', 'state', 'support', 'issues', 'article', 'michael', 'bill',
         'report', 'public', 'issue', 'history', 'party', 'york', 'law', 'major', 'act', 'fight', 'poor'],
        # 17: Music
        ['music', 'songs', 'song', 'band', 'cd', 'rock', 'listening', 'listen', 'show', 'favorite', 'radio', 'sound',
         'heard', 'shows', 'sounds', 'amazing', 'dance'],
        # 18: School
        ['school', 'teacher', 'class', 'study', 'test', 'finish', 'english', 'students', 'period', 'paper', 'pass'],
        # 19: Business
        ['system', 'based', 'process', 'business', 'control', 'example', 'personal', 'experience', 'general'],
        # 20: Positive
        ['absolutely', 'abundance', 'ace', 'active', 'admirable', 'adore', 'agree', 'amazing', 'appealing',
         'attraction', 'bargain', 'beaming', 'beautiful', 'best', 'better', 'boost', 'breakthrough', 'breeze',
         'brilliant', 'brimming', 'charming', 'clean', 'clear', 'colorful', 'compliment', 'confidence', 'cool',
         'courteous', 'cuddly', 'dazzling', 'delicious', 'delightful', 'dynamic', 'easy', 'ecstatic', 'efficient',
         'enhance', 'enjoy', 'enormous', 'excellent', 'exotic', 'expert', 'exquisite', 'flair', 'free', 'generous',
         'genius', 'great', 'graceful', 'heavenly', 'ideal', 'immaculate', 'impressive', 'incredible', 'inspire',
         'luxurious', 'outstanding', 'royal', 'speed', 'splendid', 'spectacular', 'superb', 'sweet', 'sure',
         'supreme', 'terrific', 'treat', 'treasure', 'ultra', 'unbeatable', 'ultimate', 'unique', 'wow', 'zest'],
        # 21: Negative
        ['wrong', 'stupid', 'bad', 'evil', 'dumb', 'foolish', 'grotesque', 'harm', 'fear', 'horrible', 'idiot',
         'lame', 'mean', 'poor', 'heinous', 'hideous', 'deficient', 'petty', 'awful', 'hopeless', 'fool', 'risk',
         'immoral', 'risky', 'spoil', 'spoiled', 'malign', 'vicious', 'wicked', 'fright', 'ugly', 'atrocious',
         'moron', 'hate', 'spiteful', 'meager', 'malicious', 'lacking'],
        # 22: Emotion
        ['aggressive', 'alienated', 'angry', 'annoyed', 'anxious', 'careful', 'cautious', 'confused', 'curious',
         'depressed', 'determined', 'disappointed', 'discouraged', 'disgusted', 'ecstatic', 'embarrassed',
         'enthusiastic', 'envious', 'excited', 'exhausted', 'frightened', 'frustrated', 'guilty', 'happy',
         'helpless', 'hopeful', 'hostile', 'humiliated', 'hurt', 'hysterical', 'innocent', 'interested', 'jealous',
         'lonely', 'mischievous', 'miserable', 'optimistic', 'paranoid', 'peaceful', 'proud', 'puzzled',
         'regretful', 'relieved', 'sad', 'satisfied', 'shocked', 'shy', 'sorry', 'surprised', 'suspicious',
         'thoughtful', 'undecided', 'withdrawn'],
    ]

    lowered_tokens = [token.lower() for token in textTokenizer(text)]
    return [sum(1 for token in lowered_tokens if token in vocabulary)
            for vocabulary in words_in_factor]

In [11]:
def GetTextStatInfo(text):
    """Return ``(lexicon_count, text_standard)`` for ``text`` as computed by textstat.

    The lexicon count is taken with punctuation removed; the text standard is
    requested as a float.
    """
    return (
        textstat.lexicon_count(text, removepunct=True),
        textstat.text_standard(text, float_output=True),
    )

In [12]:
def GetNumberOfIncorrectSpelling(text):
    """Return how many distinct tokens of ``text`` the spell checker does not know.

    Tokens come from ``textTokenizer``; the empty string is never counted.
    """
    unknown_words = SPELL.unknown(textTokenizer(text))
    unknown_words.discard('')
    return len(unknown_words)

In [13]:
# Replace the empty POS cache with one loaded from disk (presumably the dict
# written by the commented-out np.save call further down - TODO confirm).
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a
# file that was produced locally by this notebook.
POS_DICTIONARY = np.ndarray.tolist(np.load("data/pos_dict.npy",allow_pickle=True))
# Load the training rows; the three column names are supplied explicitly.
df = pd.read_csv("train_test/training.csv",names=['text','character','gender'])

In [14]:
# ---- Build the feature table for the rows currently held in `df` ----

# Normalised text variants
df['text_norm'] = df.apply(lambda x: normalizeText(str(x.text)), axis=1)
df['token_text_norm'] = df.apply(lambda x: tokenNormalizeText(str(x.text)), axis=1)

# POS-based features (flairPOSTag caches its results in POS_DICTIONARY)
df['POS'] = df.apply(lambda x: POSFeatures(x.text), axis=1)
df['POS_tagged'] = df.apply(lambda x: POSTaggedFeatures(x.text), axis=1)
df['f_measure'] = df.apply(lambda x: calcFMeasure(x.text), axis=1)
# np.save("data/pos_dict.npy", POS_DICTIONARY)

# Size features
df['word_count'] = df.apply(lambda x: len(textTokenizer(str(x.text))), axis=1)
df['length'] = df.apply(lambda x: len(str(x.text)), axis=1)

# Suffix / apology-word counts, expanded into GPF0..GPF9
df['gf'] = df.apply(lambda x: GetGenderPreferentialFeatures(str(x.text)), axis=1)
df[['GPF{}'.format(i) for i in range(10)]] = pd.DataFrame(df.gf.values.tolist(), index=df.index)

# Word-list counts, expanded into F0..F22
df['fa'] = df.apply(lambda x: GetFactorAnalysis(str(x.text)), axis=1)
df[['F{}'.format(i) for i in range(23)]] = pd.DataFrame(df.fa.values.tolist(), index=df.index)

# Label from the CRF tagger; only the 8 most frequent labels are kept, every
# other label is folded into 'othr' before one-hot encoding.
df['diag_act'] = df.apply(lambda x: [tagger.tag(xseq) for xseq in createDiagActFeatures(str(x.text))][0][0], axis=1)
mostCommonDiagAct = collections.Counter(df.diag_act.values.tolist()).most_common(8)
mostCommonDiagAct = [i[0] for i in mostCommonDiagAct]
def helper(a):
    """Return ``a`` if it is one of the most common labels, else 'othr'."""
    if a not in mostCommonDiagAct:
        return 'othr'
    else: return a
df['diag_act'] = df.apply(lambda x: helper(x.diag_act), axis=1)
df = pd.get_dummies(df, columns=["diag_act"], prefix=["diag_act"])

# textstat statistics: computed once per row, then split into two columns
# (previously GetTextStatInfo ran twice for every row).
text_stats = [GetTextStatInfo(str(t)) for t in df['text']]
df['LE_C'] = [stats[0] for stats in text_stats]
df['TS'] = [stats[1] for stats in text_stats]

df['mispelled'] = df.apply(lambda x: GetNumberOfIncorrectSpelling(str(x.text)), axis=1)

# Encode the label: 'male' -> 1, anything else -> -1.
# NOTE: not idempotent - re-running this cell on the already encoded column
# maps every row to -1.
df['gender'] = [1 if x =='male' else -1 for x in df['gender']]


df.drop(['gf','fa','character'],axis=1).to_csv('data/train_gender.csv',index=False)
df.drop(['gf','fa','gender'],axis=1).to_csv('data/train_character.csv',index=False)

# Write the gender table to Excel with XlsxWriter as the engine. The context
# manager closes the writer (and writes the file) on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter('data/training_data.xlsx', engine='xlsxwriter') as writer:
    df.drop(['gf','fa','character'],axis=1).to_excel(writer, sheet_name='Sheet1')


In [15]:
from MinePOSPats import MinePOSPats

# Mine POS patterns from the 'POS' column of the current frame.
pos_list = df['POS'].values.tolist()
mine_obj = MinePOSPats(pos_list, 0.3, 0.2)
pos_pats = mine_obj.MinePOSPats()

# Write POS Patterns to Text: one space-joined pattern per line
with open('data/POSPatterns.txt', 'w') as file:
    patterns = [' '.join(pos_pat) for pos_pat in pos_pats]
    file.write('\n'.join(patterns))

Start POS Mining
Total Amount of Documents: 10113
Total Amount of Unique POS: 19
Minimum Support: 3033
Minimum Adherence: 0.20
Stopped at k = 7
Extracted POS Patterns: 222


In [16]:
# Load train_test/test.csv into `df`, replacing the training frame that was
# held under the same name. Column names are supplied explicitly.
df = pd.read_csv("train_test/test.csv",names=['text','character','gender'])

In [17]:
# ---- Build the feature table for the test rows currently held in `df` ----

# Normalised text variants
df['text_norm'] = df.apply(lambda x: normalizeText(str(x.text)), axis=1)
df['token_text_norm'] = df.apply(lambda x: tokenNormalizeText(str(x.text)), axis=1)

# POS-based features (flairPOSTag caches its results in POS_DICTIONARY)
df['POS'] = df.apply(lambda x: POSFeatures(x.text), axis=1)
df['POS_tagged'] = df.apply(lambda x: POSTaggedFeatures(x.text), axis=1)
df['f_measure'] = df.apply(lambda x: calcFMeasure(x.text), axis=1)
# np.save("data/pos_dict.npy", POS_DICTIONARY)

# Size features
df['word_count'] = df.apply(lambda x: len(textTokenizer(str(x.text))), axis=1)
df['length'] = df.apply(lambda x: len(str(x.text)), axis=1)

# Suffix / apology-word counts, expanded into GPF0..GPF9
df['gf'] = df.apply(lambda x: GetGenderPreferentialFeatures(str(x.text)), axis=1)
df[['GPF{}'.format(i) for i in range(10)]] = pd.DataFrame(df.gf.values.tolist(), index=df.index)

# Word-list counts, expanded into F0..F22
df['fa'] = df.apply(lambda x: GetFactorAnalysis(str(x.text)), axis=1)
df[['F{}'.format(i) for i in range(23)]] = pd.DataFrame(df.fa.values.tolist(), index=df.index)

# Label from the CRF tagger.
df['diag_act'] = df.apply(lambda x: [tagger.tag(xseq) for xseq in createDiagActFeatures(str(x.text))][0][0], axis=1)
# Reuse `mostCommonDiagAct` computed on the TRAINING data by the training
# feature cell (run that cell first). Recomputing the top-8 labels from the
# test rows could yield a different set of diag_act_* columns than the
# training table, and would derive features from test-set statistics.
diag_levels = sorted(set(mostCommonDiagAct) | {'othr'})
diag_folded = df['diag_act'].where(df['diag_act'].isin(mostCommonDiagAct), 'othr')
# A categorical with fixed levels makes get_dummies emit one column per
# level even when a level does not occur in the test rows.
df['diag_act'] = pd.Categorical(diag_folded, categories=diag_levels)
df = pd.get_dummies(df, columns=["diag_act"], prefix=["diag_act"])

# textstat statistics: computed once per row, then split into two columns
# (previously GetTextStatInfo ran twice for every row).
text_stats = [GetTextStatInfo(str(t)) for t in df['text']]
df['LE_C'] = [stats[0] for stats in text_stats]
df['TS'] = [stats[1] for stats in text_stats]

df['mispelled'] = df.apply(lambda x: GetNumberOfIncorrectSpelling(str(x.text)), axis=1)

# Encode the label: 'male' -> 1, anything else -> -1.
# NOTE: not idempotent - re-running this cell on the already encoded column
# maps every row to -1.
df['gender'] = [1 if x =='male' else -1 for x in df['gender']]

df.drop(['gf','fa','character'],axis=1).to_csv('data/test_gender.csv',index=False)
df.drop(['gf','fa','gender'],axis=1).to_csv('data/test_character.csv',index=False)

# Write the gender table to Excel with XlsxWriter as the engine. The context
# manager closes the writer (and writes the file) on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter('data/testing_data.xlsx', engine='xlsxwriter') as writer:
    df.drop(['gf','fa','character'],axis=1).to_excel(writer, sheet_name='Sheet1')


KeyError: 'col'