In [1]:
from nltk import pos_tag
from sklearn.decomposition import PCA, LatentDirichletAllocation, NMF
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.covariance import GraphicalLasso
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords, wordnet
from collections import Counter
import matplotlib.pyplot as plt
from time import time
import pickle
import csv
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

#  https://python.gotrained.com/text-classification-with-pandas-scikit/ for basic bags-of-words
stop_words_extra = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
                    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
                    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
                    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
                    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
                    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
                    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
                    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
                    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
                    'should', 'now', 'uses', 'use', 'using', 'used', 'one', 'also', 'br', 'href', 'ilink', 'whether']
stop_words = stopwords.words("english")
stop_words = stop_words + stop_words_extra



In [30]:

def preprocess(data):
    reviews_tokens = []
    for review in data:
        review = review.lower()  # Convert to lower-case words
        raw_word_tokens = re.findall(r'(?:\w+)', review, flags=re.UNICODE)  # remove pontuaction
        word_tokens = [w for w in raw_word_tokens if not w in stop_words]  # do not add stop words
        reviews_tokens.append(word_tokens)
    return reviews_tokens  # return all tokens


# def construct_bag_of_words(data):
#     corpus = preprocess(data)
#     bag_of_words = {}
#     word_count = 0
#     for sentence in corpus:
#         for word in sentence:
#             if word not in bag_of_words:  # do not allow repetitions
#                 bag_of_words[word] = word_count  # set indexes
#                 word_count += 1
#     print(dict(Counter(bag_of_words).most_common(5)))
#     return bag_of_words  # index of letters

def construct_bag_of_words_freq(data):
    corpus = preprocess(data)
    bag_of_words = {}
    for sentence in corpus:
        for word in sentence:
            if word not in bag_of_words:  # do not allow repetitions
                bag_of_words[word] = 1  # set indexes
            else:
                bag_of_words[word] = bag_of_words[word] + 1

    bag_of_words_thres = {key: val for key, val in bag_of_words.items() if val > 200}
    print("bag of word counts (filtered): ")
    print(dict(Counter(bag_of_words_thres).most_common(200)))
    bag_of_words_noun = {key: val for key, val in bag_of_words_thres.items() if pos_tag([key])[0][1] == 'NN'}
    print("bag of word counts (noun): ")
    print(dict(Counter(bag_of_words_noun).most_common(100)))
    bag_of_words_adj = {key: val for key, val in bag_of_words_thres.items() if pos_tag([key])[0][1] == 'JJ'}
    print("bag of word counts (adjective): ")
    print(dict(Counter(bag_of_words_adj).most_common(100)))

    # ---------------- CHANGE TO bag_of_words_adj IF YOU WANT ADJECTIVE ----------------#
    index = 0
    for key, val in bag_of_words_thres.items():
        bag_of_words_thres[key] = index
        index += 1

    # bag_of_words_sorted = sorted(bag_of_words_noun.items(), key=operator.itemgetter(1))
    # convert back from freq to index
    return bag_of_words_thres


def featurize(sentence_tokens, bag_of_words):
    sentence_features = [0 for x in range(len(bag_of_words))]
    for word in sentence_tokens:
        if word in bag_of_words.keys():
            index = bag_of_words[word]
            sentence_features[index] += 1
    return sentence_features


def get_batch_features(data, bag_of_words):
    batch_features = []
    reviews_text_tokens = preprocess(data)
    for review_text in reviews_text_tokens:
        feature_review_text = featurize(review_text, bag_of_words)
        batch_features.append(feature_review_text)
    return batch_features



In [3]:
from pickle_func import pickle_dump, pickle_load
import pickle
import pandas as pd
pd.__version__

'0.24.2'

In [5]:
train = pickle_load("train80.pkl")
test = pickle_load("test20.pkl")

In [21]:
train.shape

(47956, 2249)

In [15]:
type(train)
train.index.values

array([11768, 57687, 37644, ..., 59908, 25096,  4222])

In [32]:
profile_df = pd.read_csv("profiles.csv")
profile_df = profile_df.drop(['last_online'], axis=1)
essay_columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
essay = profile_df[essay_columns]

In [33]:
# Bags-of-words for essay features
essay = essay.replace(np.nan, '', regex=True)  # remove NaN
essay = essay[essay_columns].apply(lambda x: ' '.join(x), axis=1)  # concatenate essays into paragraph
essay = essay.str.replace('\d+', '')  # remove digits in the paragraph

bag_of_words = construct_bag_of_words_freq(essay)
essay_profile = get_batch_features(essay, bag_of_words)


bag of word counts (filtered): 
{'like': 136365, 'love': 121093, 'interests': 100662, 'class': 99837, 'good': 90184, 'music': 85483, 'friends': 84439, 'people': 79886, 'life': 74112, 'time': 72124, 'things': 65672, 'food': 64513, 'new': 59095, 'really': 54808, 'movies': 49534, 'get': 48644, 'know': 45993, 'work': 45515, 'want': 44714, 'books': 41222, 'family': 40707, 'think': 39993, 'going': 38257, 'anything': 38217, 'enjoy': 37585, 'much': 37545, 'go': 37217, 'fun': 34732, 'would': 34087, 'make': 34037, 'lot': 31938, 'amp': 31776, 'working': 30627, 'looking': 29768, 'always': 29388, 'pretty': 29107, 'someone': 28423, 'world': 27889, 'making': 27723, 'something': 27545, 'well': 27055, 'great': 26857, 'favorite': 26320, 'day': 24095, 'trying': 23885, 'live': 23699, 'find': 23630, 'way': 23294, 'read': 23196, 'back': 22989, 'person': 22978, 'years': 22974, 'say': 22099, 'home': 21992, 'many': 21742, 'try': 21696, 'big': 21460, 'around': 21264, 'see': 21146, 'right': 20840, 'little': 2031

In [36]:
bag_of_words.keys()



In [38]:
essay = np.asarray(essay_profile).astype(float)
essayDF = pd.DataFrame(data= essay, columns=list(bag_of_words.keys()))


In [39]:
essayDF.head

<bound method NDFrame.head of        would  love  think  kind  intellectual  either  smart  guy  dumb  say  \
0        2.0   4.0    1.0   1.0           1.0     1.0    1.0  2.0   1.0  1.0   
1        0.0   4.0    0.0   1.0           0.0     0.0    1.0  1.0   0.0  0.0   
2        0.0   1.0    1.0   1.0           1.0     0.0    0.0  0.0   0.0  1.0   
3        0.0   0.0    0.0   0.0           0.0     0.0    0.0  0.0   0.0  0.0   
4        0.0   0.0    0.0   0.0           0.0     0.0    0.0  0.0   0.0  0.0   
5        1.0   5.0    4.0   0.0           0.0     0.0    0.0  0.0   0.0  2.0   
6        1.0   4.0    0.0   0.0           0.0     0.0    0.0  0.0   0.0  0.0   
7        0.0   0.0    0.0   0.0           0.0     0.0    0.0  0.0   0.0  0.0   
8        0.0   3.0    1.0   1.0           0.0     0.0    0.0  0.0   0.0  0.0   
9        1.0   8.0    0.0   0.0           0.0     0.0    0.0  1.0   0.0  2.0   
10       1.0   6.0    1.0   0.0           0.0     1.0    0.0  0.0   0.0  0.0   
11       2

In [29]:
essay_train = essay.loc[train.index.values,:]
essay_train.shape

AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [22]:
essay_test = essay.loc[test.index.values, :]
essay_test.shape

(11990, 10)