In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import scipy
from sklearn.decomposition import TruncatedSVD, PCA, NMF, LatentDirichletAllocation
import h5py
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

### Open file

Either encoding can be opened here: the BOW encoding (`embedded_bow.h5`) or the TF-IDF encoding (`embedded_tfidf.h5`).

In [2]:
# The data file is resolved relative to the current working directory.
path = os.getcwd()
filename = '/data/embedded_tfidf.h5'  # embedded_bow.h5 - BOW encoding, embedded_tfidf.h5 - TFIDF

# The `with` block closes the file on exit, so no explicit close() is needed.
with h5py.File(path + filename, 'r') as h5file:
    # [:] reads each dataset fully into memory, so the arrays stay usable
    # after the file has been closed.
    X_train = h5file['train'][:]
    Y_train = h5file['target'][:]

print(X_train.shape)
print(Y_train.shape)

(10868, 10861)
(10868,)


### Dimensionality reduction

Using PCA and truncated SVD. NMF and Latent Dirichlet Allocation (LDA) are computationally expensive and do not improve performance, so they are left commented out below.

In [3]:
# SVD
# Reduce the encoded features to 100 truncated-SVD components; the fixed
# random_state keeps the randomized solver repeatable between runs.
# NOTE(review): the decomposition is fitted on every row of X_train, i.e.
# before train_model() makes its train/test split, so the held-out rows
# also shape the projection — confirm this is acceptable for the evaluation.
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_SVD = svd.fit_transform(X_train)
X_train_SVD.shape

(10868, 100)

In [4]:
# PCA
# Reduce the encoded features to 100 principal components, with the same
# component count and seed as the SVD cell so the two are comparable.
# NOTE(review): the decomposition is fitted on every row of X_train, i.e.
# before train_model() makes its train/test split, so the held-out rows
# also shape the projection — confirm this is acceptable for the evaluation.
pca = PCA(n_components=100, random_state=42)
X_train_PCA = pca.fit_transform(X_train)
X_train_PCA.shape

(10868, 100)

In [5]:
# NMF
# nmf = NMF(n_components=10, random_state=42)
# X_train_NMF = nmf.fit_transform(X_train)
# X_train_NMF.shape

In [6]:
# LDA
# LDir = LatentDirichletAllocation(n_components=10, random_state=42)
# X_train_LDA = LDir.fit_transform(X_train)
# X_train_LDA.shape


In [7]:
def train_model(X, Y, model, test_size=0.15, random_state=42):
    """Fit ``model`` on a train split of ``(X, Y)`` and print hold-out metrics.

    The data is split once with ``train_test_split``; the model is fitted on
    the training part and scored on the held-out part. ROC AUC and accuracy
    are printed, nothing is returned.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Labels aligned with the rows of ``X``. The AUC is computed from the
        second column of ``predict_proba``, so a two-class target is assumed.
    model : estimator
        Object exposing ``fit``, ``predict_proba`` and ``predict``. It is
        fitted in place.
    test_size : float, default 0.15
        Share of the samples held out for scoring.
    random_state : int, default 42
        Seed for the split, so repeated calls score on the same hold-out set.

    Returns
    -------
    None
    """
    # Distinct local names: the notebook already has a module-level X_train,
    # and shadowing it here makes the function harder to read.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    model.fit(X_fit, y_fit)

    # AUC needs scores rather than hard labels: use the probability of the
    # second class.
    holdout_proba = model.predict_proba(X_holdout)
    print('AUC', roc_auc_score(y_holdout, holdout_proba[:, 1]))

    holdout_labels = model.predict(X_holdout)
    print('Accuracy', accuracy_score(y_holdout, holdout_labels))
    

### Checking the performance with Logistic Regression and Linear Discriminant Analysis

In [8]:
# Fresh, default-configured classifiers for this comparison.
lr, lda = LR(), LDA()

print('SVD decomposition')
# Logistic regression first, then linear discriminant analysis, both on the
# SVD-reduced features.
train_model(X=X_train_SVD, Y=Y_train, model=lr)
print('---------')
train_model(X=X_train_SVD, Y=Y_train, model=lda)

SVD decomposition
AUC 0.9121861281826162
Accuracy 0.8467198038013488
---------
AUC 0.9089103502097355
Accuracy 0.8626609442060086


In [9]:
# Fresh, default-configured classifiers for this comparison.
lr, lda = LR(), LDA()

print('PCA decomposition')
# Logistic regression first, then linear discriminant analysis, both on the
# PCA-reduced features.
train_model(X=X_train_PCA, Y=Y_train, model=lr)
print('---------')
train_model(X=X_train_PCA, Y=Y_train, model=lda)

PCA decomposition
AUC 0.9121022339283973
Accuracy 0.8467198038013488
---------
AUC 0.9078548434299092
Accuracy 0.8608215818516247


In [10]:
# lda = LDA()
# lr = LR()
# 
# train_model(X_train_NMF, Y_train, lr)
# print('---------')
# train_model(X_train_NMF, Y_train, lda)

In [11]:
# lda = LDA()
# lr = LR()

# train_model(X_train_LDA, Y_train, lr)
# print('---------')
# train_model(X_train_LDA, Y_train, lda)

### GMM - based feature extractor

In [12]:
# this code is taken from https://gist.github.com/danoneata/9927923
def to_fv(row, model=None):
    """Compute the Fisher-vector encoding of ``row`` under a fitted mixture.

    Parameters
    ----------
    row : array-like
        A single sample (1-D) or a set of descriptors (2-D, one per row).
        1-D input is promoted to a one-row matrix.
    model : fitted mixture model, optional
        Object exposing ``predict_proba``, ``weights_``, ``means_`` and
        ``covariances_``. When omitted, the module-level ``gmm`` is used, so
        existing ``to_fv(row)`` calls keep working. ``covariances_`` must
        broadcast against ``means_``, as it does for the diagonal-covariance
        mixture fitted in this notebook.

    Returns
    -------
    numpy.ndarray
        1-D vector concatenating the derivatives with respect to the mixing
        weights (K values), the means (K*D values) and the variances
        (K*D values), for K components and D input dimensions.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-level mixture. Reading
        # a global needs no `global` statement.
        model = gmm

    xx = np.atleast_2d(row)
    N = xx.shape[0]

    # Compute posterior probabilities
    Q = model.predict_proba(xx)  # NxK

    # Compute the sufficient statistics of descriptors
    Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
    Q_xx = np.dot(Q.T, xx) / N
    Q_xx_2 = np.dot(Q.T, xx ** 2) / N

    # Compute derivatives with respect to
    # mixing weights, means and variances
    d_pi = Q_sum.squeeze() - model.weights_
    d_mu = Q_xx - Q_sum * model.means_
    d_sigma = (
        - Q_xx_2
        - Q_sum * model.means_ ** 2
        + Q_sum * model.covariances_
        + 2 * Q_xx * model.means_
    )

    # Merge derivatives into a vector.
    return np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))

The assumption here is that the data come from a mixture of two Gaussian distributions.

In [13]:
K = 2  # number of GMM components
# Seeded like the SVD and PCA steps: the mixture's initialisation is random,
# so without a seed the extracted features (and the file saved from them)
# change from run to run.
gmm = GaussianMixture(n_components=K, covariance_type='diag', reg_covar=1e-4, random_state=42)
gmm.fit(X_train_SVD)

# One Fisher vector per sample; to_fv reads the fitted module-level `gmm`.
X_train_GMM = np.array([to_fv(sample) for sample in X_train_SVD])
X_train_GMM.shape

(10868, 402)

In [14]:
# Fresh, default-configured classifiers for this comparison.
lr, lda = LR(), LDA()

print('GMM features')
# Logistic regression first, then linear discriminant analysis, both on the
# GMM-derived features.
train_model(X=X_train_GMM, Y=Y_train, model=lr)
print('---------')
train_model(X=X_train_GMM, Y=Y_train, model=lda)

GMM features
AUC 0.9063779143498196
Accuracy 0.8393623543838136
---------
AUC 0.9080772607550482
Accuracy 0.8583690987124464




### Save data

Any of `X_train_SVD`, `X_train_PCA` or `X_train_GMM` can be saved here under an appropriate file name.

In [15]:
# The `with` block closes the file on exit, so no explicit close() is needed.
with h5py.File('data/decomposed_tfidf_GMM.h5', 'w') as h5file:  # decomposed_BOW_{decomposition}.h5 if BOW encoding
    h5file.create_dataset('train', data=X_train_GMM)
    h5file.create_dataset('target', data=Y_train)