# 1. Number of Clusters

In [1]:
import numpy as np
def smooth_signum(delta, eps = 0.2):
    """Sign function with a dead zone around zero.

    Parameters
    ----------
    delta : scalar or array-like
        Values to quantise. Plain Python sequences are accepted as well as
        numpy arrays.
    eps : float, default 0.2
        Half-width of the dead zone: entries with ``|delta| <= eps`` become 0.

    Returns
    -------
    numpy.ndarray
        Same shape as ``delta``; ``sign(delta)`` outside the dead zone and 0
        inside it.
    """
    # np.asarray lets lists/tuples through; the builtin abs() raises TypeError
    # on them.
    delta = np.asarray(delta)
    return np.where(np.abs(delta) <= eps, 0, np.sign(delta))

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale, MaxAbsScaler

# Load both conditions and order the rows by subject ID.
# `sep` is passed by keyword: pandas 2 no longer accepts it positionally.
calm = pd.read_csv('calm.csv', sep=';').sort_values(by = ['ID'])
frust = pd.read_csv('frustration.csv', sep=';').sort_values(by = ['ID'])

# The row-wise subtraction below is only meaningful when row k of both tables
# belongs to the same subject, so fail loudly if the ID columns differ.
if calm['ID'].tolist() != frust['ID'].tolist():
    raise ValueError('calm.csv and frustration.csv do not contain the same IDs')

# Drop the identifier so that only feature columns remain.
keys = ['ID']
calm, frust = calm.drop(columns=keys), frust.drop(columns=keys)
keys = calm.columns
# Indexing both frames with the same column list aligns their column order.
calm, frust = calm[keys].values, frust[keys].values

# Per-subject change, scaled column-wise by its maximum absolute value and
# then quantised to {-1, 0, 1} with a dead zone of 0.2.
delta = (frust-calm)
scaled_delta = MaxAbsScaler().fit_transform(delta)
delta = smooth_signum(scaled_delta, eps = 0.2)

In [120]:
# Score Function
def d(x):
    """Return the 2-norm of ``x`` divided by the length of its first axis."""
    magnitude = np.linalg.norm(x, ord=2)
    n_entries = x.shape[0]
    return magnitude / n_entries
def bcd(X, y):
    """Between-cluster distance term of the score function.

    Sums ``d(c_k - c) * n_k`` over the clusters, where ``c_k`` and ``n_k`` are
    the centroid and size of cluster ``k`` and ``c`` is the centroid of all
    samples, then divides by ``K * sqrt(n_last)``. ``K`` is the number of
    labels in ``min(y)..max(y)`` and ``n_last`` is the size of the cluster
    with the largest label.

    Parameters
    ----------
    X : array-like, one row per sample
    y : array-like of int cluster labels, one per row of ``X``

    Returns
    -------
    float
    """
    X = np.asarray(X)
    y = np.asarray(y)
    classes = range(int(y.min()), int(y.max()) + 1)
    c_X = X.mean(axis = 0)
    s = 0
    for y_ in classes:
        X_i = X[y == y_]
        # A label inside the range may be unused (e.g. an empty mixture
        # component). Its weight n_k is 0, so skip it instead of taking the
        # mean of an empty array, which turned the whole score into NaN.
        if X_i.shape[0] == 0:
            continue
        c_i = X_i.mean(axis=0)
        s += d(c_i - c_X)*X_i.shape[0]
    # X_i now holds the cluster with the largest label, which is never empty.
    # NOTE(review): normalising by the size of that particular cluster looks
    # accidental (X.shape[0] may have been intended) - kept unchanged so that
    # existing scores stay comparable; please confirm.
    return s/(len(classes)*np.sqrt(X_i.shape[0]))
def wcd(X, y):
    """Within-cluster distance term of the score function.

    For every cluster, averages ``d(x - c_k)`` over its members ``x`` (``c_k``
    is the cluster centroid) and returns the sum of these per-cluster averages.

    Parameters
    ----------
    X : array-like, one row per sample
    y : array-like of int cluster labels, one per row of ``X``

    Returns
    -------
    float
    """
    X = np.asarray(X)
    y = np.asarray(y)
    s = 0
    for y_ in range(int(y.min()), int(y.max()) + 1):
        X_i = X[y == y_]
        # An unused label has no members and therefore no spread; the
        # unguarded average ended in 0/0 and raised ZeroDivisionError.
        if X_i.shape[0] == 0:
            continue
        c_i = X_i.mean(axis=0)
        s += sum(d(row - c_i) for row in X_i)/X_i.shape[0]
    return s
def balance(y):
    """Imbalance of a labelling.

    Counts the members of every label in ``min(y)..max(y)`` and returns
    ``(largest count - smallest count) / len(y) * number of labels``.
    """
    labels = range(min(y), max(y) + 1)
    counts = []
    for label in labels:
        counts.append(sum(1 for member in y if member == label))
    spread = max(counts) - min(counts)
    return spread / len(y) * len(labels)
def SF(X, y, alpha = 1):
    """Score of a clustering; larger is better.

    Computes ``1 - exp(-exp(bcd - wcd)) + alpha * exp(-balance)``.

    Parameters
    ----------
    X : array-like, one row per sample
    y : array-like of int cluster labels, one per row of ``X``
    alpha : float, default 1
        Weight of the balance term (previously hard-coded to 1).

    Returns
    -------
    float
    """
    separation = bcd(X, y) - wcd(X, y)
    # exp(-exp(z)) equals 1/exp(exp(z)) but does not overflow in the
    # intermediate result for moderately large z.
    return 1 - np.exp(-np.exp(separation)) + alpha * np.exp(-balance(y))

In [135]:
# Shared settings for the cluster-number sweep below.
res = {}             # algorithm name -> list of scores, one per cluster count
gen = range(1, 10)   # candidate numbers of clusters
init = 10000         # value passed as n_init to KMeans / GaussianMixture

In [136]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Cluster the 10-component PCA projection of the quantised deltas once for
# every candidate cluster count in `gen`.
delta_ = PCA(10).fit_transform(delta)
res_y = []
for i in gen:
    # A fixed random_state makes the labelling reproducible across kernel
    # restarts; without it every run may score differently.
    y = KMeans(n_clusters=i, n_init = init, random_state = 0).fit_predict(delta_)
    res_y.append(y)

In [137]:
# Score each KMeans labelling (one per cluster count) with the score function.
cur_res = [SF(delta_, labels) for labels in res_y]
res['KMeans'] = cur_res

In [138]:
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Gaussian mixture with one covariance matrix shared by all components,
# fitted on the 10-component PCA projection for every cluster count in `gen`.
delta_ = PCA(10).fit_transform(delta)
res_y = []
CT = 'tied'
for i in gen:
    # Fixed random_state: reproducible initialisation across kernel restarts.
    y = GaussianMixture(n_components=i, n_init = init, covariance_type = CT,
                        random_state = 0).fit_predict(delta_)
    res_y.append(y)

In [139]:
# Score each mixture labelling; the result key carries the covariance type.
cur_res = [SF(delta_, labels) for labels in res_y]
res['Gauss_'+CT] = cur_res

In [140]:
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Gaussian mixture with a diagonal covariance per component, fitted on the
# 10-component PCA projection for every cluster count in `gen`.
delta_ = PCA(10).fit_transform(delta)
res_y = []
CT = 'diag'
for i in gen:
    # Fixed random_state: reproducible initialisation across kernel restarts.
    y = GaussianMixture(n_components=i, n_init = init, covariance_type = CT,
                        random_state = 0).fit_predict(delta_)
    res_y.append(y)

In [141]:
# Score each mixture labelling; the result key carries the covariance type.
cur_res = [SF(delta_, labels) for labels in res_y]
res['Gauss_'+CT] = cur_res

In [142]:
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Gaussian mixture with a single variance per component, fitted on the
# 10-component PCA projection for every cluster count in `gen`.
delta_ = PCA(10).fit_transform(delta)
res_y = []
CT = 'spherical'
for i in gen:
    # Fixed random_state: reproducible initialisation across kernel restarts.
    y = GaussianMixture(n_components=i, n_init = init, covariance_type = CT,
                        random_state = 0).fit_predict(delta_)
    res_y.append(y)

In [143]:
# Score each mixture labelling; the result key carries the covariance type.
cur_res = [SF(delta_, labels) for labels in res_y]
res['Gauss_'+CT] = cur_res

In [144]:
import inspect
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# Agglomerative clustering (default linkage) with Euclidean distance on the
# 10-component PCA projection, for every cluster count in `gen`.
delta_ = PCA(10).fit_transform(delta)
res_y = []
dist = 'euclidean'
# scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4 dropped
# the old name, so use whichever keyword the installed version accepts.
dist_kwarg = ('metric'
              if 'metric' in inspect.signature(AgglomerativeClustering).parameters
              else 'affinity')
for i in gen:
    y = AgglomerativeClustering(n_clusters=i, **{dist_kwarg: dist}).fit_predict(delta_)
    res_y.append(y)

In [145]:
# Score each agglomerative labelling; the result key carries the distance name.
cur_res = [SF(delta_, labels) for labels in res_y]
res['AgglomerativeClustering_' + dist] = cur_res

In [146]:
import inspect
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# Average-linkage agglomerative clustering with L1 distance on the
# 10-component PCA projection, for every cluster count in `gen`.
delta_ = PCA(10).fit_transform(delta)
res_y = []
dist = 'l1'
# scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4 dropped
# the old name, so use whichever keyword the installed version accepts.
dist_kwarg = ('metric'
              if 'metric' in inspect.signature(AgglomerativeClustering).parameters
              else 'affinity')
for i in gen:
    y = AgglomerativeClustering(n_clusters=i, linkage = 'average',
                                **{dist_kwarg: dist}).fit_predict(delta_)
    res_y.append(y)

In [147]:
# Score each agglomerative labelling; the result key carries the distance name.
cur_res = [SF(delta_, labels) for labels in res_y]
res['AgglomerativeClustering_' + dist] = cur_res

In [186]:
import matplotlib.pylab as plt
%matplotlib notebook
keys = res.keys()
for key in keys:
    plt.plot([i for i in gen], res[key])
plt.grid()
plt.legend(keys, loc='lower left')
plt.xlabel('Number Of Clusters')
plt.ylabel('Metric Value')
plt.savefig('Images/clusters_number.pdf')

<IPython.core.display.Javascript object>

# 2. Two Clusters

In [187]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale,  MaxAbsScaler

# Load both conditions and order the rows by subject ID.
# `sep` is passed by keyword: pandas 2 no longer accepts it positionally.
calm = pd.read_csv('calm.csv', sep=';').sort_values(by = ['ID'])
frust = pd.read_csv('frustration.csv', sep=';').sort_values(by = ['ID'])

# The row-wise subtraction below is only meaningful when row k of both tables
# belongs to the same subject, so fail loudly if the ID columns differ.
if calm['ID'].tolist() != frust['ID'].tolist():
    raise ValueError('calm.csv and frustration.csv do not contain the same IDs')

# Columns removed before the analysis. To leave out the sentiment features as
# well, extend this list with every column whose name contains 'Тональность'.
keys = ['ID']
calm, frust = calm.drop(columns=keys), frust.drop(columns=keys)
keys = calm.columns
# Index frust with calm's column list so both arrays share one column order
# (the first load cell of the notebook does the same).
calm, frust = calm[keys].values, frust[keys].values

# Per-subject change, scaled column-wise by its maximum absolute value and
# then quantised to {-1, 0, 1} with the default dead zone of smooth_signum.
delta = (frust-calm)
scaled_delta = MaxAbsScaler().fit_transform(delta)
delta = smooth_signum(scaled_delta)

In [188]:
# Sub-folder of 'Images/' that the savefig calls of this section write into.
folder_results = 'Signum/'

## Clustering

In [189]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
delta_ = PCA(2).fit_transform(delta)
y = KMeans(n_clusters=2, n_init = 10000).fit_predict(PCA(10).fit_transform(delta))

import matplotlib.pylab as plt
%matplotlib notebook
classes = [i for i in range(min(y), max(y)+1)]
D = list()
for y_ in classes:
    delta_1 = delta_[[ind for ind,i in enumerate(delta_) if y[ind]==y_]]
    print(y_, len(delta_1))
for y_ in classes:
    delta_1 = delta_[[ind for ind,i in enumerate(delta_) if y[ind]==y_]]
    D.append(scaled_delta[[ind for ind,i in enumerate(delta_) if y[ind]==y_]].mean(axis = 0))
    plt.plot(delta_1[:,0], delta_1[:,1], 'o')
plt.legend(classes)
plt.savefig('Images/' + folder_results +'Cluster.pdf')

0 48
1 53


<IPython.core.display.Javascript object>

## The Most Different Features

In [190]:
cluster_delta = list(np.absolute(D[0]- D[1])**2)
cluster_delta = [(i, ind) for ind, i in enumerate(cluster_delta)]
cluster_delta.sort(key = lambda x: x[0])
import matplotlib.pyplot as plt
% matplotlib notebook
plt.plot([i for i in range(len(keys))], [i[0] for i in cluster_delta])
plt.title('Differences between mean feature values for clusters')
plt.xlabel('Sorted Keys Number')
plt.ylabel('Difference Value')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Difference Value')

In [191]:
# Keep the features whose squared gap exceeds 70 % of the largest gap.
m = 0.7*max(np.absolute(D[0]- D[1])**2)
main_keys = [pair[1] for pair in cluster_delta if pair[0] > m]
# `ind` is the cursor used by the histogram cells. The printing loop uses its
# own variable so that `ind` really is 0 when this cell finishes.
ind = 0
print('Number of Main Keys',len(main_keys))
for key_index in main_keys:
    print(keys[key_index])

Number of Main Keys 2
Коэффициент Трейгера
Коэффициент опредмеченности действия (кол. глаголов / кол. существительных)


In [195]:
def f(delta_, delta, n = 11):
    """Pair every column of ``delta_`` with histogram bin edges on [-1, 1].

    Parameters
    ----------
    delta_ : 2-D numpy array, one column per feature
    delta : unused; kept so that existing calls keep working
    n : int, default 11
        Number of bin edges.

    Returns
    -------
    dict
        Maps each column index to ``(column values, np.linspace(-1, 1, n))``.
    """
    # The column-wise max/min of `delta` used to be computed here but was
    # never read, so it is gone.
    return {col: (delta_[:, col], np.linspace(-1, 1, n))
            for col in range(delta_.shape[1])}
# For every cluster, collect its rows of scaled_delta and wrap each feature
# column together with the histogram bin edges.
D = []
for y_ in classes:
    delta_1 = scaled_delta[np.asarray(y) == y_]
    D.append(f(delta_1, scaled_delta))
# Cursor into main_keys for the histogram cells that follow.
ind = 0

In [196]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([D[0][main_keys[ind]][0], D[1][main_keys[ind]][0]],
         D[0][main_keys[ind]][1], 
         label=['The Fisrt Cluster', 'The Second Cluster'], 
         density = True)
plt.xticks(list(np.linspace(-1, 1, 11)))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

0 Коэффициент Трейгера


<IPython.core.display.Javascript object>

In [197]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([D[0][main_keys[ind]][0], D[1][main_keys[ind]][0]],
         D[0][main_keys[ind]][1], 
         label=['The Fisrt Cluster', 'The Second Cluster'], 
         density = True)
plt.xticks(list(np.linspace(-1, 1, 11)))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

1 Коэффициент опредмеченности действия (кол. глаголов / кол. существительных)


<IPython.core.display.Javascript object>

# 3. Many Clusters

In [159]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale, MaxAbsScaler

# Load both conditions and order the rows by subject ID.
# `sep` is passed by keyword: pandas 2 no longer accepts it positionally.
calm = pd.read_csv('calm.csv', sep=';').sort_values(by = ['ID'])
frust = pd.read_csv('frustration.csv', sep=';').sort_values(by = ['ID'])

# The row-wise subtraction below is only meaningful when row k of both tables
# belongs to the same subject, so fail loudly if the ID columns differ.
if calm['ID'].tolist() != frust['ID'].tolist():
    raise ValueError('calm.csv and frustration.csv do not contain the same IDs')

# Columns removed before the analysis. To leave out the sentiment features as
# well, extend this list with every column whose name contains 'Тональность'.
keys = ['ID']
calm, frust = calm.drop(columns=keys), frust.drop(columns=keys)
keys = calm.columns
# Index frust with calm's column list so both arrays share one column order
# (the first load cell of the notebook does the same).
calm, frust = calm[keys].values, frust[keys].values

# Per-subject change, scaled column-wise by its maximum absolute value and
# then quantised to {-1, 0, 1} with a dead zone of 0.2.
delta = (frust-calm)
scaled_delta = MaxAbsScaler().fit_transform(delta)
delta = smooth_signum(scaled_delta, eps = 0.2)

In [160]:
# Sub-folder of 'Images/' that the savefig calls of this section write into.
folder_results = "ManyClusters/"

## Clustering

In [161]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
delta_ = PCA(2).fit_transform(delta)
y = KMeans(n_clusters=4, n_init = 10000).fit_predict(PCA(10).fit_transform(delta))
import matplotlib.pylab as plt
%matplotlib notebook
classes = [i for i in range(min(y), max(y)+1)]
D = list()
for y_ in classes:
    delta_1 = delta_[[ind for ind,i in enumerate(delta_) if y[ind]==y_]]
    print(y_, len(delta_1))
for y_ in classes:
    delta_1 = delta_[[ind for ind,i in enumerate(delta_) if y[ind]==y_]]
    D.append(delta[[ind for ind,i in enumerate(delta_) if y[ind]==y_]].mean(axis = 0))
    plt.plot(delta_1[:,0], delta_1[:,1], 'o')
plt.legend(classes)
plt.savefig('Images/' + folder_results +'Cluster.pdf')

0 24
1 40
2 18
3 19


<IPython.core.display.Javascript object>

In [175]:
# Scatter plots of the clusters in every pair of the first four principal
# components, one pair per panel.
fig, axs = plt.subplots(2, 3)
axs = list(axs.ravel())
# The 4-D projection gets its own name: overwriting `delta_`, `ind` or `_`
# here would change what the other cells of the section see.
delta_4d = PCA(4).fit_transform(delta)
axes = [(0,1), (0,2), (0,3), (1,2), (1,3), (2,3)]
for ax, (i, j) in zip(axs, axes):
    for y_ in classes:
        delta_1 = delta_4d[y == y_]
        ax.plot(delta_1[:,i], delta_1[:,j], 'o')
    # Label the panels so each one can be read on its own.
    ax.set_xlabel('PC ' + str(i + 1))
    ax.set_ylabel('PC ' + str(j + 1))
fig.tight_layout()
plt.savefig('Images/' + folder_results +'Cluster_.pdf')

<IPython.core.display.Javascript object>

## The Most Different Features

In [163]:
def get_max(D):
    """Element-wise maximum of ``(a - b)**2`` over every ordered pair of ``D``.

    Pairs of an element with itself are included, so a single-element ``D``
    yields zeros.
    """
    squared_gaps = np.array([(a - b) ** 2 for a in D for b in D])
    return squared_gaps.max(axis=0)
cluster_delta = get_max(D)
cluster_delta = [(i, ind) for ind, i in enumerate(cluster_delta)]
cluster_delta.sort(key = lambda x: x[0])
import matplotlib.pyplot as plt
% matplotlib notebook
plt.plot([i for i in range(len(keys))], [i[0] for i in cluster_delta])
plt.title('Differences between mean feature values for clusters')
plt.xlabel('Sorted Keys Number')
plt.ylabel('Difference Value')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Difference Value')

In [164]:
# Keep the features whose largest squared gap exceeds 60 % of the maximum.
m = 0.6 * max(gap for gap, column in cluster_delta)
main_keys = [column for gap, column in cluster_delta if gap > m]
# Cursor into main_keys for the histogram cells.
ind = 0
print('Number of Main Keys',len(main_keys))
keys[main_keys]

Number of Main Keys 8


Index(['Доля глаголов 1 лица', 'Сем. связь: QNT', 'Сем. роль: адресат',
       'Число знаков пунктуации / Число слов',
       'Средняя длина слов (в количестве символов)',
       'Доля глаголов прошедшего времени, первого лица, единственного числа',
       'Коэффициент Трейгера',
       'Словарь: Лексика положительной рациональной оценки и ментальных действий'],
      dtype='object')

In [165]:
def f(delta_, delta, n = 11):
    """Pair every column of ``delta_`` with histogram bin edges on [-1, 1].

    Parameters
    ----------
    delta_ : 2-D numpy array, one column per feature
    delta : unused; kept so that existing calls keep working
    n : int, default 11
        Number of bin edges.

    Returns
    -------
    dict
        Maps each column index to ``(column values, np.linspace(-1, 1, n))``.
    """
    # The column-wise max/min of `delta` used to be computed here but was
    # never read, so it is gone.
    return {col: (delta_[:, col], np.linspace(-1, 1, n))
            for col in range(delta_.shape[1])}
# For every cluster, collect its rows of scaled_delta and wrap each feature
# column together with the histogram bin edges.
D = []
for y_ in classes:
    delta_1 = scaled_delta[np.asarray(y) == y_]
    D.append(f(delta_1, frust-calm))
# Cursor into main_keys for the histogram cells that follow.
ind = 0

In [166]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

0 Доля глаголов 1 лица


<IPython.core.display.Javascript object>

In [167]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

1 Сем. связь: QNT


<IPython.core.display.Javascript object>

In [168]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

2 Сем. роль: адресат


<IPython.core.display.Javascript object>

In [169]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

3 Число знаков пунктуации / Число слов


<IPython.core.display.Javascript object>

In [170]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

4 Средняя длина слов (в количестве символов)


<IPython.core.display.Javascript object>

In [171]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

5 Доля глаголов прошедшего времени, первого лица, единственного числа


<IPython.core.display.Javascript object>

In [172]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

6 Коэффициент Трейгера


<IPython.core.display.Javascript object>

In [173]:
import numpy
from matplotlib import pyplot as plt
% matplotlib notebook
print(ind, keys[main_keys[ind]])
plt.hist([i[main_keys[ind]][0] for i in D],
         D[0][main_keys[ind]][1], 
         label=classes, 
         density = True)
plt.xticks(np.linspace(-1,1, 11))
plt.grid()
plt.xlabel('Scaled Feature Value')
plt.ylabel('Normed Number')
plt.legend(loc='upper right')
plt.show()
ind+= 1
plt.savefig('Images/' + folder_results +str(ind) + '.pdf')

7 Словарь: Лексика положительной рациональной оценки и ментальных действий


<IPython.core.display.Javascript object>