In [None]:
import numpy as np
import math
from scipy import linalg
from sklearn.covariance import GraphicalLassoCV, ledoit_wolf, empirical_covariance
import matplotlib.pyplot as plt
import pandas as pd
# Raise pandas' console display limits (rows, columns, line width).
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
import warnings
# NOTE(review): `Warning` is the base class of all warning categories, so this
# filter hides every warning raised from here on (deprecations, convergence
# warnings, ...). Narrow the category if any of them should stay visible.
warnings.simplefilter(action='ignore', category=Warning)
# Colab-only: mount Google Drive under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# #############################################################################
# Read the whole CSV into memory and echo both ends of it as a sanity check.
infile = "/content/gdrive/Shared drives/4.Venetian Canals/Joined MG Files/Genus.Frqs.All.MGs.Joined.csv"
with open(infile, 'r') as f:
  data = f.read()
# The `with` block has already closed the file; no explicit close() is needed.
print('Start of file:', repr(data[:50]))
print('End of file:', repr(data[-50:]))

# One string per CSV row. Only blank trailing entries are dropped: an
# unconditional pop() would discard the last data row whenever the file does
# not end with a newline.
lines = data.split('\n')
while lines and not lines[-1].strip():
  lines.pop()

# Number of leading CSV rows (one genus per row) read in this cell.
n_samples = 10
# Value columns per row: every comma-separated field after the genus name.
# (Counting the fields, not the characters of the raw line.)
n_features = len(lines[0].split(',')) - 1 if lines else 0
# Stand-in for 1/frequency when the frequency is exactly zero; same value the
# keyword cell further down uses, so a zero entry no longer raises
# ZeroDivisionError here.
ZERO_FREQ_INVERSE = 10000

# genera[i] is the name in the first field of row i; table[i] holds the
# inverted frequencies from the remaining fields of that row.
table, genera = [], []
for line in lines[:n_samples]:  # slicing tolerates files shorter than n_samples
  fields = line.split(',')
  genera.append(fields[0])
  inverse_freqs = []
  for field in fields[1:]:
    value = float(field)
    inverse_freqs.append(1 / value if value != 0.0 else ZERO_FREQ_INVERSE)
  table.append(inverse_freqs)
print()

# Map genus name -> list of inverted frequencies; each genus becomes one
# DataFrame column. A repeated genus name keeps the values of its last row.
frame = dict(zip(genera, table))
print(len(frame), frame)

df = pd.DataFrame(frame)
print()
# Pairwise covariance between the columns of df.
covMatrix = df.cov()
# To inspect it: print(covMatrix), or
# plt.matshow(covMatrix, cmap='RdBu'); plt.show()
# (other colormaps tried: RdBu_r, spring, autumn, Reds, plasma, RdPu, Purples, jet)

# NOTE(review): despite its name, `cov` is the INVERSE of covMatrix, rescaled
# by d = sqrt(diag) on both sides so that its diagonal is 1. covMatrix is
# rescaled by the same d in the opposite direction. The name is kept because
# the plotting code below reads `cov`.
cov = linalg.inv(covMatrix)
d = np.sqrt(np.diag(cov))
cov /= d
cov /= d[:, np.newaxis]
covMatrix *= d
covMatrix *= d[:, np.newaxis]

# Standardise each column (zero mean, unit pandas std) into a NEW frame.
# `X = df` followed by in-place `-=` / `/=` would overwrite df itself, because
# both names point at the same object, and make the cell non-idempotent.
X = (df - df.mean(axis=0)) / df.std(axis=0)

# #############################################################################
# Estimate the covariance
# Rows of X are the observations, so the cross-product is normalised by
# X.shape[0]. `n_samples` is the number of CSV rows (genera) read above, which
# is the column count of X, not its row count.
emp_cov = np.dot(X.T, X) / X.shape[0]

model = GraphicalLassoCV()
model.fit(X)
covGL = model.covariance_
prec_ = model.precision_

# Ledoit-Wolf shrunk covariance and its inverse. The inverse is taken of
# lw_cov_ itself (not of the graphical-lasso covariance).
lw_cov_, lw_shrinkage_ = ledoit_wolf(X)
lw_prec_ = linalg.inv(lw_cov_)

# #############################################################################
# Plot the results: one heat-map panel per estimate, all on a shared colour scale.
plt.figure(figsize=(10, 6))
plt.subplots_adjust(left=0.02, right=0.98)

# (panel title prefix, matrix) pairs, drawn left to right.
# NOTE(review): the 'True' entry shows `cov`, which is computed above as the
# rescaled inverse of the sample covariance - confirm the label is intended.
covs = [
    ('Empirical', emp_cov),
    ('Ledoit-Wolf', lw_cov_),
    ('True', cov),
    ('GraphicalLassoCV', covGL),
]

print(genera)

# Symmetric colour limits taken from the largest graphical-lasso entry.
vmax = covGL.max()
heatmap_style = dict(interpolation='nearest', vmin=-vmax, vmax=vmax,
                     cmap=plt.cm.RdBu_r)
for panel, (name, this_cov) in enumerate(covs, start=1):
    plt.subplot(2, 4, panel)
    plt.imshow(this_cov, **heatmap_style)
    plt.xticks(())
    plt.yticks(())
    plt.title('%s covariance' % name)
plt.show()


In [None]:
### plot "keywords" with the Covar Graphical Lasso algorithm

Keywords = [ 'Gammaproteobacteria', 'Beggiatoa', 'Deltaproteobacteria', 'Desulfobacterium', 'Nitrosopumilus', 'Bacteroides',
            'Desulfatibacillum', 'Planctomyces', 'Geobacter', 'Desulfococcus' ]

# Number of leading CSV rows searched for the keywords.
n_samples = 80
# Stand-in for 1/frequency when the frequency is not positive.
ZERO_FREQ_INVERSE = 10000

# Split each candidate row once. Slicing tolerates files with fewer than
# n_samples rows, where indexing would raise IndexError.
candidate_rows = [line.split(',') for line in lines[:n_samples]]

# genera[i] is a matched row's name (first field); table[i] holds that row's
# inverted frequencies. Order: by keyword, then by position in the file.
table, genera = [], []
for keyword in Keywords:
  needle = keyword.lower()
  for fields in candidate_rows:
    name = fields[0]
    if needle not in name.lower():
      continue
    if name in genera:
      # Already collected via an earlier keyword. Adding it again would make
      # `genera` longer than the DataFrame's column list, because the dict
      # below keeps one entry per name, and misalign the later plots.
      continue
    genera.append(name)
    inverse_freqs = []
    for field in fields[1:]:
      value = float(field)
      inverse_freqs.append(1 / value if value > 0.0 else ZERO_FREQ_INVERSE)
    table.append(inverse_freqs)
print()

# Map name -> inverted frequencies; each matched row becomes one column.
frame = dict(zip(genera, table))

df = pd.DataFrame(frame)
print()
# Pairwise covariance between the columns of df.
covMatrix = df.cov()
# To inspect it: print(covMatrix), or
# plt.matshow(covMatrix, cmap='Reds'); plt.show()
# (other colormaps tried: RdBu_r, spring, autumn, Reds, plasma, RdPu, Purples, jet)

# NOTE(review): despite its name, `cov` is the INVERSE of covMatrix, rescaled
# by d = sqrt(diag) on both sides so that its diagonal is 1. covMatrix is
# rescaled by the same d in the opposite direction.
cov = linalg.inv(covMatrix)
d = np.sqrt(np.diag(cov))
cov /= d
cov /= d[:, np.newaxis]
covMatrix *= d
covMatrix *= d[:, np.newaxis]

# Standardise each column (zero mean, unit pandas std) into a NEW frame.
# `X = df` followed by in-place `-=` / `/=` would overwrite df itself, because
# both names point at the same object, and make the cell non-idempotent.
X = (df - df.mean(axis=0)) / df.std(axis=0)

# #############################################################################
# Estimate the covariance
# Rows of X are the observations, so the cross-product is normalised by
# X.shape[0]. `n_samples` (80) is only the number of CSV rows scanned for
# keywords, not the number of rows in X.
emp_cov = np.dot(X.T, X) / X.shape[0]
model = GraphicalLassoCV()
model.fit(X)
covGL = model.covariance_
# NOTE(review): this is the inverse of the graphical-lasso covariance; no
# Ledoit-Wolf estimate is computed in this cell, so the `lw_` prefix is
# misleading. Name kept in case other cells read it.
lw_prec_ = linalg.inv(covGL)

# Show the fitted covariance as a table, followed by the keyword list.
cov_df = pd.DataFrame(covGL)
print('Covariation Matrix:')
print(cov_df)
print()
print('List of Keywords:\n')
print(Keywords ,'\n')

# #############################################################################
# define the plot parameters
 
def circular_plot1(x,y,z):
  """Draw one black circle per taxon and write its name next to it.

  Parameters
  ----------
  x, y : sequences of float
      Marker coordinates, paired position-by-position with the names in the
      module-level list `genera`.
  z : any
      Accepted for backward compatibility with existing calls; not used.

  Each label is centred at 1.1 times its marker's coordinates. Both axes of
  the current figure are limited to [-1.2, 1.2].
  """
  # Markers only, spelled out as keywords: a 'kx' format string combined with
  # marker='o' defines the marker twice.
  plt.plot(x, y, linestyle='None', marker='o', color='k', markersize=7)
  # The positions come from the arguments, not from module-level lists, so
  # the function draws what it is given.
  for name, px, py in zip(genera, x, y):
    plt.annotate(name, xy = (px * 1.1, py * 1.1), xytext = (0, 0), textcoords = 'offset points', va='center', ha='center', fontsize=8)
  plt.xlim(-1.2,1.2)
  plt.ylim(-1.2,1.2)

# Plot the results
# Place the taxa evenly on the unit circle.
#   m, n   : marker x / y coordinates
#   p, q   : the same coordinates scaled by 1.1
#   coords : [name, [x, y]] per taxon
m, n, p, q = [], [], [], []
coords = []
n_taxa = len(genera)
for w, taxon in enumerate(genera):
   angle = 2 * math.pi * (w + 1) / n_taxa
   x = math.cos(angle)
   y = math.sin(angle)
   m.append(x)
   n.append(y)
   p.append(x * 1.1)
   q.append(y * 1.1)
   coords.append([taxon, [x, y]])
circular_plot1(m,n,'k+')
plt.xticks(())
plt.yticks(())

# Connect every pair of taxa once: red for positive covariance, blue for
# negative, with thickness proportional to the magnitude.
lw = 3
for a1 in range(n_taxa):
  for b1 in range(a1 + 1, n_taxa):  # skip self-pairs and the mirrored (b1, a1)
    weight = covGL[a1][b1]
    if weight == 0:
      continue  # zero width: nothing to draw
    plt.plot([m[a1], m[b1]], [n[a1], n[b1]],
             color='r' if weight > 0 else 'b',
             # abs(): a line width must not be negative; the sign is already
             # carried by the colour.
             linewidth=abs(weight) * lw)
plt.show()
print()
print('List of Keywords:\n')
print(Keywords ,'\n')

# Heat map of the graphical-lasso covariance on a symmetric colour scale.
plt.figure(figsize=(10, 6))
plt.subplots_adjust(left=0.02, right=0.98)
vmax = covGL.max()
plt.imshow(covGL, interpolation='nearest', vmin=-vmax, vmax=vmax, cmap=plt.cm.RdBu_r)
plt.xticks(())
plt.yticks(())
#plt.title('Graphical Lasso CV')
plt.show()
