In [None]:
import spacy # R-MINI-CONDA
import pandas as pd

### Get current working directory

### Config Variables

In [None]:
# spaCy pipeline used by the data-preparation and similarity cells below.
# The German large model is loaded here; the trailing comment names the
# English alternative.
# NOTE: later cells in this notebook rebind `nlp` to 'en_core_web_lg', so
# re-run this cell before re-running the data-preparation cells.
#inputLanguage = "english"
nlp = spacy.load('de_core_news_lg') # en_core_web_lg
# Input and output locations. Raw strings keep every backslash literal; the
# previous spelling mixed "\\" with a bare "\D", which is an invalid escape
# sequence and triggers a warning on newer Python versions.
LOADFILE = r"C:\Users\fenn\Desktop\tryPython\summarizedWords_LK.txt"
SAVEFILE = r"C:\Users\fenn\Desktop\tryPython\distanceMatrix_LK.txt"  # change according to your needs
#print(pd.read_csv(LOADFILE, delimiter="\t").iloc[:, 0])  # for debug

### Data preparation

In [None]:
# Read the tab-separated file named by LOADFILE, take its first column and
# join the entries into a single string with one space between words.
rawData = (
    pd.read_csv(LOADFILE, delimiter="\t")
    .iloc[:, 0]
    .str.cat(
        others=None,
        sep=" ",
        na_rep=None,
        join='left',
    )
)

# Run the joined text through the loaded pipeline and report the token count.
tokens = nlp(rawData)
print(len(tokens))
## Cleans up the data: keeps only tokens that are not flagged out-of-vocabulary
# Defined as a function so it can be used on multiple datasets if necessary
def cleanData(data):
    """Return the items of ``data`` that are not out-of-vocabulary.

    Args:
        data: Iterable of token-like objects exposing an ``is_oov`` attribute.

    Returns:
        A list with every item whose ``is_oov`` is falsy, in the original order.
    """
    # Iterate over the argument itself; the earlier version looped over the
    # notebook-level `tokens` variable and therefore ignored `data`.
    return [token for token in data if not token.is_oov]

# Filter the parsed tokens, then show the survivors followed by their count.
cleanTokens = cleanData(tokens)
print(cleanTokens, len(cleanTokens), sep="\n")

### Determining pairwise word similarities

In [None]:
## Builds a square table of pairwise word similarities
def calcDistanceMatrix(cleanTokens):
    """Return a DataFrame of pairwise similarities between the given tokens.

    Args:
        cleanTokens: Sequence of token-like objects providing ``similarity(other)``.

    Returns:
        A pandas DataFrame whose rows and columns are both labelled with the
        tokens; the column for a token holds its similarity to each row token.
    """
    # Start from an empty frame labelled with the tokens on both axes.
    matrix = pd.DataFrame(index=cleanTokens, columns=cleanTokens)
    for column_token in cleanTokens:
        # Fill one column at a time with this token's similarity to every token.
        matrix[column_token] = [
            column_token.similarity(row_token) for row_token in cleanTokens
        ]
    return matrix

# Note: despite the name, the frame holds the values returned by
# token.similarity(), i.e. similarity scores rather than distances.
distanceMatrixDF = calcDistanceMatrix(cleanTokens)


#### Printing the Matrix (for testing/debug only)

In [None]:
# Debug aid only: prints the similarity frame to the cell output.
print(distanceMatrixDF)

### Exporting the matrix as a .txt file to the location specified by the SAVEFILE variable

In [None]:
#export DataFrame to text file
# Render the table before opening the target, so that a failure while
# formatting cannot leave an already-truncated, empty file behind.
distMatString = distanceMatrixDF.to_string(header=True, index=True)

# 'w' overwrites an existing file of the same name and path; use 'x' instead
# to refuse overwriting. The encoding is fixed to UTF-8 so that non-ASCII
# words are written the same way on every platform instead of depending on
# the locale's default encoding.
with open(SAVEFILE, 'w', encoding='utf-8') as f:
    f.write(distMatString)

In [None]:
import spacy
# Switch the notebook-wide pipeline to the English large model.
nlp = spacy.load('en_core_web_lg')

doc = nlp("Apple is looking at buying U.K. startup for $1 billion. I like apples. Apple is a big company. New York, Red Flag, I have a red flag.")

# One output line per recognised entity: text, character span, label ID, label name.
for entity in doc.ents:
    fields = (entity.text, entity.start_char, entity.end_char, entity.label, entity.label_)
    print(*fields)

# Labels exposed by the pipeline's 'ner' component.
print(nlp.get_pipe('ner').labels)

In [None]:
import spacy
import numpy as np
# Rebinds the notebook-wide `nlp` to the English large model; most_similar()
# below reads this global at call time.
nlp = spacy.load('en_core_web_lg')

def most_similar(word, topn=5):
    """Return up to ``topn`` vocabulary entries most similar to ``word``.

    Candidates are the entries of the loaded pipeline's vocabulary that have a
    non-zero vector and the same ``is_lower`` flag as the query. Entries whose
    lower-cased text equals the query's lower-cased text are excluded.

    Args:
        word: Query word; converted with ``str()`` and looked up in ``nlp.vocab``.
        topn: Maximum number of results; values below 1 give an empty list.

    Returns:
        A list of ``(lower-cased text, similarity)`` tuples, most similar first.
    """
    lexeme = nlp.vocab[str(word)]

    # Remove the query (and its case variants) BEFORE ranking. The earlier
    # version sliced the top n+1 first and filtered afterwards, so case
    # variants of the query used up result slots, and a query without a
    # vector produced n+1 results.
    candidates = [
        other for other in lexeme.vocab
        if other.is_lower == lexeme.is_lower
        and np.count_nonzero(other.vector)
        and other.lower_ != lexeme.lower_
    ]

    # Score every candidate once and reuse the value for sorting and output.
    scored = [(other.lower_, other.similarity(lexeme)) for other in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:max(topn, 0)]

# Example query: print the 50 entries ranked most similar to "dog".
print(most_similar("dog", topn=50))

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Size of every generated graph
num_nodes = 50

# Edge-creation probability for the Erdős-Rényi graphs
p = 0.2

# Three independent random graphs built with the same parameters
networks = [nx.erdos_renyi_graph(num_nodes, p) for _ in range(3)]

# One line per network: its node degrees in ascending order
plt.figure(figsize=(10, 6))

for i, G in enumerate(networks):
    degree_values = sorted(dict(G.degree()).values())
    print(degree_values)

    plt.plot(degree_values, label=f'Network {i + 1}', marker='o')

plt.title('Degree Distributions of Multiple Networks')
plt.xlabel('Sorted Nodes')
plt.ylabel('Degree')
plt.legend()
plt.show()


# Empirical degree distribution of each network, drawn on log-log axes
plt.figure(figsize=(10, 6))

for i, G in enumerate(networks):
    degrees = dict(G.degree())
    degree_values = sorted(degrees.values())
    print(degree_values)

    # Unit-width bins from 1 up to the largest degree; density=True normalises the counts.
    # NOTE(review): the bins start at 1, so degree-0 nodes are not counted, and the
    # x positions are bin centres (k + 0.5) rather than integer degrees — confirm
    # that this is intended.
    bin_edges = range(1, max(degree_values) + 2)
    hist, bins = np.histogram(degree_values, bins=bin_edges, density=True)
    bin_centers = 0.5 * (bins[:-1] + bins[1:])

    # One marker per bin on logarithmic x and y axes
    plt.loglog(bin_centers, hist, 'o', label=f'Network {i + 1}')

plt.title('Degree Distributions of Multiple Networks')
plt.xlabel('Degree (log scale)')
plt.ylabel('Probability (log scale)')
plt.legend()
plt.show()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# Generate a scale-free network
scale_free_network = nx.scale_free_graph(1000, alpha=0.3, beta=0.2, gamma=0.5)

# Get degrees of all nodes
degrees = dict(scale_free_network.degree())

# Plot the degree distribution on a log-log scale
plt.figure(figsize=(8, 6))

# Convert degrees to a numpy array for easier manipulation
degree_values = np.array(list(degrees.values()))

# Count how many nodes fall into each of 20 equal-width degree bins
hist, bins = np.histogram(degree_values, bins=20)

# Empty bins cannot be drawn on a log axis (log(0) is undefined). Leave them
# out of the plot instead of overwriting their count with 1, which would show
# nodes that do not exist.
occupied = hist > 0

# Plot the occupied bins (x = left bin edge) on a log-log scale
plt.loglog(bins[:-1][occupied], hist[occupied], 'o', label='Degree Distribution')

plt.title('Scale-Free Network Degree Distribution')
plt.xlabel('Degree (log scale)')
plt.ylabel('Frequency (log scale)')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Three G(n, p) random graphs generated with identical parameters
networks = [nx.gnp_random_graph(100, 0.02) for _ in range(3)]

plt.figure(figsize=(10, 6))

for i, G in enumerate(networks):
    node_degrees = [degree for _, degree in G.degree()]
    node_count = float(nx.number_of_nodes(G))

    # Distinct degrees in ascending order, and the fraction of nodes having each
    degree_values = sorted(set(node_degrees))
    histogram = [node_degrees.count(k) / node_count for k in degree_values]

    # One marker per observed degree on logarithmic x and y axes
    plt.loglog(degree_values, histogram, 'o', label=f'Network {i + 1}')

plt.title('Degree Distributions of Multiple Networks')
plt.xlabel('Degree (log scale)')
plt.ylabel('Probability (log scale)')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Three Barabási–Albert graphs generated with identical parameters
networks = [nx.barabasi_albert_graph(100, 2) for _ in range(3)]

plt.figure(figsize=(10, 6))

for i, G in enumerate(networks):
    degrees = dict(G.degree())
    per_node = list(degrees.values())
    total_nodes = float(nx.number_of_nodes(G))

    # Distinct degrees in ascending order, and the share of nodes at each one
    degree_values = sorted(set(per_node))
    histogram = [per_node.count(value) / total_nodes for value in degree_values]

    # Draw the distribution with both axes logarithmic
    plt.loglog(degree_values, histogram, 'o', label=f'Network {i + 1}')

plt.title('Degree Distributions of Barabási–Albert Networks')
plt.xlabel('Degree (log scale)')
plt.ylabel('Probability (log scale)')
plt.legend()
plt.show()



In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Let's generate networks using the Barabási–Albert model
networks = [nx.barabasi_albert_graph(100, 2), nx.barabasi_albert_graph(100, 2), nx.barabasi_albert_graph(100, 2)]

# Two stacked panels: log-log axes on top, linear axes below
fig, axs = plt.subplots(2, figsize=(10, 12))

for i, G in enumerate(networks):
    degrees = dict(G.degree())
    degree_list = list(degrees.values())
    node_count = float(nx.number_of_nodes(G))

    # Distinct degrees in ascending order and the fraction of nodes at each.
    # The comprehension variable is named `k` so it no longer shadows the
    # network index `i` used in the labels.
    degree_values = sorted(set(degree_list))
    histogram = [degree_list.count(k) / node_count for k in degree_values]

    # Plot degree distribution on log-log scale
    axs[0].loglog(degree_values, histogram, 'o', label=f'Network {i + 1}')

    # Plot degree distribution on normal scale
    axs[1].plot(degree_values, histogram, 'o', label=f'Network {i + 1}')

axs[0].set_title('Degree Distributions of Barabási–Albert Networks (Log-Log Scale)')
axs[0].set_xlabel('Degree (log scale)')
axs[0].set_ylabel('Probability (log scale)')

axs[1].set_title('Degree Distributions of Barabási–Albert Networks (Normal Scale)')
axs[1].set_xlabel('Degree')
axs[1].set_ylabel('Probability')

# plt.legend() only affects the current Axes, which left one panel without a
# legend; request a legend on each panel explicitly.
for ax in axs:
    ax.legend()
plt.tight_layout()
plt.show()