In [1]:
%%javascript
// Replace the output area's `_should_scroll` hook with a function that always
// returns false, regardless of the `lines` argument.
// NOTE(review): presumably this stops long cell outputs from being collapsed
// into a scroll box in the classic notebook UI - confirm against the front-end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

<hr style="border-top: 5px solid black;">

<h1>Import Packages</h1>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<hr style="border-top: 5px solid black;">

<h1>Import Data</h1>

In [3]:
# Read the first worksheet of the authors workbook into a DataFrame.
file = "authors_data.xlsx"
df = pd.read_excel(io=file, sheet_name=0)

<hr style="border-top: 5px solid black;">

<h1>Inspect Data</h1>

In [None]:
# Preview the first rows of the loaded sheet.
df.head()

In [None]:
# Count the non-null Title entries for every (ID, Author) pair and show the
# result as a one-column frame.
authors = (
    df.groupby(by=["ID", "Author"])["Title"]
    .count()
    .to_frame()
)
authors

<hr style="border-top: 5px solid black;">

<h1>Word Clouds from Dataframe</h1>

In [None]:
# Visualize wordcloud
from wordcloud import WordCloud, STOPWORDS

# Import Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
# Fetch the NLTK resources used further down (already-cached packages are skipped).
#  - 'punkt' / 'punkt_tab': tokenizer models for word_tokenize; newer NLTK
#    releases look up 'punkt_tab' instead of 'punkt'.
#  - 'wordnet' / 'omw-1.4': data for WordNetLemmatizer; some NLTK releases
#    also require 'omw-1.4' when WordNet is loaded.
#  - 'stopwords': backs stopwords.words('english').
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(nltk_resource)

<hr style="border-top: 2px solid black;">

# Group All Titles by Author ID

 - Drop Duplicates
 - Group by and combine titles by author

In [None]:
# Keep a single row per distinct Title; when a title repeats, the last
# occurrence in the sheet is the one retained.
author_titles_df = df.drop_duplicates(subset="Title", keep="last")
author_titles_df.head()

In [None]:
# Collapse each author's titles into one space-separated string,
# keeping ID and Author as regular columns.
author_titles = (
    author_titles_df
    .groupby(['ID', 'Author'], as_index=False)
    .agg({'Title': ' '.join})
)
author_titles

<hr style="border-top: 2px solid black;">

# Word Tokenization

 - Action: Return a tokenized copy of string
 - word_tokenize(string)
 - Documentation: https://www.nltk.org/_modules/nltk/tokenize.html

In [None]:
# One token list per author: tokenize each author's combined title string.
word_tokens = [word_tokenize(title_text) for title_text in author_titles["Title"]]

In [None]:
# Drop every token that is not purely alphanumeric (punctuation, hyphenated
# forms, etc.), preserving the per-author list structure.
cleaned_tokens = [
    [token for token in author_tokens if token.isalnum()]
    for author_tokens in word_tokens
]

In [None]:
# Attach the cleaned token lists to author_titles as a new "tokens" column
# (one list per row, in the same order the rows were tokenized).
author_titles["tokens"] = cleaned_tokens

<hr style="border-top: 2px solid black;">

# Word Token Lemmatizing

 - Action: Lemmatization is the process of converting a word to its base form.
 - WNlemmatizer = WordNetLemmatizer()
 - lemmatized = WNlemmatizer.lemmatize(token)
 - Documentation: https://www.nltk.org/_modules/nltk/stem/wordnet.html

In [None]:
# Lemmatize every cleaned token, producing one list of lemmas per author row.
#
# Iterates the column directly: Series.iteritems() was removed in pandas 2.0
# and the index was never used.
# Tokens are lower-cased BEFORE lemmatizing because the WordNet lookup is
# case-sensitive; a capitalised title word such as "Studies" would otherwise
# pass through unchanged and only be lower-cased afterwards.
WNlemmatizer = WordNetLemmatizer()
lemm_list = [
    [WNlemmatizer.lemmatize(token.lower()) for token in tokens]
    for tokens in author_titles["tokens"]
]

In [None]:
# Store the lemmatized token lists next to the raw tokens; the word-cloud
# cells below read this "lemm_tokens" column.
author_titles["lemm_tokens"] = lemm_list

<hr style="border-top: 2px solid black;">

# WordCloud Analysis

 - Reveals the essential (most frequent) terms in the text
 - Provides an overall sense of the text
 - Easy to grasp and engaging
 - wordcloud = WordCloud().generate(text)
 - Documentation: https://amueller.github.io/word_cloud/

## Step 1: Filter Text Using Custom Stop Word List

 - Combine stop words from wordcloud, nltk, and custom list

In [None]:
# Build one combined stop-word set from three sources:
# the wordcloud package defaults, NLTK's English list, and a hand-picked list.
wc_sw = STOPWORDS
custom_sps = {
    'made', 'nothing', 'able', 'given', 'wish', 'willing', 'wa', 'due', 'ha',
    'did', 'etc', 'use', 'really', 'felt', 'personally', 'also', 'thing',
    'well', 'little', 'got', 'one', 'lot', 'way', 'jus', 'sure',
}
nltk_english_sw = set(stopwords.words('english'))
cust_nltk_sw = nltk_english_sw.union(custom_sps, wc_sw)
print(f'The number of stop words in custom list is: {len(cust_nltk_sw)}')

## Step 2: Visualize a WordCloud of the Titles for each Author in the dataframe

### Step 2a: Create function that converts tokenized text into a single string

In [None]:
# Helper used by the word-cloud cells to turn a token list into plain text.
def convert_text(series):
    """Return the tokens stored in the first row of ``series`` as one string.

    Parameters
    ----------
    series : pandas.Series
        Series whose first element (by position, not by label) is an
        iterable of tokens. Any further rows are ignored.

    Returns
    -------
    str
        The tokens of the first row, each passed through ``str`` and joined
        with single spaces.
    """
    first_row_tokens = series.array[0]
    return ' '.join(map(str, first_row_tokens))

### Step 2b: Create Wordcloud PNG files for each Author ID 

In [None]:
# Render and save one word-cloud PNG per author ID.
author_ids = author_titles.ID

for id_num in author_ids:
    # A fresh figure per author; the explicit fig/ax pair avoids relying on
    # pyplot's "current figure" state inside the loop.
    fig, ax = plt.subplots(figsize=(30, 15))

    # Lemmatized tokens of this author, flattened into one text blob.
    author_lemmas = author_titles.loc[author_titles['ID'] == id_num, 'lemm_tokens']
    text = convert_text(author_lemmas)
    cloud = WordCloud(background_color="white", max_words=10, stopwords=cust_nltk_sw, collocations=False).generate(text)

    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(id_num, fontsize=20)
    ax.axis('off')
    fig.savefig(f"{id_num}_wordcloud.png", bbox_inches='tight', dpi=300)

    # Show, then release the figure: without close() every 30x15 figure stays
    # alive until the kernel stops, which leaks memory as the author count grows.
    plt.show()
    plt.close(fig)

### Step 2c: Create figure that contains all wordclouds, each as a subplot

In [None]:
# Draw every author's word cloud as a subplot of one figure and save it.
author_ids = author_titles.ID
nbr_features = len(author_ids)

# Grid: 3 columns; at least the original 4 rows, more when there are
# over 12 authors so add_subplot never receives an out-of-range index.
n_cols = 3
n_rows = max(4, -(-nbr_features // n_cols))  # ceil division without importing math

fig = plt.figure(figsize=(30,15))
fig.subplots_adjust(hspace=1, wspace=0.2)

# enumerate(start=1) yields a 1-based subplot slot for EVERY author; the
# previous zip(author_ids, range(1, nbr_features)) stopped one author short.
for num, id_num in enumerate(author_ids, start=1):
    author_lemmas = author_titles.loc[author_titles['ID'] == id_num, 'lemm_tokens']
    text = convert_text(author_lemmas)
    cloud = WordCloud(background_color="white", max_words=10, stopwords=cust_nltk_sw, collocations=False).generate(text)
    ax = fig.add_subplot(n_rows, n_cols, num)
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(id_num)
    ax.axis('off')

plt.tight_layout()
# Save BEFORE show(): once show() has rendered the figure, pyplot's current
# figure is a new empty one and savefig would write a blank image.
fig.savefig("all_wordcloud.png", bbox_inches='tight', dpi=300)
plt.show()

## Step 3: Visualize WordCloud for all Titles

 - Combine All Titles
 - Generate WordCloud

In [None]:
# Merge every de-duplicated title into one string and wrap it in a one-row
# frame labelled 'ID-All', mirroring the ID/Title columns of author_titles.
titles = ' '.join(author_titles_df["Title"])
title_list = [titles]
title_df = pd.DataFrame.from_dict({'ID': 'ID-All', 'Title': title_list})
title_df

In [None]:
# Tokenize the combined title text: one token list per row of title_df.
word_tokens = [word_tokenize(text) for text in title_df.Title]

# Keep only purely alphanumeric tokens, then store them on the frame.
cleaned_tokens = [[word for word in item if word.isalnum()] for item in word_tokens]
title_df["tokens"] = cleaned_tokens

# Lemmatize each token list.
# Iterates the column directly: Series.iteritems() was removed in pandas 2.0.
# Tokens are lower-cased BEFORE lemmatizing because the WordNet lookup is
# case-sensitive, so capitalised title words would otherwise go unlemmatized.
WNlemmatizer = WordNetLemmatizer()
lemm_list = [
    [WNlemmatizer.lemmatize(token.lower()) for token in tokens]
    for tokens in title_df["tokens"]
]

title_df["lemm_tokens"] = lemm_list


# Combined stop words: wordcloud defaults + NLTK English list + custom list.
wc_sw = STOPWORDS
custom_sps = set(['made', 'nothing','able', 'given', 'wish', 'willing', 'wa', 'due', 'ha', 'did','etc', 'use', 'really', 'felt', 'personally', 'also', 'thing', 'well', 'little', 'got', 'one', 'lot', 'way', 'jus', 'sure'])
cust_nltk_sw = set(stopwords.words('english')).union(custom_sps).union(wc_sw)
print(f'The number of stop words in custom list is: {len(cust_nltk_sw)}')

title_df

In [None]:
# Render and save the single word cloud built from all titles combined.
# title_df holds exactly one row, so take its ID as a scalar. Formatting the
# whole Series into the f-string would put the Series repr (index, newlines,
# dtype) into the file name and show "['ID-All']" as the plot title.
author_id = title_df["ID"].iloc[0]

fig, ax = plt.subplots(figsize=(30, 15))

text = convert_text(title_df['lemm_tokens'])
cloud = WordCloud(background_color="white", max_words=10, stopwords=cust_nltk_sw, collocations=False).generate(text)

ax.imshow(cloud, interpolation='bilinear')
ax.set_title(author_id, fontsize=20)
ax.axis('off')
fig.savefig(f"{author_id}_wordcloud.png", bbox_inches='tight', dpi=300)

<hr style="border-top: 5px solid black;">

<h1>Create New DataFrame for Displaying WordCloud Images</h1>

 - Concatenate author_titles and titles
 - Add column for url
 - Add url
 - Export as CSV for tableau

In [None]:
# Stack the all-titles row on top of the per-author rows with a fresh 0..n-1 index.
frames = [title_df, author_titles]
word_cloud_df = pd.concat(objs=frames, ignore_index=True)
word_cloud_df

In [None]:
# One hosted image URL per row of word_cloud_df, in row order.
# NOTE(review): the sixth entry is an imgur page link (no "i." host, no
# ".png" suffix) unlike the other direct-image links - confirm it is intended.
# NOTE(review): the list is hard-coded to 7 entries; the column assignment
# raises if word_cloud_df has a different number of rows.
url_lists = [
    'https://i.imgur.com/NGDC7RA.png',
    'https://i.imgur.com/HE8dkGa.png',
    'https://i.imgur.com/EtQDWsW.png',
    'https://i.imgur.com/bxm8T6x.png',
    'https://i.imgur.com/2Gch9y0.png',
    'https://imgur.com/YlKbGz8',
    'https://i.imgur.com/UzCst7H.png',
]
word_cloud_df["urls"] = url_lists

In [None]:
# Export the combined frame (including the token-list columns and urls) to CSV.
# NOTE(review): index_label=False presumably only suppresses the header field
# for the index column while the index values are still written - confirm this
# is the layout the downstream consumer expects.
word_cloud_df.to_csv("id_wordclouds.csv", index_label=False)

<hr style="border-top: 5px solid black;">

<h1>Network Analysis</h1>

In [None]:
# libraries
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Toy example: build a small edge list and draw it.
# Uses its own variable name so the demo does not overwrite `df`,
# the authors dataset loaded at the top of the notebook.
demo_edges_df = pd.DataFrame({ 'from':['A', 'B', 'C','A'], 'to':['D', 'A', 'E','C']})

# Build the graph: one undirected edge per (from, to) row
G=nx.from_pandas_edgelist(demo_edges_df, 'from', 'to')

# Chart with custom edges:
nx.draw(G, with_labels=True, width=5, edge_color="skyblue", style="solid")

In [None]:
# Reload the first worksheet of the authors workbook for the network analysis.
file = "authors_data.xlsx"
df = pd.read_excel(io=file, sheet_name=0)

In [None]:
# Preview the first rows of the reloaded sheet.
df.head()

## Prepare Data for Network Analysis

 - Subset for projects with co-contributors
 - Subset for only ID, Co-ID and Title

In [None]:
# Keep only rows flagged as having co-contributors, and only the columns
# needed to build the edge list (author ID, co-contributor ID, title).
network_df = df[df['Has_Contributors']=="Yes"][['ID', "Co-ID", "Title"]]
# Preview the subset. The previous bare `network_df.rename` was an uncalled
# attribute reference that only displayed a bound-method repr.
network_df.head()

In [None]:
# Build an undirected graph with one edge per (Co-ID, ID) row of network_df.
G = nx.from_pandas_edgelist(network_df, source='Co-ID', target='ID')

# Large canvas so the node labels stay readable.
fig = plt.figure(figsize=(30,15))

# Draw all nodes with bold labels and wide, semi-transparent sky-blue edges.
nx.draw(
    G,
    with_labels=True,
    width=5,
    font_size=20,
    edge_color="skyblue",
    style="solid",
    alpha=0.75,
    font_weight="bold",
)
plt.show()

## Network that shows Authors Only

In [None]:
import matplotlib.pyplot as plt
# Collaboration network with labels on the author nodes only.
authors = list(network_df.ID.unique())
co_contributors = list(network_df['Co-ID'].unique())


plt.figure(figsize=(12, 12))

# 1. Create the graph: one undirected edge per (Co-ID, ID) row
g = nx.from_pandas_edgelist(network_df, source='Co-ID', target='ID')

# 2. Create a layout for our nodes.
# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible across runs and comparable with other plots of the same graph.
layout = nx.spring_layout(g, iterations=50, seed=42)

# 3. Draw the parts we want
# Edges thin and grey
# Co-Contributors small and grey
# Authors sized according to their number of connections
# Authors blue
# Labels for Authors ONLY
# People who are highly connected are a highlighted color

# Go through every author ID, ask the graph how many
# connections it has. Multiply that by 80 to get the circle size
author_size = [g.degree(author) * 80 for author in authors]
nx.draw_networkx_nodes(g,
                       layout,
                       nodelist=authors,
                       node_size=author_size, # a LIST of sizes, based on g.degree
                       node_color='lightblue')

# Draw every co-contributor as a small grey node
nx.draw_networkx_nodes(g, layout, nodelist=co_contributors, node_color='#cccccc', node_size=100)

# Highlight co-contributors connected to more than one node
popular_people = [person for person in co_contributors if g.degree(person) > 1]
nx.draw_networkx_nodes(g, layout, nodelist=popular_people, node_color='orange', node_size=100)

nx.draw_networkx_edges(g, layout, width=1, edge_color="#cccccc")

# Label only the author nodes (each label is the author's own ID)
node_labels = dict(zip(authors, authors))
nx.draw_networkx_labels(g, layout, labels=node_labels)

# 4. Turn off the axis; coordinates carry no meaning in a spring layout
plt.axis('off')

plt.title("Collaboration Network Analysis")

# 5. Save the image (must happen before show(), while the figure is still current)
plt.savefig("network_AuthorsLabeled.png", bbox_inches='tight', dpi=300)

# 6. Tell matplotlib to show it
plt.show()

## Network that shows Authors and Co-contributor IDs

In [None]:
import matplotlib.pyplot as plt
# Collaboration network with labels on both author and co-contributor nodes.
authors = list(network_df.ID.unique())
co_contributors = list(network_df['Co-ID'].unique())


plt.figure(figsize=(12, 12))

# 1. Create the graph: one undirected edge per (Co-ID, ID) row
g = nx.from_pandas_edgelist(network_df, source='Co-ID', target='ID')

# 2. Create a layout for our nodes.
# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible across runs and comparable with other plots of the same graph.
layout = nx.spring_layout(g, iterations=50, seed=42)

# 3. Draw the parts we want
# Edges thin and grey
# Co-Contributors small and grey
# Authors sized according to their number of connections
# Authors blue
# Labels for Authors AND Co-Contributors
# People who are highly connected are a highlighted color

# Go through every author ID, ask the graph how many
# connections it has. Multiply that by 80 to get the circle size
author_size = [g.degree(author) * 80 for author in authors]
nx.draw_networkx_nodes(g,
                       layout,
                       nodelist=authors,
                       node_size=author_size, # a LIST of sizes, based on g.degree
                       node_color='lightblue')

# Draw every co-contributor as a small grey node
nx.draw_networkx_nodes(g, layout, nodelist=co_contributors, node_color='#cccccc', node_size=100)

# Highlight co-contributors connected to more than one node
popular_people = [person for person in co_contributors if g.degree(person) > 1]
nx.draw_networkx_nodes(g, layout, nodelist=popular_people, node_color='orange', node_size=100)

nx.draw_networkx_edges(g, layout, width=1, edge_color="#cccccc")

# Label the author nodes, then the co-contributor nodes (each label is the node's own ID)
node_labels = dict(zip(authors, authors))
nx.draw_networkx_labels(g, layout, labels=node_labels)
nx.draw_networkx_labels(g, layout, labels=dict(zip(co_contributors,co_contributors)))

# 4. Turn off the axis; coordinates carry no meaning in a spring layout
plt.axis('off')

plt.title("Collaboration Network Analysis")

# 5. Save the image (must happen before show(), while the figure is still current)
plt.savefig("network_All_Labeled.png", bbox_inches='tight', dpi=300)

# 6. Tell matplotlib to show it
plt.show()