#  Notebook 2 of MBC topic modeling in BHC. 
***
This notebook is merging individual datasets of MBC companies into one complete dataset 
***

In [1]:
# This is the library of packages used in this model with a comment as to why they are needed

# Turn off pesky warnings
import warnings
warnings.filterwarnings("ignore")

# Need numpy and pandas for easier data manipulation
import pandas as pd
import numpy as np

# In order to save data at intermediate points for Excel pivoting and visualization, need the ability to write data to csv
import csv

from openpyxl import load_workbook

# Need datetime for manipulating date-time data in analysis and filtering
import datetime

# Bring in the copy package in case we need it to copy some lists or dataframes
import copy

# Need matplotlib and seaborne for visualizing the data
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt 

# show all columns when displaying pandas dfs
pd.set_option('display.max_columns', None)

# show a time running when executing long loops
from tqdm import tqdm

# Needed for parralellizing the running of the lemmatization and lda model runs, saves big time when assessing data
from joblib import Parallel, delayed 

# Need to be able to hit urls for some of the data tools
import requests

# Bring in the package for regular expressions for easier data manipulation
import re
import pickle
from tqdm import tqdm

# To enable pretty printing, we need to load pprint
from pprint import pprint

# Import the applicable gensim package components for manipulating data and doing LDA modeling
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# bring in spacy for lemmatization of our corpora
import spacy

# Plotting tools that we will need to use 
import pyLDAvis
import pyLDAvis.gensim_models

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en ... This was needed when first setting up spacy in the environment
nlp = spacy.load('en_core_web_sm')

# Bring in tqdm to track progress of for loops
from tqdm import tqdm

# Turn off pesky warnings by ignoring deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

In [2]:
# Now we will read in one of the downloaded files qand take a look at its structure and determine what data we want to pull out of it.
#file_dir = "/Users/donald.jenkins/Text Mining Ch2 Project Work/Web of Science Downloads 15 Feb/"
filename = "Dataset_1"+".csv"
rec = pd.read_csv(filename, encoding= 'unicode_escape')

# And let's take a look at the data to get a sense of what we have
rec.head()

Unnamed: 0,company,document_type,document_format,title,text
0,Azzly,Who_we_serve,web_page,Addiction Treatment Providers,Medication-Assisted Treatment AZZLY Rize simpl...
1,Azzly,Product_info,web_page,Analytics,An essential step for any business seeking to ...
2,Azzly,Product_info,web_page,Billing/RCM,Billing/RCM MAT labs can click once and add a...
3,Azzly,Product_info,web_page,Capitation,AZZLY Reporting for Capitation Billing Overvie...
4,Azzly,Product_info,web_page,Electronic Health Record,Integrated Care Pathways AZZLY's Integrated Ca...


In [3]:
print("The following files are being read in and added to the dataset, resulting in the following dataset shape:")
for i in range(2,23,1):
    filename = "Dataset_"+str(i)+".csv"
    rec1 = pd.read_csv(filename, encoding= 'unicode_escape')
    rec = rec.append(rec1)
    print(filename, rec.shape)

The following files are being read in and added to the dataset, resulting in the following dataset shape:
Dataset_2.csv (186, 5)
Dataset_3.csv (218, 5)
Dataset_4.csv (291, 5)
Dataset_5.csv (307, 5)
Dataset_6.csv (331, 5)
Dataset_7.csv (371, 5)
Dataset_8.csv (392, 5)
Dataset_9.csv (435, 5)
Dataset_10.csv (447, 5)
Dataset_11.csv (455, 5)
Dataset_12.csv (521, 5)
Dataset_13.csv (546, 5)
Dataset_14.csv (670, 5)
Dataset_15.csv (811, 5)
Dataset_16.csv (912, 5)
Dataset_17.csv (927, 5)
Dataset_18.csv (956, 5)
Dataset_19.csv (1303, 5)
Dataset_20.csv (1427, 5)
Dataset_21.csv (1704, 5)
Dataset_22.csv (1721, 5)


In [4]:
rec.columns

Index(['company', 'document_type', 'document_format', 'title', 'text'], dtype='object')

In [5]:
# Finally, save the data from this pandas dataframe so we don't have to duplicate this step in the future and can share the combined dataset as a single csv

# Set the path and filename with today's date
#files_path_out = "/Users/donald.jenkins/Text Mining Ch2 Project Work/"
#combo_filename = "MBC Dataset Combined " + str(datetime.datetime.today()) + ".csv"

# Save the data to a csv file
rec.to_csv("MBC Dataset Combined.csv", index = False)

In [8]:
# First lets take a look at the breakdown of DOI numbers
print("There are", len(rec.text.unique()), "unique text values out of the", len(rec), "total records\n")
#print("There are", rec.text.isna().value_counts()[1], "records with a blank text\n")
print("The following text numbers are duplicates")
#print(rec.text.value_counts()[rec.text.value_counts() > 1], "\n")

There are 1718 unique text values out of the 1721 total records

The following text numbers are duplicates


In [9]:
# First create a list of the dupe DOI values
dupe_text = list(rec.text.value_counts()[rec.text.value_counts() > 1].index)

# Then create a boolean for all the records that have one of these DOI values
bool0 = [i in dupe_text for i in rec.text]

# Then printout the articles that have duplicate DOIs in sorted order by DOI
rec[bool0].sort_values('text')

Unnamed: 0,company,document_type,document_format,title,text
6,Ksana_health,Announcement,Web_page,Introducing EARS 2.0,Ksana Health is pleased to announce new 2.0 im...
21,Ksana_health,Blog,Web_page,Introducing EARS 2.0,Ksana Health is pleased to announce new 2.0 im...
2,Ksana_health,Product_info,Web_page,4 Ways Continuous Behavioral Health Measuremen...,Stephen Hays is the host of the Stigma Podcast...
22,Ksana_health,Blog,Web_page,Stigma Podcast: Measuring Mental Health with S...,Stephen Hays is the host of the Stigma Podcast...
12,Ksana_health,Announcement,Web_page,Ksana Health receives the University of Oregon...,We are grateful and excited to receive a ventu...
37,Ksana_health,Blog,Web_page,Ksana Health receives the University of Oregon...,We are grateful and excited to receive a ventu...


In [11]:
#Let's take a look at the different Document Types we have in this dataset
rec["document_type"].value_counts()

Blog                 1088
Press_release         190
News                  136
Article                66
Product_info           64
Case_study             30
Announcement           30
Who_we_serve           22
Webinar                22
Company_info           16
White_paper            13
Team                    6
Who _we_serve           5
Research                5
Who_we _serve           4
White Paper             4
Testmonial              4
FAQ                     4
white_paper             3
Podcast                 2
Who we serve            2
Testimonies             2
MBC                     1
Security Features       1
About_us                1
Name: document_type, dtype: int64

In [13]:
# Lets take a look at the Blank DOI by Document Type to see if there is anything obvious
pd.crosstab(rec.text.isna(), rec["document_type"])

document_type,About_us,Announcement,Article,Blog,Case_study,Company_info,FAQ,MBC,News,Podcast,Press_release,Product_info,Research,Security Features,Team,Testimonies,Testmonial,Webinar,White Paper,White_paper,Who _we_serve,Who we serve,Who_we _serve,Who_we_serve,white_paper
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
False,1,30,66,1088,30,16,4,1,136,2,190,64,5,1,6,2,4,22,4,13,5,2,4,22,3
