In [1]:
import numpy as np
import pandas as pd
import json
import gzip
from collections import Counter 

In [2]:
# Load the gzip-compressed JSON dump into a single DataFrame
df = pd.read_json(
    '../data/original/DNA_DATA_FULL.gz',
    compression='gzip',
)

In [4]:
# Show the column labels of the loaded frame
df.columns

Index(['copyright', 'subject_codes', 'art', 'modification_datetime', 'body',
       'company_codes_occur', 'company_codes_about', 'company_codes_lineage',
       'snippet', 'publication_date', 'market_index_codes', 'credit',
       'currency_codes', 'region_of_origin', 'ingestion_datetime',
       'modification_date', 'source_name', 'language_code', 'region_codes',
       'company_codes_association', 'person_codes', 'byline',
       'company_codes_relevance', 'source_code', 'an', 'word_count',
       'company_codes', 'industry_codes', 'title', 'publication_datetime',
       'publisher_name', 'action', 'document_type', 'section', 'dateline'],
      dtype='object')

In [16]:
#Restrict the frame to the company-code columns
companies = df[[
    'company_codes',
    'company_codes_occur',
    'company_codes_about',
    'company_codes_lineage',
    'company_codes_association',
    'company_codes_relevance',
]]

In [17]:
#The value counts show a single blank value for every row of this column, so it will not be part of the validating process
#The column is read from df (not companies) so this cell can be re-run after companies has been narrowed on the next line
print(df['company_codes_association'].value_counts())
companies = df[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_relevance']]

    1942855
Name: company_codes_association, dtype: int64


In [18]:
#Validation plan: collect the unique company codes of every column and check each one against the company codes dictionary
#This frame holds one row per company column and records the share of valid codes (starts at 0.0)
profile = pd.DataFrame(
    {"Validity": np.zeros(len(companies.columns))},
    index=companies.columns,
)
profile

Unnamed: 0,Validity
company_codes,0.0
company_codes_occur,0.0
company_codes_about,0.0
company_codes_lineage,0.0
company_codes_relevance,0.0


In [33]:
#Here is the validity function I will be using
#Every code in ls that also appears in col counts as valid; the count is divided by the
#length of ls to get a decimal between 0 and 1
def checkValidity(ls, col):
    """Return the fraction of entries in ``ls`` that are present in ``col``.

    Parameters
    ----------
    ls : sequence
        Codes to validate. Duplicates are counted individually.
    col : pandas.Series, numpy.ndarray or other iterable
        Reference values. Objects exposing ``.tolist()`` are converted with it;
        any other iterable is consumed directly. Values must be hashable.

    Returns
    -------
    float
        ``num_valid / len(ls)``; ``0.0`` when ``ls`` is empty.
    """
    if len(ls) == 0:
        # Nothing to validate; avoid dividing by zero
        return 0.0

    # Build the lookup once. Calling col.tolist() inside the loop would rebuild the
    # whole list for every code and make each membership test a linear scan.
    reference = col.tolist() if hasattr(col, "tolist") else col
    valid_codes = set(reference)

    num_valid = sum(1 for company in ls if company in valid_codes)
    return num_valid / len(ls)

In [40]:
#Getting the unique company codes
unique_company_codes = set()
for value in companies['company_codes']:
    unique_company_codes.update(value.split(","))

#Splitting an empty field yields '', which is not a company code. Remove it by value:
#set order is arbitrary, so dropping "the first element" of the list could discard a real code instead
unique_company_codes.discard('')

#Convert set back to list
unique_company_codes = list(unique_company_codes)
print(unique_company_codes[0:10])
print("There are {} unique company codes".format(len(unique_company_codes)))

['uvbgdy', 'conun', 'nrph', 'rsmnnd', 'vrtj', 'vvvit', 'cncn', 'nstrci', 'piagci', 'fndcll']
There are 73688 unique company codes


In [41]:
#Unique companies from company_codes_occur
unique_companies_occur = set()

for value in df['company_codes_occur']:
    unique_companies_occur.update(value.split(","))

#Remove the empty-string token by value; set order is arbitrary, so slicing off
#the first list element is not guaranteed to remove '' and may drop a real code
unique_companies_occur.discard('')
unique_companies_occur = list(unique_companies_occur)

print(unique_companies_occur[0:10])
print("There are {} unique companies in unique_companies_occur".format(len(unique_companies_occur)))

['uvbgdy', 'conun', 'nrph', 'rsmnnd', 'vrtj', 'vvvit', 'cncn', 'nstrci', 'piagci', 'fndcll']
There are 62381 unique companies in unique_companies_occur


In [42]:
#unique companies from company_codes_about
unique_companies_about = set()

for value in df['company_codes_about']:
    unique_companies_about.update(value.split(","))

#Remove the empty-string token by value; set order is arbitrary, so slicing off
#the first list element is not guaranteed to remove '' and may drop a real code
unique_companies_about.discard('')
unique_companies_about = list(unique_companies_about)

print(unique_companies_about[0:10])
print("There are {} unique companies in unique_companies_about".format(len(unique_companies_about)))

['uvbgdy', 'conun', 'nrph', 'vrtj', 'cncn', 'nstrci', 'amexpr', 'nsisof', 'amphma', 'wngfrt']
There are 30780 unique companies in unique_companies_about


In [46]:
#unique companies from company_codes_relevance
unique_companies_relevance = set()

for value in df['company_codes_relevance']:
    unique_companies_relevance.update(value.split(","))

#Remove the empty-string token by value; set order is arbitrary, so slicing off
#the first list element is not guaranteed to remove '' and may drop a real code
unique_companies_relevance.discard('')
unique_companies_relevance = list(unique_companies_relevance)

print(unique_companies_relevance[0:10])
print("There are {} unique companies in unique_companies_relevance".format(len(unique_companies_relevance)))


['uvbgdy', 'conun', 'nrph', 'rsmnnd', 'cncn', 'nstrci', 'piagci', 'fndcll', 'manccm', 'amexpr']
There are 66451 unique companies in unique_companies_relevance


In [52]:
#unique companies from company_codes_lineage
unique_companies_lineage = set()

for value in df['company_codes_lineage']:
    unique_companies_lineage.update(value.split(","))

#Remove the empty-string token by value; set order is arbitrary, so slicing off
#the first list element is not guaranteed to remove '' and may drop a real code
unique_companies_lineage.discard('')
unique_companies_lineage = list(unique_companies_lineage)
print(unique_companies_lineage[0:10])
print("There are {} unique companies in unique_companies_lineage".format(len(unique_companies_lineage)))

['invtgn', 'cazen', 'hypaqp', 'nxcp', 'ganinc', 'frazco', 'albers', 'rgpcf', 'hunhal', 'ccpllc']
There are 3467 unique companies in unique_companies_lineage
