In [1]:
import numpy as np
import pandas as pd
import json
import gzip
from collections import Counter 

In [2]:
df = pd.read_json('../data/original/DNA_DATA_FULL.gz', compression='gzip')

In [16]:
df.columns

Index(['copyright', 'subject_codes', 'art', 'modification_datetime', 'body',
       'company_codes_occur', 'company_codes_about', 'company_codes_lineage',
       'snippet', 'publication_date', 'market_index_codes', 'credit',
       'currency_codes', 'region_of_origin', 'ingestion_datetime',
       'modification_date', 'source_name', 'language_code', 'region_codes',
       'company_codes_association', 'person_codes', 'byline',
       'company_codes_relevance', 'source_code', 'an', 'word_count',
       'company_codes', 'industry_codes', 'title', 'publication_datetime',
       'publisher_name', 'action', 'document_type', 'section', 'dateline'],
      dtype='object')

In [8]:
#Looking only at the company columns
companies = df[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_association', 'company_codes_relevance']]

In [9]:
#There are no values in this column so it will not be part of the validating process
print(companies['company_codes_association'].value_counts())
companies = df[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_relevance']]

    1942855
Name: company_codes_association, dtype: int64


In [10]:
#For validating, I will be taking each unique company code in all of the columns and checking to see if each one is in the company codes dictionary
#The dataframe below will keep track of the % of valid company codes
profile = pd.DataFrame({"Validity": np.zeros(len(companies.columns))}).set_index(companies.columns)
profile

Unnamed: 0,Validity
company_codes,0.0
company_codes_occur,0.0
company_codes_about,0.0
company_codes_lineage,0.0
company_codes_relevance,0.0


In [11]:
#Here is the validity function I will be using
#For every valid company code in the dataset, num_valid is incremented by 1. At the end that number is divided by all the length
#of the entire unique list to get a decimal
def checkValidity(ls, col = code_dict.code.tolist()):
    return sum([code in col for code in ls]) / len(ls)

In [69]:
#Getting the unique company codes
unique_company_codes = set()
for value in companies['company_codes']:
    unique_company_codes.update(value.split(","))

#Convert set back to list
unique_company_codes = list(unique_company_codes)
unique_company_codes = unique_company_codes[1:] #The first element was '', so I didn't include it in the final list
unique_company_codes = [word.upper() for word in unique_company_codes]
print(unique_company_codes[0:10])
print("There are {} unique company codes".format(len(unique_company_codes)))

['CDRL', 'RKIMSY', 'BDORF', 'CESUKL', 'SKLJEQ', 'JAPLLC', 'BIONZU', 'UWJYAB', 'CRSUPZ', 'AUBSPL']
There are 73688 unique company codes


In [73]:
#Unique companies from company_codes_occur
unique_companies_occur = set()

for value in df['company_codes_occur']:
    unique_companies_occur.update(value.split(","))

unique_companies_occur = list(unique_companies_occur)
unique_companies_occur = unique_companies_occur[1:]
unique_companies_occur = [word.upper() for word in unique_companies_occur]
print(unique_companies_occur[0:10])
print("There are {} unique companies in unique_companies_occur".format(len(unique_companies_occur))) 

['BDORF', 'CESUKL', 'SKLJEQ', 'JAPLLC', 'BIONZU', 'UWJYAB', 'CRSUPZ', 'AUBSPL', 'NTADSC', 'FLAFRA']
There are 62381 unique companies in unique_companies_occur


In [74]:
#unique companies from company_codes_about
unique_companies_about = set()

for value in df['company_codes_about']:
    unique_companies_about.update(value.split(","))

unique_companies_about = list(unique_companies_about)
unique_companies_about = unique_companies_about[1:]
unique_companies_about = [word.upper() for word in unique_companies_about]
print(unique_companies_about[0:10])
print("There are {} unique companies in unique_companies_about".format(len(unique_companies_about)))

['SKLJEQ', 'BIONZU', 'CRSUPZ', 'IPENIP', 'NTADSC', 'FLAFRA', 'PRZACL', 'RALDRG', 'GZMMOL', 'CIRPA']
There are 30780 unique companies in unique_companies_about


In [46]:
#unique companies from company_codes_relevance
unique_companies_relevance = set()

for value in df['company_codes_relevance']:
    unique_companies_relevance.update(value.split(","))

unique_companies_relevance = list(unique_companies_relevance)
unique_companies_relevance = unique_companies_relevance[1:]

print(unique_companies_relevance[0:10])
print("There are {} unique companies in unique_companies_relevance".format(len(unique_companies_relevance)))


['uvbgdy', 'conun', 'nrph', 'rsmnnd', 'cncn', 'nstrci', 'piagci', 'fndcll', 'manccm', 'amexpr']
There are 66451 unique companies in unique_companies_relevance


In [53]:
#unique companies from company_codes_lineage
unique_companies_lineage = set()

for value in df['company_codes_lineage']:
    unique_companies_lineage.update(value.split(","))

unique_companies_lineage = list(unique_companies_lineage)
unique_companies_lineage = unique_companies_lineage[1:]

#Convert to uppercase bc data dictionary has all codes in upper case
unique_companies_lineage = [word.upper() for word in unique_companies_lineage]
print(unique_companies_lineage[0:10])
print("There are {} unique companies in unique_companies_lineage".format(len(unique_companies_lineage)))

['CHNLIB', 'KTZEN', 'DYNTL', 'ISTHMA', 'BXRGRP', 'ZFPZUR', 'UNNATF', 'RCNGYI', 'LEETH', 'SASILI']
There are 3467 unique companies in unique_companies_lineage


In [20]:
#Uploading the data dictionary into a dataframe
code_dict = pd.read_csv("../data/original/Company Codes Dictionary.csv")

In [60]:
#print(checkValidity(unique_companies_lineage))
profile.iloc[3] = checkValidity(unique_companies_lineage)

In [70]:
profile.iloc[0] = checkValidity(unique_company_codes)

In [75]:
profile.iloc[2] = checkValidity(unique_companies_about)

In [83]:
profile.iloc[1] = checkValidity(unique_companies_occur)

In [72]:
code_dict.tail()

Unnamed: 0,code,description
1048570,KWGMA,Kawagoeshi Ishikai General Inc. Association
1048571,KWGNG,Kondor Wessels Grundstücksverwaltung NRW GmbH
1048572,KWGNII,Innodel SA
1048573,KWGOL,Kwik Goal
1048574,KWGOL,Kwik Goal


In [84]:
profile

Unnamed: 0,Validity
company_codes,0.445799
company_codes_occur,0.45889
company_codes_about,0.508415
company_codes_lineage,0.516873
company_codes_relevance,0.0


In [78]:
code_dict.tail()

Unnamed: 0,code,description
1048570,KWGMA,Kawagoeshi Ishikai General Inc. Association
1048571,KWGNG,Kondor Wessels Grundstücksverwaltung NRW GmbH
1048572,KWGNII,Innodel SA
1048573,KWGOL,Kwik Goal
1048574,KWGOL,Kwik Goal


In [82]:
code_dict.shape

(1048575, 2)