In [1]:
import numpy as np
import pandas as pd
import json
import gzip
from collections import Counter 
from datetime import datetime as dt

In [2]:
dna = pd.read_json('../data/original/DNA_DATA_FULL.gz', compression='gzip') 

In [3]:
companies = dna[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_relevance']]

In [4]:
#Uploading the data dictionary into a dataframe
code_dict = pd.read_csv("../data/original/companies.csv")

In [5]:
validity_col = dna[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_relevance', 'body', 'publication_datetime']]
profile = pd.DataFrame({"Validity": np.zeros(7)}).set_index(validity_col.columns)

In [6]:
profile

Unnamed: 0,Validity
company_codes,0.0
company_codes_occur,0.0
company_codes_about,0.0
company_codes_lineage,0.0
company_codes_relevance,0.0
body,0.0
publication_datetime,0.0


In [7]:
#Here is the validity function I will be using
#returns the sum of True and divides by the length of the unique list
def checkValidity(ls, col = code_dict.code.tolist()):
    return sum([code in col for code in ls]) / len(ls)

In [43]:
#Getting the unique company codes
unique_company_codes = set()
for value in companies['company_codes']:
    unique_company_codes.update(value.split(","))

#Convert set back to list
unique_company_codes = list(unique_company_codes)
unique_company_codes = unique_company_codes[1:] #The first element was '', so I didn't include it in the final list
unique_company_codes = [word.upper() for word in unique_company_codes]
print(unique_company_codes[0:10])
print("There are {} unique company codes".format(len(unique_company_codes)))

['EGUSA', 'JDKTI', 'IDIRIN', 'WAWEST', 'ECGIF', 'PHACTA', 'NSLASI', 'GJIPBO', 'CHGUER', 'HORITF']
There are 73688 unique company codes


In [9]:
#Unique companies from company_codes_occur
unique_companies_occur = set()

for value in companies['company_codes_occur']:
    unique_companies_occur.update(value.split(","))

unique_companies_occur = list(unique_companies_occur)
unique_companies_occur = unique_companies_occur[1:]
unique_companies_occur = [word.upper() for word in unique_companies_occur]
print(unique_companies_occur[0:10])
print("There are {} unique companies in unique_companies_occur".format(len(unique_companies_occur))) 

['JDKTI', 'EGUSA', 'WAWEST', 'ECGIF', 'PHACTA', 'NSLASI', 'GJIPBO', 'HORITF', 'CHRMS', 'HININH']
There are 62381 unique companies in unique_companies_occur


In [10]:
#unique companies from company_codes_about
unique_companies_about = set()

for value in companies['company_codes_about']:
    unique_companies_about.update(value.split(","))

unique_companies_about = list(unique_companies_about)
unique_companies_about = unique_companies_about[1:]
unique_companies_about = [word.upper() for word in unique_companies_about]
print(unique_companies_about[0:10])
print("There are {} unique companies in unique_companies_about".format(len(unique_companies_about)))

['WAWEST', 'PHACTA', 'GJIPBO', 'CHGUER', 'HORITF', 'CHRMS', 'HININH', 'CRWCR', 'PCEPT', 'INOGE']
There are 30780 unique companies in unique_companies_about


In [11]:
#unique companies from company_codes_relevance
unique_companies_relevance = set()

for value in companies['company_codes_relevance']:
    unique_companies_relevance.update(value.split(","))

unique_companies_relevance = list(unique_companies_relevance)
unique_companies_relevance = unique_companies_relevance[1:]
unique_companies_relevance = [word.upper() for word in unique_companies_relevance]
print(unique_companies_relevance[0:10])
print("There are {} unique companies in unique_companies_relevance".format(len(unique_companies_relevance)))

['EGUSA', 'JDKTI', 'IDIRIN', 'WAWEST', 'PHACTA', 'NSLASI', 'GJIPBO', 'CHGUER', 'HORITF', 'CHRMS']
There are 66451 unique companies in unique_companies_relevance


In [13]:
#unique companies from company_codes_lineage
unique_companies_lineage = set()

for value in companies['company_codes_lineage']:
    unique_companies_lineage.update(value.split(","))

unique_companies_lineage = list(unique_companies_lineage)
unique_companies_lineage = unique_companies_lineage[1:]

#Convert to uppercase bc data dictionary has all codes in upper case
unique_companies_lineage = [word.upper() for word in unique_companies_lineage]
print(unique_companies_lineage[0:10])
print("There are {} unique companies in unique_companies_lineage".format(len(unique_companies_lineage)))

['MEDSSW', 'PILECS', 'ELVISP', 'TXUTL', 'BRTHIN', 'KMRT', 'DDB', 'STEL', 'AMRONL', 'GREAR']
There are 3467 unique companies in unique_companies_lineage


In [45]:
profile.iloc[0] = checkValidity(unique_company_codes)

In [39]:
profile.iloc[1] = checkValidity(unique_companies_occur)

In [34]:
profile.iloc[2] = checkValidity(unique_companies_about)

KeyboardInterrupt: 

In [50]:
profile.iloc[3] = checkValidity(unique_companies_lineage)

In [49]:
profile.iloc[4] = checkValidity(unique_companies_relevance)

KeyboardInterrupt: 

In [53]:
dates = dna['publication_datetime']

In [54]:
#Transforms the date from timestamp to a string 
dates_all_iso = []

for date in dates:
     dates_all_iso.append(dt.fromtimestamp(date/1000.0).strftime('%Y'))

In [55]:
valid_date_all_df = pd.DataFrame()
valid_date_all_df['Date'] = dates_all_iso

In [56]:
#years should be 2010 and onwards. We realllly expect to see 2013-2018 though
validity_date_all = list((valid_date_all_df['Date'] > '2009'))

valid_date_all_df['Validity'] = validity_date_all

In [57]:
print('{}%, or'.format(round(sum(valid_date_all_df['Validity'] == True) / len(valid_date_all_df['Validity']) * 100)), '{} of the data in modification datetime are valid'.format(sum(valid_date_all_df['Validity'] == True)))

100%, or 1942855 of the data in modification datetime are valid


In [59]:
#dropping all the columns except for body
text = dna['body'].fillna("Nothing")

In [60]:
#Gathering the word count for each row
word_count_all = []

for words in text:
   word_count_all.append(int(len(words.split())))

In [61]:
validity_body_all = []

#creates a loop where any text with less than 100 words or more than 10,000 words is considered an invalid 
#data point. The reason for these numbers are that anything less than 100 words does not fit our definition of
#an article and anything longer than 10,000 words is too long for us to check?
for number in word_count_all:
    if number < 100:
        validity_body_all.append(0)
        
    elif number > 10000:
        validity_body_all.append(0)
        
    else:
        validity_body_all.append(1)

In [62]:
#Validity percentage
all_total_valid = sum(validity_body_all)

print('{}%, or'.format(round(all_total_valid/len(validity_body_all)*100)), all_total_valid, 'of the data in body are valid')

78%, or 1520983 of the data in body are valid


In [36]:
profile.iloc[5] = .78286

In [37]:
profile.iloc[6] = 1.00

In [51]:
profile

Unnamed: 0,Validity
company_codes,0.868595
company_codes_occur,0.89165
company_codes_about,0.994087
company_codes_lineage,0.993654
company_codes_relevance,0.860634
body,0.78286
publication_datetime,1.0


In [65]:
#create profiling dataframe
prof = pd.DataFrame({"Completeness": np.zeros(len(dna.columns)).astype(int), "Uniqueness": np.zeros(len(dna.columns)).astype(int),"Duplicates": np.zeros(len(dna.columns)).astype(int)}).set_index(dna.columns)

In [66]:
prof

Unnamed: 0,Completeness,Uniqueness,Duplicates
copyright,0,0,0
subject_codes,0,0,0
art,0,0,0
modification_datetime,0,0,0
body,0,0,0
company_codes_occur,0,0,0
company_codes_about,0,0,0
company_codes_lineage,0,0,0
snippet,0,0,0
publication_date,0,0,0


In [67]:
def findCompleteness(col):
    return sum(~col.isnull()) / len(col)

#NaN not counted as unique
def isUnique(col):
    return (len(col.unique()) - sum(col.isnull().unique())) / (len(col) - sum(col.isnull()))

def checkDuplicates(col):
    return sum(col.duplicated()) / len(col)

In [69]:
#Applying completeness to a df
prof['Completeness'] = dna.apply(findCompleteness)

#Applying the unique function
prof['Uniqueness'] = dna.apply(isUnique)

#Applying the duplicate function
prof['Duplicates'] = dna.apply(checkDuplicates)

KeyboardInterrupt: 