In [1]:
import numpy as np
import pandas as pd

# Load the source spreadsheet; later cells read its 'student' and 'cvr' columns.
df = pd.read_excel('data/SOFT-3.xlsx')

In [2]:
#Install necessary dependencies
#!python -m pip install -U faker

In [3]:
from faker import Faker

faker = Faker()

# Test fake data generation. The sample name has to be printed explicitly:
# a bare expression in the middle of a cell is evaluated but never displayed.
print("The Faker library can generate fake names. By running 'faker.name()', we get:")
print(faker.name())

# Seed the generator so the pseudonyms below are reproducible between runs.
Faker.seed(4321)

# Replace every distinct student name with a fake one. `faker.unique` makes sure
# two different students are never given the same pseudonym, which would
# silently merge them in any later per-student analysis.
dict_names = {name: faker.unique.name() for name in df['student'].unique()}
df['student'] = df['student'].map(dict_names)
df.head(20)

The Faker library can generate fake names. By running 'faker.name()', we get:


Unnamed: 0,student,company,cvr
0,Jason Brown,Dynatest A/S,40916458
1,Jacob Stein,"Eriksholm Research Centre, Oticon",42334219
2,Cody Brown,Formpipe,29177015
3,Larry Morales,Formpipe,29177015
4,Jessica Hendricks,Formpipe,29177015
5,Brian Moore,Novo Nordisk,24256790
6,Scott Baker,PFA,13594376
7,Ruth Hoffman,Topdanmark,78040017
8,Daniel George,KMD A/S,26911745
9,David Moody,KMD A/S,26911745


In [4]:
#Function that queries the CVR API using a company's CVR number.

import urllib.request as request
import json 
import contextlib

def cvrapi(cvr, timeout=10):
  """Query cvrapi.dk for one company and return the decoded JSON response.

  Args:
    cvr: CVR/VAT number to look up (int or str). It is URL-encoded before
      being placed in the query string.
    timeout: Seconds to wait on the connection before giving up, so that an
      unresponsive server cannot hang the notebook indefinitely.

  Returns:
    The object produced by ``json.loads`` from the response body.

  Raises:
    urllib.error.URLError: If the request fails (``HTTPError`` is a subclass).
    ValueError: If the response body is not valid JSON.
  """
  from urllib.parse import urlencode

  # Build the query string with proper escaping instead of raw % formatting.
  query = urlencode({'country': 'dk', 'vat': cvr})
  request_a = request.Request(
    url='https://cvrapi.dk/api?' + query,
    headers={
      'User-Agent': 'ds-assignment'})
  # The response object is itself a context manager, so it is closed on exit.
  with request.urlopen(request_a, timeout=timeout) as response:
    return json.loads(response.read())

In [5]:
#Collect the CVR number of every row in a plain Python list
companies_cvr = df['cvr'].tolist()

#Collapse the list to its distinct values (a set drops the repeats)
unique_cvr = list(set(companies_cvr))

print(unique_cvr)

[26911745, 13594376, 40916458, 42334219, 78040017, 10121361, 14814833, 24256790, 29177015, 27506488, 61966617, 40075291]


In [6]:
#Call the API once per CVR number and cache the responses in a JSON file

import json
import os

COMPANIES_JSON = "data/companies.json"

companies_list = []

# Only contact the API when the cache file is missing. This keeps
# "Restart & Run All" working on a fresh checkout without re-querying the
# service on every run. Delete the file to force a refresh.
if not os.path.exists(COMPANIES_JSON):
    for c in unique_cvr:
        companies_list.append(cvrapi(c))

    with open(COMPANIES_JSON, "w") as file:
        json.dump(companies_list, file)

In [7]:
#Load the company records stored in the JSON file into a pandas DataFrame
json_df = pd.read_json('data/companies.json')

#Preview the first rows
json_df.head(11)

Unnamed: 0,vat,name,address,zipcode,city,cityname,protected,phone,email,fax,...,industrydesc,companycode,companydesc,creditstartdate,creditbankrupt,creditstatus,owners,productionunits,t,version
0,26911745,KMD A/S,Lautrupparken 40,2750,Ballerup,,False,44601000.0,info@kmd.dk,44604106.0,...,Konsulentbistand vedrørende informationsteknologi,60,Aktieselskab,,False,,,"[{'pno': 1017630322, 'main': False, 'name': 'K...",100,6
1,13594376,"PFA PENSION, FORSIKRINGSAKTIESELSKAB.",Sundkrogsgade 4,2100,København Ø,,False,39175000.0,,,...,Livsforsikring,60,Aktieselskab,,False,,,"[{'pno': 1000591581, 'main': False, 'name': 'P...",100,6
2,40916458,Dynatest A/S,Tempovej 27,2750,Ballerup,,False,70253355.0,,,...,"Fremstilling af udstyr til måling, afprøvning,...",60,Aktieselskab,,False,,,"[{'pno': 1025210030, 'main': True, 'name': 'Dy...",100,6
3,42334219,OTICON A/S,Kongebakken 9,2765,Smørum,,False,39177100.0,,39277900.0,...,Fremstilling af høreapparater og dele hertil,60,Aktieselskab,,False,,,"[{'pno': 1006552453, 'main': False, 'name': 'O...",100,6
4,78040017,TOPDANMARK A/S,Borupvang 4,2750,Ballerup,,False,70158585.0,,,...,Finansielle holdingselskaber,60,Aktieselskab,,False,,[{'name': 'Sampo plc'}],"[{'pno': 1015253246, 'main': False, 'name': 'T...",100,6
5,10121361,NORTHTECH ApS,"Wildersgade 51, 1",1408,København K,,True,,kontakt@northtech.dk,,...,Computerprogrammering,80,Anpartsselskab,,False,,,"[{'pno': 1009658463, 'main': True, 'name': 'NO...",100,6
6,14814833,Netcompany A/S,"Grønningen 17, 1",1270,København K,,True,70131440.0,info@netcompany.com,,...,Konsulentbistand vedrørende informationsteknologi,60,Aktieselskab,,False,,,"[{'pno': 1017633488, 'main': False, 'name': 'N...",100,6
7,24256790,NOVO NORDISK A/S,Novo Alle 1,2880,Bagsværd,,False,44448888.0,,,...,Fremstilling af farmaceutiske præparater,60,Aktieselskab,,False,,,"[{'pno': 1017661031, 'main': False, 'name': 'N...",100,6
8,29177015,FORMPIPE SOFTWARE A/S,Lautrupvang 1,2750,Ballerup,,False,33256555.0,,,...,Computerprogrammering,60,Aktieselskab,,False,,[{'name': 'Formpipe Software AB'}],"[{'pno': 1014492123, 'main': False, 'name': 'F...",100,6
9,27506488,ALPHA SOLUTIONS A/S,"Gothersgade 14, 4",1123,København K,,True,70206538.0,,,...,Konsulentbistand vedrørende informationsteknologi,60,Aktieselskab,,False,,,"[{'pno': 1010322290, 'main': True, 'name': 'AL...",100,6


In [8]:
#Install geopy
#!python -m pip install -U geopy

In [9]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="geoapiExercises")
# Throttle the lookups: the public Nominatim service expects at most one
# request per second.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

company_cities = json_df['city'].tolist()
company_addresses = json_df['address'].tolist()

# Keep only the part before the first comma (e.g. "Wildersgade 51, 1" -> "Wildersgade 51")
addresses_stripped = [address.split(',')[0] for address in company_addresses]

# Build "<street>, <city>" query strings
addresses = [
    street + ", " + city
    for street, city in zip(addresses_stripped, company_cities)
]

latitudes = []
longitudes = []
for address in addresses:
    location = geocode(address)
    if location is None:
        # geocode() returns None when nothing matches. Record NaN so the lists
        # stay aligned with the rows of json_df instead of crashing.
        print("Could not geocode %r; storing NaN coordinates" % address)
        latitudes.append(float("nan"))
        longitudes.append(float("nan"))
        continue
    latitudes.append(location.latitude)
    longitudes.append(location.longitude)

print(latitudes)
print(longitudes)

[55.73613, 55.708655, 55.723369, 55.749182, 55.734142, 55.674321, 55.689618, 55.754555, 55.733204, 55.682038, 57.4455297, 55.653577]
[12.393105, 12.591288, 12.376387, 12.294416, 12.380579, 12.591612, 12.590014, 12.455185, 12.391164, 12.583964, 10.4937435, 12.544796]


In [10]:
#Add latitude and longitude to the Dataframe 'json_df'
json_df = json_df.assign(latitude=latitudes, longitude=longitudes)

json_df.to_csv('data/PS1.csv', index=False) # set index=False to exclude index column in the saved file

# Preview goes last: only the final expression of a cell is rendered.
json_df.head(10)

In [11]:
#Clean dataframe: keep only the identification, address and coordinate columns

kept_columns = ['vat', 'name', 'city', 'address', 'zipcode', 'latitude', 'longitude']
clean_df = json_df[kept_columns]

clean_df.head(20)

Unnamed: 0,vat,name,city,address,zipcode,latitude,longitude
0,26911745,KMD A/S,Ballerup,Lautrupparken 40,2750,55.73613,12.393105
1,13594376,"PFA PENSION, FORSIKRINGSAKTIESELSKAB.",København Ø,Sundkrogsgade 4,2100,55.708655,12.591288
2,40916458,Dynatest A/S,Ballerup,Tempovej 27,2750,55.723369,12.376387
3,42334219,OTICON A/S,Smørum,Kongebakken 9,2765,55.749182,12.294416
4,78040017,TOPDANMARK A/S,Ballerup,Borupvang 4,2750,55.734142,12.380579
5,10121361,NORTHTECH ApS,København K,"Wildersgade 51, 1",1408,55.674321,12.591612
6,14814833,Netcompany A/S,København K,"Grønningen 17, 1",1270,55.689618,12.590014
7,24256790,NOVO NORDISK A/S,Bagsværd,Novo Alle 1,2880,55.754555,12.455185
8,29177015,FORMPIPE SOFTWARE A/S,Ballerup,Lautrupvang 1,2750,55.733204,12.391164
9,27506488,ALPHA SOLUTIONS A/S,København K,"Gothersgade 14, 4",1123,55.682038,12.583964


In [12]:
#Install folium
#!python -m pip install -U folium

In [13]:
import folium

# Create the base map. The centre point lies in the Copenhagen area, close to
# most of the company coordinates in clean_df.
denmark_map = folium.Map(location=[55.67, 12.57], zoom_start=7)

#Add one marker per company where students have been in internship.
#Rows without coordinates are skipped because folium rejects NaN locations.
located = clean_df.dropna(subset=["latitude", "longitude"])
for lat, lon, company in zip(located["latitude"], located["longitude"], located["name"]):
    folium.Marker([lat, lon], popup=company).add_to(denmark_map)

# Show the map
denmark_map

In [14]:
#Save the cleaned dataframe to a CSV file (index=False leaves out the index column)
clean_df.to_csv("data/cleaned.csv", index=False)

In [15]:
import getpass
import os
import smtplib
import ssl
from email.message import EmailMessage

email_sender = 'cphfa116@gmail.com'
# Never store the password in the notebook. Read it from the EMAIL_PASSWORD
# environment variable, or prompt for it when the variable is not set.
# NOTE(review): the recorded 535 "BadCredentials" error suggests the account
# password is not accepted for SMTP; Gmail probably needs an app password. Confirm.
email_password = os.environ.get('EMAIL_PASSWORD') or getpass.getpass(
    'Password for %s: ' % email_sender)
email_receiver = 'tdi@cphbusiness.dk'

subject = 'Data Science Assignment - Automated Email Frederik'
body = """
This is an automated email sent from my python application in the data science course
"""

# Assemble the message headers and the plain-text body
em = EmailMessage()
em['From'] = email_sender
em['To'] = email_receiver
em['Subject'] = subject
em.set_content(body)

# Default context enables certificate verification for the TLS connection
context = ssl.create_default_context()

with smtplib.SMTP_SSL('smtp.gmail.com', 465, context=context) as smtp:
    smtp.login(email_sender, email_password)
    # send_message takes sender and recipients from the message headers
    smtp.send_message(em)

SMTPAuthenticationError: (535, b'5.7.8 Username and Password not accepted. Learn more at\n5.7.8  https://support.google.com/mail/?p=BadCredentials p3-20020a19f003000000b004cb015794a8sm238300lfc.109 - gsmtp')