# Exploratory Data Analysis

## Import relevant libraries

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

## Load data

In [3]:
# Relative folder (one level above the current working directory) holding the CSV inputs.
base_path = os.path.join('..', 'datasets')

# Build the three file paths in one pass; the generator's loop variable is
# local to the expression, so no extra name is left in the notebook namespace.
company_data_path, financial_long_path, financial_short_path = (
    os.path.join(base_path, csv_name)
    for csv_name in (
        'companies.csv',
        'financials__long_term.csv',
        'financials__short_term.csv',
    )
)

In [4]:
# Read each CSV into its own DataFrame using pd.read_csv's default settings
# (no parse_dates, so the date columns are loaded as plain strings).
company_data, financial_long, financial_short = (
    pd.read_csv(csv_path)
    for csv_path in (company_data_path, financial_long_path, financial_short_path)
)

## Explore data

There are 182 companies, located in `france`, `germany` and `united kingdom`.

In [5]:
# Preview the first five rows of the company table.
company_data.head(n=5)

Unnamed: 0,country,name
0,france,accorhotels
1,france,adp aerports de paris
2,france,air france
3,france,air liquide
4,france,axa group


Data in `financial_long` are monthly data from Jan 2014 to Dec 2018  
Balanced panel data: 60 rows each, 181 companies ---- 60*181 = 10,860 rows

In [6]:
# Preview the first five rows of the long-term (monthly) financial table.
financial_long.head(n=5)

Unnamed: 0,company_name,country,assets,price,sector,operating,debt_to_assets,age,date
0,adidas,germany,12417.0,82.91,consumer discretionary,6.08,15.08,64,2014-01-01T00:00:00.000Z
1,adidas,germany,12417.0,84.2,consumer discretionary,6.08,15.08,64,2014-02-01T00:00:00.000Z
2,adidas,germany,12417.0,78.49,consumer discretionary,6.08,15.08,64,2014-03-01T00:00:00.000Z
3,adidas,germany,12417.0,76.73,consumer discretionary,6.08,15.08,64,2014-04-01T00:00:00.000Z
4,adidas,germany,12417.0,78.71,consumer discretionary,6.08,15.08,64,2014-05-01T00:00:00.000Z


In [7]:
# Number of distinct values in the date column; dropna=False counts a missing
# value (if any) as one distinct value, exactly as len(.unique()) would.
financial_long['date'].nunique(dropna=False)

60

In [8]:
# Number of distinct company names; dropna=False counts a missing value
# (if any) as one distinct value, exactly as len(.unique()) would.
financial_long['company_name'].nunique(dropna=False)

181

Data in `financial_short` are daily data from 4 Jan 2016 to 30 Dec 2016  
This is an unbalanced panel, probably because public holidays differ across countries:  
Companies have 253, 255, 256 or 257 rows:  
companies with 253 rows miss dates = {'2016-05-02', '2016-05-30', '2016-08-29', '2016-12-27'} **ALL UK COMPANIES**  
companies with 255 rows miss dates = {'2016-05-16', '2016-10-03'} **ALL GERMAN COMPANIES**  
two companies with 256 rows: {'maurel et prom': '2016-12-30', 'mersen': '2016-08-01'} **BOTH FRENCH COMPANIES**  
companies with 257 rows **GERMAN & FRENCH COMPANIES**

In [9]:
# Preview the first five rows of the short-term (daily) financial table.
financial_short.head(n=5)

Unnamed: 0,company_name,country,assets,date,price,sector,operating,debt_to_assets,age
0,adidas,germany,15176.0,2016-01-04T00:00:00.000Z,87.73,consumer discretionary,7.73,10.66,66
1,adidas,germany,15176.0,2016-01-05T00:00:00.000Z,87.212,consumer discretionary,7.73,10.66,66
2,adidas,germany,15176.0,2016-01-06T00:00:00.000Z,86.07,consumer discretionary,7.73,10.66,66
3,adidas,germany,15176.0,2016-01-07T00:00:00.000Z,84.1,consumer discretionary,7.73,10.66,66
4,adidas,germany,15176.0,2016-01-08T00:00:00.000Z,82.51,consumer discretionary,7.73,10.66,66


In [10]:
# Number of rows per company in financial_short, keyed by company name.
# A single grouped pass counts every company at once instead of re-scanning
# the whole frame with a boolean mask for each company.
# sort=False keeps the companies in order of first appearance, i.e. the same
# key order that iterating over .unique() yields.
# NOTE: rows whose company_name is missing are not counted (groupby drops
# NaN keys by default).
company_dict = (
    financial_short
    .groupby('company_name', sort=False)
    .size()
    .to_dict()
)

In [11]:
# Distinct per-company row counts, in order of first occurrence.
pd.Series(company_dict).unique()

array([257, 255, 253, 256])

In [12]:
# Display the complete company-name -> row-count mapping.
company_dict

{'adidas': 257,
 'allianz': 257,
 'basf': 255,
 'bmw': 255,
 'continental': 255,
 'daimler': 255,
 'deutsche borse': 255,
 'lufthansa': 255,
 'deutsche post': 255,
 'eon': 257,
 'heidelberg cement': 255,
 'henkel': 255,
 'rwe': 255,
 'siemens': 255,
 'thyssenkrupp': 255,
 'volkswagen ag': 255,
 'vonovia': 255,
 'elringklinger': 255,
 'leoni ag': 255,
 'cropenergies': 255,
 'evonik industries': 255,
 'fuchs petrolub': 255,
 'lanxess': 255,
 'wacker chemie': 255,
 'alba se': 257,
 'brenntag': 255,
 'cewe stiftung': 255,
 'indus holding': 257,
 'kion group': 255,
 'mtu': 255,
 'osram': 255,
 'wacker neuson': 255,
 'fielmann': 255,
 'hornbach holding': 255,
 'hugo boss': 255,
 'puma': 255,
 'takkt': 255,
 'aareal bank': 255,
 'fraport ag': 255,
 'hamburger hafrn logistik': 255,
 'deutsche wohnen': 255,
 'dic assets': 255,
 'commerzbank': 255,
 'hannover rueck': 255,
 'suedzucker ag': 255,
 'symrise ag': 255,
 'aurubis': 255,
 'talanx': 255,
 'jkx oil and gas': 253,
 'ophir energy': 253,
 '

In [13]:
# Names of the companies whose row count in company_dict equals 253.
comp_with_253 = [name for name, n_rows in company_dict.items() if n_rows == 253]

# Show every financial_short row that belongs to one of those companies.
financial_short.loc[financial_short['company_name'].isin(comp_with_253)]

Unnamed: 0,company_name,country,assets,date,price,sector,operating,debt_to_assets,age
12250,jkx oil and gas,united kingdom,247.0,2016-01-04T00:00:00.000Z,0.37021,energy and materials,-46.48,6.79,23
12251,jkx oil and gas,united kingdom,247.0,2016-01-05T00:00:00.000Z,0.36498,energy and materials,-46.48,6.79,23
12252,jkx oil and gas,united kingdom,247.0,2016-01-06T00:00:00.000Z,0.36026,energy and materials,-46.48,6.79,23
12253,jkx oil and gas,united kingdom,247.0,2016-01-07T00:00:00.000Z,0.35588,energy and materials,-46.48,6.79,23
12254,jkx oil and gas,united kingdom,247.0,2016-01-08T00:00:00.000Z,0.35335,energy and materials,-46.48,6.79,23
...,...,...,...,...,...,...,...,...,...
32991,rpc group,united kingdom,3535.7,2016-12-22T00:00:00.000Z,11.44587,energy and materials,5.84,32.38,25
32992,rpc group,united kingdom,3535.7,2016-12-23T00:00:00.000Z,11.35659,energy and materials,5.84,32.38,25
32993,rpc group,united kingdom,3535.7,2016-12-28T00:00:00.000Z,11.79128,energy and materials,5.84,32.38,25
32994,rpc group,united kingdom,3535.7,2016-12-29T00:00:00.000Z,11.60407,energy and materials,5.84,32.38,25


In [14]:
# Names of the companies whose row count in company_dict equals 255.
comp_with_255 = [name for name, n_rows in company_dict.items() if n_rows == 255]

# Show every financial_short row that belongs to one of those companies.
financial_short.loc[financial_short['company_name'].isin(comp_with_255)]

Unnamed: 0,company_name,country,assets,date,price,sector,operating,debt_to_assets,age
514,basf,germany,76496.0,2016-01-04T00:00:00.000Z,67.580,energy and materials,11.17,21.32,151
515,basf,germany,76496.0,2016-01-05T00:00:00.000Z,67.200,energy and materials,11.17,21.32,151
516,basf,germany,76496.0,2016-01-06T00:00:00.000Z,66.080,energy and materials,11.17,21.32,151
517,basf,germany,76496.0,2016-01-07T00:00:00.000Z,64.470,energy and materials,11.17,21.32,151
518,basf,germany,76496.0,2016-01-08T00:00:00.000Z,63.680,energy and materials,11.17,21.32,151
...,...,...,...,...,...,...,...,...,...
12245,talanx,germany,156626.0,2016-12-23T00:00:00.000Z,31.965,financials,7.35,2.23,20
12246,talanx,germany,156626.0,2016-12-27T00:00:00.000Z,32.050,financials,7.35,2.23,20
12247,talanx,germany,156626.0,2016-12-28T00:00:00.000Z,31.910,financials,7.35,2.23,20
12248,talanx,germany,156626.0,2016-12-29T00:00:00.000Z,31.900,financials,7.35,2.23,20


In [15]:
# Names of the companies whose row count in company_dict equals 256.
comp_with_256 = [name for name, n_rows in company_dict.items() if n_rows == 256]

# Show every financial_short row that belongs to one of those companies.
financial_short.loc[financial_short['company_name'].isin(comp_with_256)]

Unnamed: 0,company_name,country,assets,date,price,sector,operating,debt_to_assets,age
41477,maurel et prom,france,2396.4,2016-01-04T00:00:00.000Z,2.95,energy and materials,5.27,30.50,175
41478,maurel et prom,france,2396.4,2016-01-05T00:00:00.000Z,2.88,energy and materials,5.27,30.50,175
41479,maurel et prom,france,2396.4,2016-01-06T00:00:00.000Z,2.76,energy and materials,5.27,30.50,175
41480,maurel et prom,france,2396.4,2016-01-07T00:00:00.000Z,2.63,energy and materials,5.27,30.50,175
41481,maurel et prom,france,2396.4,2016-01-08T00:00:00.000Z,2.51,energy and materials,5.27,30.50,175
...,...,...,...,...,...,...,...,...,...
41984,mersen,france,1001.2,2016-12-22T00:00:00.000Z,19.42,industrials,4.06,23.77,79
41985,mersen,france,1001.2,2016-12-23T00:00:00.000Z,19.74,industrials,4.06,23.77,79
41986,mersen,france,1001.2,2016-12-27T00:00:00.000Z,20.20,industrials,4.06,23.77,79
41987,mersen,france,1001.2,2016-12-28T00:00:00.000Z,20.30,industrials,4.06,23.77,79


In [16]:
# Names of the companies whose row count in company_dict equals 257.
comp_with_257 = [name for name, n_rows in company_dict.items() if n_rows == 257]

# For each of those companies, list the distinct country value(s) in its rows.
(
    financial_short
    .loc[financial_short['company_name'].isin(comp_with_257)]
    .groupby('company_name')['country']
    .agg('unique')
)

company_name
accorhotels               [france]
adidas                   [germany]
adp aerports de paris     [france]
air france                [france]
air liquide               [france]
alba se                  [germany]
allianz                  [germany]
bnp paribas               [france]
bonduelle                 [france]
bouygues                  [france]
bureau veritas            [france]
carrefour                 [france]
danone                    [france]
edf                       [france]
engie                     [france]
eon                      [germany]
gecina                    [france]
groupe casino             [france]
groupe eurotunnel         [france]
hermes international      [france]
indus holding            [germany]
ipsos                     [france]
kering                    [france]
klepierre                 [france]
l'oreal                   [france]
legrand                   [france]
lvmh                      [france]
michelin                  [france]
nexans 

Companies that exist in `financial_long` but not in `financial_short`  

In [17]:
# Company names present in financial_long that never occur in financial_short.
set(financial_long['company_name'].unique()).difference(
    financial_short['company_name'].unique()
)

{'axa group',
 'baywa munich',
 'beiersdorf',
 'euromoney insttutional investors plc',
 'faurecia',
 'king fisher',
 'munich re',
 'rekitt benckise',
 'rightmove',
 'schneider electric',
 'sse plc',
 'victrex group'}

Companies that exist in `financial_short` but not in `financial_long`

In [18]:
# Company names present in financial_short that never occur in financial_long.
set(financial_short['company_name'].unique()).difference(
    financial_long['company_name'].unique()
)

{'leoni ag'}

In [19]:
# Inspect the dtype of every column in financial_long. The frame was loaded
# with pd.read_csv without parse_dates, so the date column holds plain strings.
financial_long.dtypes

company_name       object
country            object
assets            float64
price             float64
sector             object
operating         float64
debt_to_assets    float64
age                 int64
date               object
dtype: object

## Reformat the date column as YYYY-MM-DD and extract year, month and day

In [32]:
# Parse the raw date strings once; every derived column below is formatted
# from this parsed series, so the assignments are independent of each other.
date = pd.to_datetime(financial_long['date'])

# Calendar components, each stored as a formatted string
# (four-digit year, zero-padded month and day).
financial_long.loc[:, 'year'] = date.dt.strftime('%Y')
financial_long.loc[:, 'month'] = date.dt.strftime('%m')
financial_long.loc[:, 'day'] = date.dt.strftime('%d')

# Overwrite the original date column with its Year-Month-Day text form.
financial_long.loc[:, 'date'] = date.dt.strftime('%Y-%m-%d')

In [34]:
# Parse the raw date strings once; every derived column below is formatted
# from this parsed series, so the assignments are independent of each other.
date = pd.to_datetime(financial_short['date'])

# Calendar components, each stored as a formatted string
# (four-digit year, zero-padded month and day).
financial_short.loc[:, 'year'] = date.dt.strftime('%Y')
financial_short.loc[:, 'month'] = date.dt.strftime('%m')
financial_short.loc[:, 'day'] = date.dt.strftime('%d')

# Overwrite the original date column with its Year-Month-Day text form.
financial_short.loc[:, 'date'] = date.dt.strftime('%Y-%m-%d')


## Rearrange columns

The two tables list their columns in different orders, so we reorder the columns of `financial_short` to match `financial_long`.

In [22]:
# Select financial_short's columns in financial_long's column order
# (raises KeyError if financial_short lacks any of those columns).
financial_short = financial_short.loc[:, financial_long.columns]

In [35]:
# Preview the first five rows of financial_short after the date and column changes.
financial_short.head(n=5)

Unnamed: 0,company_name,country,assets,price,sector,operating,debt_to_assets,age,date,year,month,day
0,adidas,germany,15176.0,87.73,consumer discretionary,7.73,10.66,66,2016-01-04,2016,1,4
1,adidas,germany,15176.0,87.212,consumer discretionary,7.73,10.66,66,2016-01-05,2016,1,5
2,adidas,germany,15176.0,86.07,consumer discretionary,7.73,10.66,66,2016-01-06,2016,1,6
3,adidas,germany,15176.0,84.1,consumer discretionary,7.73,10.66,66,2016-01-07,2016,1,7
4,adidas,germany,15176.0,82.51,consumer discretionary,7.73,10.66,66,2016-01-08,2016,1,8


In [36]:
# Preview the first five rows of financial_long after the date changes.
financial_long.head(n=5)

Unnamed: 0,company_name,country,assets,price,sector,operating,debt_to_assets,age,date,year,month,day
0,adidas,germany,12417.0,82.91,consumer discretionary,6.08,15.08,64,2014-01-01,2014,1,1
1,adidas,germany,12417.0,84.2,consumer discretionary,6.08,15.08,64,2014-02-01,2014,2,1
2,adidas,germany,12417.0,78.49,consumer discretionary,6.08,15.08,64,2014-03-01,2014,3,1
3,adidas,germany,12417.0,76.73,consumer discretionary,6.08,15.08,64,2014-04-01,2014,4,1
4,adidas,germany,12417.0,78.71,consumer discretionary,6.08,15.08,64,2014-05-01,2014,5,1
