# Exploratory Data Analysis

## Import relevant libraries

In [2]:
import os
import pandas as pd

## Load data

In [5]:
# Locations of the three input CSV files, all stored under ../datasets.
base_path = os.path.join('..', 'datasets')
company_data_path, financial_long_path, financial_short_path = (
    os.path.join(base_path, csv_name)
    for csv_name in (
        'companies.csv',
        'financials__long_term.csv',
        'financials__short_term.csv',
    )
)

In [6]:
# Read the three CSV files into DataFrames with pandas' default settings.
# NOTE(review): the previews further down show an 'Unnamed: 0' column, which
# suggests the files were saved with their index -- consider index_col=0.
company_data, financial_long, financial_short = map(
    pd.read_csv,
    (company_data_path, financial_long_path, financial_short_path),
)

## Explore data

There are 182 companies, located in `france`, `germany`, and `united kingdom`.

In [27]:
# Preview the first five rows of the company table.
company_data.head(n=5)

Unnamed: 0,country,name
0,france,accorhotels
1,france,adp aerports de paris
2,france,air france
3,france,air liquide
4,france,axa group


`financial_long` contains monthly data from Jan 2014 to Dec 2018.  
It is a balanced panel: 181 companies × 60 monthly rows each = 10,860 rows.

In [49]:
# Preview the first five rows of the long-term (monthly) financial data.
financial_long.head(n=5)

Unnamed: 0,company_name,country,assets,price,sector,operating,debt_to_assets,age,date
0,adidas,germany,12417.0,82.91,consumer discretionary,6.08,15.08,64,2014-01-01T00:00:00.000Z
1,adidas,germany,12417.0,84.2,consumer discretionary,6.08,15.08,64,2014-02-01T00:00:00.000Z
2,adidas,germany,12417.0,78.49,consumer discretionary,6.08,15.08,64,2014-03-01T00:00:00.000Z
3,adidas,germany,12417.0,76.73,consumer discretionary,6.08,15.08,64,2014-04-01T00:00:00.000Z
4,adidas,germany,12417.0,78.71,consumer discretionary,6.08,15.08,64,2014-05-01T00:00:00.000Z


In [119]:
# Number of distinct dates in the long-term data; dropna=False keeps a missing
# value counted as its own entry, exactly as len(unique()) would.
financial_long['date'].nunique(dropna=False)

60

In [118]:
# Number of distinct companies in the long-term data; dropna=False keeps a
# missing value counted as its own entry, exactly as len(unique()) would.
financial_long['company_name'].nunique(dropna=False)

181

Data in `financial_short` are daily data from 4 Jan 2016 to 30 Dec 2016.  
This is an unbalanced panel:  
There are companies with 253, 255, 256, or 257 rows:  
companies with 253 rows are missing the dates {'2016-05-02', '2016-05-30', '2016-08-29', '2016-12-27'}  
companies with 255 rows are missing the dates {'2016-05-16', '2016-10-03'}  
two companies with 256 rows are each missing one date: {'maurel et prom': '2016-12-30', 'mersen': '2016-08-01'}

In [96]:
# Preview the first five rows of the short-term (daily) financial data.
financial_short.head(n=5)

Unnamed: 0,company_name,country,assets,date,price,sector,operating,debt_to_assets,age
0,adidas,germany,15176.0,2016-01-04T00:00:00.000Z,87.73,consumer discretionary,7.73,10.66,66
1,adidas,germany,15176.0,2016-01-05T00:00:00.000Z,87.212,consumer discretionary,7.73,10.66,66
2,adidas,germany,15176.0,2016-01-06T00:00:00.000Z,86.07,consumer discretionary,7.73,10.66,66
3,adidas,germany,15176.0,2016-01-07T00:00:00.000Z,84.1,consumer discretionary,7.73,10.66,66
4,adidas,germany,15176.0,2016-01-08T00:00:00.000Z,82.51,consumer discretionary,7.73,10.66,66


In [97]:
# Map each company in `financial_short` to the number of rows it has.
# One grouped pass replaces re-filtering the whole frame once per company.
# sort=False keeps companies in order of first appearance, i.e. the same key
# order that iterating over `company_name.unique()` produced.
# Rows with a missing company_name are skipped by groupby, so no NaN key
# (with a count of 0) is created.
company_dict = {
    company: int(row_count)  # plain Python int, as len() returned before
    for company, row_count in (
        financial_short.groupby('company_name', sort=False).size().items()
    )
}

In [117]:
# Distinct per-company row counts, in order of first appearance.
pd.unique(pd.Series(list(company_dict.values())))

array([257, 255, 253, 256])

In [126]:
# Display every per-company row count held in company_dict (dict insertion order).
company_dict.values()

dict_values([257, 257, 255, 255, 255, 255, 255, 255, 255, 257, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 257, 255, 255, 257, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 256, 256, 257, 257, 257, 257, 257])

Companies that exist in `financial_long` but not in `financial_short`

In [15]:
# Company names present in the long-term data but absent from the short-term data.
set(financial_long['company_name'].unique()).difference(
    financial_short['company_name'].unique()
)

{'axa group',
 'baywa munich',
 'beiersdorf',
 'euromoney insttutional investors plc',
 'faurecia',
 'king fisher',
 'munich re',
 'rekitt benckise',
 'rightmove',
 'schneider electric',
 'sse plc',
 'victrex group'}