# 0 - Modules

In [None]:
import pandas as pd
import numpy as np

from mypackage import data_processor
from mypackage import ploter as plt

# 1 - Load raw Data
* companies
* industries
* markets
* balance_sheet
* income

In [None]:
# Unpack the five raw tables returned by the project loader; the order of the
# names on the left must match the order the loader returns them in.
companies,industries,markets,balance_sheet,income = data_processor.load_raw_data()

In [None]:
# Display the .shape of every raw table as a quick sanity check after loading.
industries.shape, markets.shape, balance_sheet.shape, income.shape, companies.shape

# 2 - Join raw data

In [None]:
# Combine the five raw tables into a single frame through the project helper.
data = data_processor.join_raw_data(
    industries, markets, balance_sheet, income, companies, verbose=True
)

# Identifier, label and date columns; every other column is treated as numerical.
categorical = [
    'Company Name', 'Currency', 'Fiscal Year', 'Industry', 'IndustryId',
    'Market Name', 'MarketId', 'Fiscal Period', 'Publish Date', 'Report Date',
    'Restated Date', 'Sector', 'SimFinId', 'Ticker', 'key',
]

# Series of the remaining column names (the positional index of each name is kept).
numerical = pd.Series(data.columns).loc[lambda names: ~names.isin(categorical)]

# 3 - Data Cleansing

## Check Balance Sheet Consistency

In [None]:
# Project helper; presumably reports rows where 'Total Assets' differs from
# 'Total Liabilities & Equity' (the filter applied further down) — confirm in data_processor.
data_processor.check_unbalanced(data)

In [None]:
# Project helper; presumably inspects the 'Revenue' column (rows with
# Revenue <= 0 are dropped further down) — confirm in data_processor.
data_processor.check_revenue(data)

In [None]:
# Keep only rows whose balance sheet balances exactly, then only those with
# strictly positive revenue (rows with missing values fail both comparisons).
data = (
    data
    .loc[lambda df: df['Total Assets'] == df['Total Liabilities & Equity']]
    .loc[lambda df: df['Revenue'] > 0]
)

## Check NaN columns

In [None]:
# Project helper; judging by its name and the heading above it visualises
# missing values per column — confirm in data_processor.
data_processor.plot_NA(data)

## Check NaN columns after dropping some columns

In [None]:
# Columns excluded from the analysis from here on
# (presumably the sparsest ones in the previous plot — confirm).
columns_to_drop = [
    'Research & Development',
    'Depreciation & Amortization',
    'Net Extraordinary Gains (Losses)',
    'Long Term Investments & Receivables',
    'Treasury Stock',
    'Abnormal Gains (Losses)',
    'Inventories',
]

# Keep every column that is not listed above, then redraw the missing-value plot.
data = data.loc[:, ~data.columns.isin(columns_to_drop)]
data_processor.plot_NA(data)


## Check data availability per year
> I focused on Short Term Debt and Long Term Debt, the columns with the least available data.

In [None]:
# Per-year availability of 'Short Term Debt' via the project helper;
# col='Company Name' is forwarded as-is (its exact role is defined in data_processor).
data_processor.check_availability_year(data,values='Short Term Debt',col='Company Name')

In [None]:
# Per-year availability of 'Long Term Debt' via the project helper;
# col='Company Name' is forwarded as-is (its exact role is defined in data_processor).
data_processor.check_availability_year(data,values='Long Term Debt',col='Company Name')

## Check data availability per Company

In [None]:
# Per-company availability of 'Short Term Debt' via the project helper.
data_processor.check_availability_company(data,values='Short Term Debt',col='Company Name')

## Create a consistent Data Set

In [None]:
# First fiscal year handed to get_consistent_data.
initial_year = 2010

# NOTE(review): the data is re-joined from the raw tables here, so the row
# filters and column drops applied in the cleansing cells are not carried
# over — confirm this is intended.
data = data_processor.join_raw_data(industries, markets, balance_sheet, income, companies)
print(data.shape)

# Apply the consistency helper once per debt column, feeding each result into the next call.
for debt_column in ('Short Term Debt', 'Long Term Debt'):
    data = data_processor.get_consistent_data(
        data.copy(), initial_year=initial_year, values=debt_column
    )
data.shape

In [None]:
# Same per-year 'Short Term Debt' check as earlier, now on the consistent data set.
data_processor.check_availability_year(data,values='Short Term Debt',col='Company Name')

In [None]:
# Same per-year 'Long Term Debt' check as earlier, now on the consistent data set.
data_processor.check_availability_year(data,values='Long Term Debt',col='Company Name')

In [None]:
# Same per-company 'Short Term Debt' check as earlier, now on the consistent data set.
data_processor.check_availability_company(data,values='Short Term Debt',col='Company Name')

# 6 - Features Engineering - Financial Ratios

>  Financial ratio CFA : https://analystprep.com/blog/financial-ratio-sheet/

In [None]:
# Run the project feature-engineering helper on a copy of the data.
# Later cells read 'roe', 'roa' and 'Total_Debt', presumably created here — confirm in data_processor.
data = data_processor.features_engineering(data.copy())

# 8 - The graph below helps answer the following questions:
* How did the industry landscape of the USA change between two dates?
* How did business contract between two dates?
* What are the main sectors in the US economy?
* What is the concentration of an industry at a certain date?

In [None]:
def get_sunburst(view,path,values,branchvalues="total"):
    """Build a sunburst figure with ``px.sunburst`` and display it.

    Args:
        view: Data passed to ``px.sunburst`` as its first argument.
        path: Forwarded as the ``path`` keyword (hierarchy levels, outermost first).
        values: Forwarded as the ``values`` keyword (sizes the wedges).
        branchvalues: Forwarded as the ``branchvalues`` keyword; defaults to ``"total"``.

    Returns:
        None. The figure is shown as a side effect.
    """
    # NOTE(review): `px` is not imported in the visible part of this notebook
    # (conventionally `plotly.express`), and the cells below call
    # `plt.get_sunburst` from the project package instead — confirm whether
    # this local definition is still needed.
    px.sunburst(view, path=path, values=values, branchvalues=branchvalues).show()

In [None]:
# Aggregate per (year, sector, industry): number of non-null company names and total revenue.
view = (
    data[['Fiscal Year', 'Industry', 'Sector', 'Company Name', 'Revenue']]
    .groupby(['Fiscal Year', 'Sector', 'Industry'])
    .agg({'Company Name': ['count'], 'Revenue': ['sum']})
    .reset_index()
)

# reset_index() emits the group keys in groupby order (year, sector, industry),
# so the flat column names must follow that same order. Naming them
# ['Year', 'Industry', 'Sector', ...] would swap the Sector/Industry labels.
view.columns = ['Year', 'Sector', 'Industry', 'Company', 'Total_Revenue']

# Hierarchy from the centre outwards; 'Company' holds the company count per group.
path = ['Year', 'Sector', 'Industry', 'Company']
values = 'Total_Revenue'
plt.get_sunburst(view[view.Year.isin([2010, 2017, 2018, 2019])], path, values)

## Sector segmentation in 2019 

In [None]:
# Row-level view with shorter column names; the selection order already matches the new names.
view = data[['Fiscal Year', 'Industry', 'Sector', 'Company Name', 'Revenue']].rename(
    columns={'Fiscal Year': 'Year', 'Company Name': 'Company'}
)

# Revenue sunburst for 2019 only: year -> sector -> industry.
path = ['Year', 'Sector', 'Industry']
values = 'Revenue'
plt.get_sunburst(view.loc[view['Year'].isin([2019])], path, values)

In [None]:
# Project plotting helper (`plt` is mypackage.ploter here, not matplotlib),
# called on a copy of the data for the 'Revenue' column.
plt.show_macro_evolution(data = data.copy(),col = 'Revenue')

## The chart below displays the weight of each industry in its sector in 2019

In [None]:
year = 2019

# Rows of the chosen fiscal year, restricted to the columns needed and renamed.
view = data.loc[
    data['Fiscal Year'].isin([year]),
    ['Fiscal Year', 'Industry', 'Sector', 'Company Name', 'Revenue'],
].rename(columns={'Fiscal Year': 'Year', 'Company Name': 'Company'})

# Denominator: total revenue per sector.
view_sector = view.groupby('Sector').agg({'Revenue': 'sum'})
# Numerator: total revenue per (year, sector, industry).
view_sector_industry = (
    view.groupby(['Year', 'Sector', 'Industry']).agg({'Revenue': 'sum'}).dropna()
)

# Divide each industry total by its sector total (aligned on the 'Sector'
# index level) and express it as a percentage.
sector_secgmentation = view_sector_industry.div(view_sector, level='Sector') * 100
sector_secgmentation = sector_secgmentation.reset_index()

path = ['Year', 'Sector', 'Industry']
values = 'Revenue'
plt.get_sunburst(sector_secgmentation, path, values)

## The chart below displays the weight of each company inside its industry in 2019

In [None]:
year = 2019

# Rows of the chosen fiscal year, restricted to the columns needed.
view = data[data['Fiscal Year'].isin([year])][['Fiscal Year','Industry','Sector','Company Name','Revenue']]

view.columns = ['Year','Industry','Sector','Company','Revenue']

# This cell measures each company's weight inside its *industry*, so both the
# denominator and the alignment level must be 'Industry'. Grouping by 'Sector'
# (as in the sector-level cell) would yield company weights per sector instead.
view_industry = view.groupby('Industry').agg({'Revenue':'sum'})
view_industry_company = view.groupby(['Year','Industry','Company']).agg({'Revenue':'sum'}).dropna()

# Company revenue as a percentage of its industry's total revenue.
industry_secgmentation = view_industry_company.div(view_industry, level='Industry') * 100
industry_secgmentation = industry_secgmentation.reset_index()

path = ['Year','Industry','Company']
values = 'Revenue'
plt.get_sunburst(industry_secgmentation,path,values)

## I want to invest in one sector
- Which sub-sector and company should I investigate?
- Which company should I investigate?

In [None]:
year = 2019

# Rows of the chosen fiscal year with a strictly positive 'roe', columns renamed.
view = (
    data.loc[
        data['Fiscal Year'].isin([year]),
        ['Fiscal Year', 'Industry', 'Sector', 'Company Name', 'roe'],
    ]
    .rename(columns={'Fiscal Year': 'Year', 'Company Name': 'Company'})
    .loc[lambda df: df['roe'] > 0]
)

# Mean roe per (year, sector).
view_sector_industry = view.groupby(['Year', 'Sector']).agg({'roe': 'mean'}).dropna()

path = ['Year', 'Sector']
values = 'roe'
plt.get_sunburst(view_sector_industry.reset_index(), path, values)

In [None]:
# Same project plotting helper, now for 'roe', excluding rows whose Sector is the literal 'other'.
plt.show_macro_evolution(data = data[data.Sector!='other'],col = 'roe')

In [None]:
year = 2019

# Rows of the chosen fiscal year with a strictly positive 'roe', columns renamed.
view = (
    data.loc[
        data['Fiscal Year'].isin([year]),
        ['Fiscal Year', 'Industry', 'Sector', 'Company Name', 'roe'],
    ]
    .rename(columns={'Fiscal Year': 'Year', 'Company Name': 'Company'})
    .loc[lambda df: df['roe'] > 0]
)

# Mean roe per (year, sector, industry).
view_sector_industry = (
    view.groupby(['Year', 'Sector', 'Industry']).agg({'roe': 'mean'}).dropna()
)

path = ['Year', 'Sector', 'Industry']
values = 'roe'
plt.get_sunburst(view_sector_industry.reset_index(), path, values)

In [None]:
year = 2019

# Rows of the chosen fiscal year with a strictly positive 'roe', columns renamed.
view = (
    data.loc[
        data['Fiscal Year'].isin([year]),
        ['Fiscal Year', 'Industry', 'Sector', 'Company Name', 'roe'],
    ]
    .rename(columns={'Fiscal Year': 'Year', 'Company Name': 'Company'})
    .loc[lambda df: df['roe'] > 0]
)

# Mean roe per (year, sector, company); the variable name is kept from the earlier cells.
view_sector_industry = (
    view.groupby(['Year', 'Sector', 'Company']).agg({'roe': 'mean'}).dropna()
)

path = ['Year', 'Sector', 'Company']
values = 'roe'
plt.get_sunburst(view_sector_industry.reset_index(), path, values)

In [None]:
# Company inspected in this and the following cells; it is matched against
# the 'Company Name' column passed as index_value.
company = 'KIMBERLY CLARK CORP'  # other names tried: 'USA TECHNOLOGIES INC', 'PerkinElmer'
# Project plotting helper for the selected company's 'Net Income'.
plt.show_micro_evolution(data,company =company,value = 'Net Income',index_value = 'Company Name')


In [None]:
# Same helper and company, now for the 'roe' column.
plt.show_micro_evolution(data,company =company,value = 'roe',index_value = 'Company Name')

In [None]:
# Same helper and company, now for 'Total_Debt' (presumably added by features_engineering — confirm).
plt.show_micro_evolution(data,company =company,value = 'Total_Debt',index_value = 'Company Name')

In [None]:
# Same helper and company, now for the 'roa' column.
plt.show_micro_evolution(data,company =company,value = 'roa',index_value = 'Company Name')