In [2]:
import pandas as pd
import numpy as np


## F3METL

UK Financial Times Stock Exchange (FTSE) 350 Industrial Metals and Mining Index

Steps according to the CRISP-DM process:

### 1.	Business understanding

#### What does the business need?

Insight into the factors that drive the earnings of the companies in the FTSE 350 Industrial Metals and Mining Index.

### 2.  Data understanding 

- What data do we have? 

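    Daily index-level stock exchange data for the Industrial Metals and Mining sector of the FTSE 350 (the metals and mining constituents of the FTSE 350, not 350 separate mining companies).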
    
    Seven fields, indexed by the date:  
        - PX_OPEN: Opening price 
        - PX_HIGH: Daily high 
        - PX_LOW: Daily low 
        - PX_LAST: Last price of the day (in this data it equals the next day's PX_OPEN, i.e. the closing price) 
        - PX_VOLUME: Number of shares traded 
        - EBITDA: Earnings before interest, taxes, depreciation, and amortization
        - INDX_GENERAL_EARN: General earnings of the index (?)
    
- Privacy/ethics?  
    No issues as this data is meant to be public.  
       
       
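- Is it clean? No, the raw file has three problems:
    - Missing data: PX_VOLUME, EBITDA and INDX_GENERAL_EARN are empty for the earliest dates, and one row has an unreadable date (`#NAME?`)
    - Wrong headers: the column headers are `#NAME?` placeholders; the real field names sit in the first data row
    - Wrong types: because the field names share the columns with the values, the numbers are read as text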


### 3.  Data preparation 


In [3]:
# Load the raw export exactly as it is on disk; all clean-up happens in the next cell.
# index_col=False tells pandas not to use the first column as the index.
original_data = pd.read_csv(
    filepath_or_buffer='../Datasets/Financial_dataset/F3METL.csv',
    index_col=False,
)
original_data

Unnamed: 0,F3METL Index,#NAME?,#NAME?.1,#NAME?.2,#NAME?.3,#NAME?.4,#NAME?.5,#NAME?.6
0,Dates,PX_OPEN,PX_HIGH,PX_LOW,PX_LAST,PX_VOLUME,EBITDA,INDX_GENERAL_EARN
1,#NAME?,3875.6,4140.4,3755.3,4038.1,,,
2,05/01/2000,4038.1,4308.9,3947.8,4188.6,,,
3,06/01/2000,4188.6,4308.9,3971.9,4092.3,,,
4,07/01/2000,4092.3,4212.6,3953.9,4194.6,,,
...,...,...,...,...,...,...,...,...
5668,25/01/2022,6680.75,6802.96,6680.75,6733.66,55741384,1629.32,651.68
5669,26/01/2022,6733.66,6968.01,6733.66,6913.6,55269138,1629.32,651.68
5670,27/01/2022,6913.6,7080.91,6819.51,7025.63,58897876,1629.32,651.68
5671,28/01/2022,7025.63,7061.78,6780.64,6817.66,75509989,1629.32,651.68


In [4]:
# The raw export carries two non-data rows at the top:
#   label 0 - the real field names (Dates, PX_OPEN, ...)
#   label 1 - a price row whose date was exported as '#NAME?', so it cannot be parsed
# The clean frame is built from `original_data` in one pass, without in-place
# mutation, so re-running this cell always gives the same result.
field_names = original_data.iloc[0].tolist()

df = (
    original_data
    .drop(index=[0, 1])
    # A plain list is used so the header row's label (0) is not carried over
    # as the name of the column axis.
    .set_axis(field_names, axis=1)
    # The date becomes the index
    .set_index('Dates')
    # Values were read as text because the field names shared the columns with
    # them; columns that contain NaN come out as floats.
    .apply(pd.to_numeric)
)

# Dates are day-first strings, e.g. 25/01/2022
df.index = pd.to_datetime(df.index, format='%d/%m/%Y')

df

Unnamed: 0_level_0,PX_OPEN,PX_HIGH,PX_LOW,PX_LAST,PX_VOLUME,EBITDA,INDX_GENERAL_EARN
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-05,4038.10,4308.90,3947.80,4188.60,,,
2000-01-06,4188.60,4308.90,3971.90,4092.30,,,
2000-01-07,4092.30,4212.60,3953.90,4194.60,,,
2000-01-10,4194.60,4212.60,3899.70,3917.70,,,
2000-01-11,3917.70,3923.80,3550.60,3616.80,,,
...,...,...,...,...,...,...,...
2022-01-25,6680.75,6802.96,6680.75,6733.66,55741384.0,1629.32,651.68
2022-01-26,6733.66,6968.01,6733.66,6913.60,55269138.0,1629.32,651.68
2022-01-27,6913.60,7080.91,6819.51,7025.63,58897876.0,1629.32,651.68
2022-01-28,7025.63,7061.78,6780.64,6817.66,75509989.0,1629.32,651.68
