# Raw Materials

## Libraries

In [None]:
import pandas as pd
import numpy as np

## DataSets

In [None]:
# Columns kept from each futures CSV; the names must match the file headers.
cols = [
    'Fecha',
    'Último',
    'Apertura',
    'Máximo',
    'Mínimo',
    'Vol.',
]

In [None]:
# The four futures CSVs are read with the same column subset, in this order:
# natural gas, Brent oil, WTI crude oil, uranium.
csv_sources = (
    'Datos históricos Futuros gas natural.csv',
    'Datos históricos Futuros petróleo Brent.csv',
    'Datos históricos Futuros petróleo crudo WTI.csv',
    'Datos históricos Futuros uranio.csv',
)
df_gas, df_PetBrent, df_Petcrude, df_Ur = (
    pd.read_csv(csv_path, usecols=cols) for csv_path in csv_sources
)

# Coal comes from an Excel workbook and is read in full; its columns are
# selected in a later cell.
df_Coal = pd.read_excel('Coal_12_31_21-01_02_23 (CORREGIDO).xlsx')

In [None]:
# Keep the date and price columns of the coal sheet and rename 'Date' to
# 'Fecha' so it matches the date column of the other datasets.
df_Coal = (
    df_Coal
    .loc[:, ['Date', 'Close', 'Open', 'High', 'Low']]
    .rename(columns={'Date': 'Fecha'})
)

In [None]:
# Order matters: Column_names is later assigned to the concatenated frame
# positionally, so it must follow the order of the frames listed here.
DataSets = [df_gas, df_PetBrent, df_Petcrude, df_Ur, df_Coal]

# Final column labels, one group per dataset; the coal group has no volume.
Column_names = [
    'Gas_Price', 'Gas_Open', 'Gas_High', 'Gas_Low', 'Gas_Vol.',
    'PetBr_Price', 'PetBr_Open', 'PetBr_High', 'PetBr_Low', 'PetBr_Vol.',
    'PetCr_Price', 'PetCr_Open', 'PetCr_High', 'PetCr_Low', 'PetCr_Vol.',
    'Ur_Price', 'Ur_Open', 'Ur_High', 'Ur_Low', 'Ur_Vol.',
    'Coal_Price', 'Coal_Open', 'Coal_High', 'Coal_Low',
]

In [None]:
# Convert the 'Fecha' column of every dataset to datetime (day.month.year).
# The frames are modified in place, so df_gas, df_Ur, etc. change as well.
for dataset in DataSets:
    parsed_dates = pd.to_datetime(dataset['Fecha'], format='%d.%m.%Y')
    dataset['Fecha'] = parsed_dates

## Concat with DateTime index.

In [None]:
# Index every dataset by date, then align them side by side on that index.
dfs = []
for dataset in DataSets:
    dfs.append(dataset.set_index('Fecha'))

df = pd.concat(dfs, axis='columns')

In [None]:
# Rename positionally: Column_names must list the labels in the same order as
# the concatenated columns (a length mismatch raises ValueError).
df.columns = Column_names

In [None]:
# Show the uranium frame to inspect it after the date conversion.
df_Ur

Unnamed: 0,Fecha,Último,Apertura,Máximo,Mínimo,Vol.
0,2023-02-27,5105,5105,5105,5105,
1,2023-02-24,5170,5170,5170,5170,
2,2023-02-23,5180,5180,5180,5180,
3,2023-02-22,5180,5180,5180,5180,
4,2023-02-21,5180,5180,5180,5180,
...,...,...,...,...,...,...
455,2020-12-18,3010,3010,3010,3010,
456,2020-12-17,3010,3010,3010,3010,
457,2020-12-16,3010,3010,3010,3010,
458,2020-12-15,3010,3010,3010,3010,


## Date range

In [None]:
# Move the 'Fecha' index back into a regular column so it can be compared
# against a full calendar range.
df = df.reset_index()

In [None]:
# Make sure 'Fecha' is datetime before computing the covered range.
df['Fecha'] = pd.to_datetime(df['Fecha'], dayfirst=True)

# Every calendar day between the first and last observation.
min_date, max_date = df['Fecha'].min(), df['Fecha'].max()
full_date_range = pd.date_range(start=min_date, end=max_date, freq='D')

# Calendar days that have no row in df.
missing_dates = full_date_range.difference(df['Fecha'])
print("Missing dates:", missing_dates)

Missing dates: DatetimeIndex(['2020-12-19', '2020-12-20', '2020-12-25', '2020-12-26',
               '2020-12-27', '2021-01-01', '2021-01-02', '2021-01-03',
               '2021-01-09', '2021-01-10',
               ...
               '2023-05-20', '2023-05-21', '2023-05-27', '2023-06-03',
               '2023-06-04', '2023-06-10', '2023-06-11', '2023-06-17',
               '2023-06-24', '2023-06-25'],
              dtype='datetime64[ns]', length=252, freq=None)


In [None]:
# Build a frame with one row per calendar day and attach the observed data to
# it; days without observations end up as all-NaN rows.
date_df = pd.DataFrame({'Fecha': full_date_range})
merged_df = date_df.merge(df, how='outer', on='Fecha')

In [None]:
# Same check as before, now on merged_df: no calendar day should be missing.
merged_df['Fecha'] = pd.to_datetime(merged_df['Fecha'], dayfirst=True)

min_date, max_date = merged_df['Fecha'].min(), merged_df['Fecha'].max()
full_date_range = pd.date_range(start=min_date, end=max_date, freq='D')

missing_dates = full_date_range.difference(merged_df['Fecha'])
print("Missing dates:", missing_dates)

Missing dates: DatetimeIndex([], dtype='datetime64[ns]', freq='D')


In [None]:
# Order chronologically, then rebuild a clean 0..n-1 index.
merged_df = merged_df.sort_values(by='Fecha')
merged_df = merged_df.reset_index(drop=True)

In [None]:
# Summarize dtypes and non-null counts of the merged frame.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Fecha        929 non-null    datetime64[ns]
 1   Gas_Price    662 non-null    object        
 2   Gas_Open     662 non-null    object        
 3   Gas_High     662 non-null    object        
 4   Gas_Low      662 non-null    object        
 5   Gas_Vol.     624 non-null    object        
 6   PetBr_Price  646 non-null    object        
 7   PetBr_Open   646 non-null    object        
 8   PetBr_High   646 non-null    object        
 9   PetBr_Low    646 non-null    object        
 10  PetBr_Vol.   644 non-null    object        
 11  PetCr_Price  664 non-null    object        
 12  PetCr_Open   664 non-null    object        
 13  PetCr_High   664 non-null    object        
 14  PetCr_Low    664 non-null    object        
 15  PetCr_Vol.   604 non-null    object        
 16  Ur_Price

In [None]:
# List every row whose date appears more than once; keep=False flags all
# occurrences, not only the repeats.
duplicate_dates_mask = merged_df.duplicated(subset=['Fecha'], keep=False)
duplicate_dates = merged_df.loc[duplicate_dates_mask]
print(duplicate_dates)

Empty DataFrame
Columns: [Fecha, Gas_Price, Gas_Open, Gas_High, Gas_Low, Gas_Vol., PetBr_Price, PetBr_Open, PetBr_High, PetBr_Low, PetBr_Vol., PetCr_Price, PetCr_Open, PetCr_High, PetCr_Low, PetCr_Vol., Ur_Price, Ur_Open, Ur_High, Ur_Low, Ur_Vol., Coal_Price, Coal_Open, Coal_High, Coal_Low]
Index: []

[0 rows x 25 columns]


In [None]:
# First and last date covered by the merged frame.
for date_bound in (merged_df['Fecha'].min(), merged_df['Fecha'].max()):
    print(date_bound)

2020-12-14 00:00:00
2023-06-30 00:00:00


## Final DataFrame (cleaning)

In [None]:
# Analysis window, inclusive on both ends.
start_date = pd.Timestamp('2020-12-15 00:00:00')
end_date = pd.Timestamp('2023-05-01 00:00:00')

# .copy() makes df_final an independent frame. Without it, the column
# assignments done on df_final further down operate on a slice of merged_df
# and raise SettingWithCopyWarning.
in_window = (merged_df['Fecha'] >= start_date) & (merged_df['Fecha'] <= end_date)
df_final = merged_df[in_window].copy()

In [None]:
# Summarize dtypes and non-null counts of the windowed frame before cleaning.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 868 entries, 1 to 868
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Fecha        868 non-null    datetime64[ns]
 1   Gas_Price    616 non-null    object        
 2   Gas_Open     616 non-null    object        
 3   Gas_High     616 non-null    object        
 4   Gas_Low      616 non-null    object        
 5   Gas_Vol.     583 non-null    object        
 6   PetBr_Price  602 non-null    object        
 7   PetBr_Open   602 non-null    object        
 8   PetBr_High   602 non-null    object        
 9   PetBr_Low    602 non-null    object        
 10  PetBr_Vol.   601 non-null    object        
 11  PetCr_Price  618 non-null    object        
 12  PetCr_Open   618 non-null    object        
 13  PetCr_High   618 non-null    object        
 14  PetCr_Low    618 non-null    object        
 15  PetCr_Vol.   564 non-null    object        
 16  Ur_Price

In [None]:
# Look at the raw gas volume values (text such as '105,22K') before cleaning.
df_final['Gas_Vol.']

1          NaN
2          NaN
3          NaN
4          NaN
5          NaN
        ...   
864    105,22K
865    176,42K
866        NaN
867        NaN
868    108,99K
Name: Gas_Vol., Length: 868, dtype: object

In [None]:
# Cleaning numbers.
def convert_to_float(x):
    """Parse a number written with a decimal comma into a numeric value.

    Strings containing a comma are treated as European-formatted: the comma
    is the decimal mark and any dots are thousands separators, so
    ``'1.234,56'`` becomes ``1234.56``. Strings without a comma are parsed
    unchanged, so ``'30.1'`` stays ``30.1``. Non-string values are passed
    straight to ``pd.to_numeric``.

    Parameters
    ----------
    x : Any
        Raw cell value (text, number or NaN).

    Returns
    -------
    numeric scalar
        The parsed value, or NaN when it cannot be parsed.
    """
    if isinstance(x, str):
        text = x
        if ',' in text:
            # Drop thousands separators first; otherwise '1.234,56' would turn
            # into '1.234.56' and be silently coerced to NaN.
            text = text.replace('.', '').replace(',', '.')
        return pd.to_numeric(text, errors='coerce')
    return pd.to_numeric(x, errors='coerce')

# Cleaning volumes columns.
def clean_and_convert_volume(x):
    """Convert a volume string such as ``'105,22K'`` into an absolute number.

    The text uses a decimal comma and an optional magnitude suffix:
    ``K`` (thousands), ``M`` (millions) or ``B`` (billions). The suffix sets
    the multiplier; a string without a suffix is returned at face value
    instead of being scaled by 1000. Unparseable text yields NaN.

    Parameters
    ----------
    x : Any
        Raw cell value. Non-string values (numbers, NaN) are returned as-is.

    Returns
    -------
    numeric scalar
        The volume in units, NaN when the text cannot be parsed, or ``x``
        itself when it is not a string.
    """
    if isinstance(x, str):
        multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
        text = x.strip().replace(',', '.')
        multiplier = 1
        # Peel off a trailing magnitude suffix, if present.
        if text and text[-1].upper() in multipliers:
            multiplier = multipliers[text[-1].upper()]
            text = text[:-1]
        return pd.to_numeric(text, errors='coerce') * multiplier
    return x

In [None]:
# Price columns stored as decimal-comma text, grouped by commodity.
# The coal columns are not listed here.
columns_to_convert = (
    ['Gas_Price', 'Gas_Open', 'Gas_High', 'Gas_Low']
    + ['PetBr_Price', 'PetBr_Open', 'PetBr_High', 'PetBr_Low']
    + ['PetCr_Price', 'PetCr_Open', 'PetCr_High', 'PetCr_Low']
    + ['Ur_Price', 'Ur_Open', 'Ur_High', 'Ur_Low']
)

# Volume columns, which carry a magnitude suffix and need their own parser.
columns_to_convert_vol = [
    'PetCr_Vol.',
    'Ur_Vol.',
    'PetBr_Vol.',
    'Gas_Vol.',
]

In [None]:
# Parse every price column from decimal-comma text to numbers.
# assign() returns a new frame instead of writing into df_final column by
# column, which avoids SettingWithCopyWarning when df_final is a filtered
# slice of another frame.
df_final = df_final.assign(
    **{column: df_final[column].apply(convert_to_float) for column in columns_to_convert}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final[column] = df_final[column].apply(convert_to_float)


In [None]:
# Parse every volume column from suffixed text (e.g. '105,22K') to numbers.
# assign() returns a new frame instead of writing into df_final column by
# column, which avoids SettingWithCopyWarning when df_final is a filtered
# slice of another frame.
df_final = df_final.assign(
    **{column: df_final[column].apply(clean_and_convert_volume) for column in columns_to_convert_vol}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final[column] = df_final[column].apply(clean_and_convert_volume)


In [None]:
# Check the dtypes and non-null counts again after the numeric conversion.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 868 entries, 1 to 868
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Fecha        868 non-null    datetime64[ns]
 1   Gas_Price    616 non-null    float64       
 2   Gas_Open     616 non-null    float64       
 3   Gas_High     616 non-null    float64       
 4   Gas_Low      616 non-null    float64       
 5   Gas_Vol.     583 non-null    float64       
 6   PetBr_Price  602 non-null    float64       
 7   PetBr_Open   602 non-null    float64       
 8   PetBr_High   602 non-null    float64       
 9   PetBr_Low    602 non-null    float64       
 10  PetBr_Vol.   601 non-null    float64       
 11  PetCr_Price  618 non-null    float64       
 12  PetCr_Open   618 non-null    float64       
 13  PetCr_High   618 non-null    float64       
 14  PetCr_Low    618 non-null    float64       
 15  PetCr_Vol.   564 non-null    float64       
 16  Ur_Price

In [None]:
# Show the natural gas source frame for comparison with the cleaned data.
df_gas

Unnamed: 0,Fecha,Último,Apertura,Máximo,Mínimo,Vol.
0,2023-06-30,2798,2675,2825,2640,
1,2023-06-29,2701,2662,2746,2616,"118,61K"
2,2023-06-28,2603,2788,2839,2593,"1,32K"
3,2023-06-27,2763,2777,2809,2724,"71,15K"
4,2023-06-26,2791,2758,2816,2720,"31,60K"
...,...,...,...,...,...,...
657,2021-01-07,2729,2722,2755,2663,"135,48K"
658,2021-01-06,2716,2688,2770,2606,"144,69K"
659,2021-01-05,2702,2599,2732,2591,"149,83K"
660,2021-01-04,2581,2626,2670,2566,"128,38K"


## Missing Values

In [None]:
# Show the cleaned frame before handling missing values.
df_final

Unnamed: 0,Fecha,Gas_Price,Gas_Open,Gas_High,Gas_Low,Gas_Vol.,PetBr_Price,PetBr_Open,PetBr_High,PetBr_Low,...,PetCr_Vol.,Ur_Price,Ur_Open,Ur_High,Ur_Low,Ur_Vol.,Coal_Price,Coal_Open,Coal_High,Coal_Low
1,2020-12-15,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
2,2020-12-16,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
3,2020-12-17,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
4,2020-12-18,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
5,2020-12-19,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
864,2023-04-27,2.355,2.314,2.376,2.268,105220.0,78.37,77.77,78.61,77.39,...,345880.0,,,,,,140.0,0.0,0.0,0.0
865,2023-04-28,2.410,2.358,2.529,2.285,176420.0,79.54,78.25,79.61,77.69,...,328400.0,,,,,,136.4,0.0,0.0,0.0
866,2023-04-29,,,,,,,,,,...,,,,,,,,,,
867,2023-04-30,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Index the frame by date for the time-based grouping that follows.
# Rebinding (instead of inplace=True) avoids mutating a frame that may be a
# slice of merged_df, and the guard lets the cell be re-run without a KeyError
# once 'Fecha' is already the index.
if 'Fecha' in df_final.columns:
    df_final = df_final.set_index('Fecha')

In [None]:
# Count, per calendar month, the days on which every column is missing.
all_columns_missing = df_final.isnull().all(axis=1)
nan_rows = df_final[all_columns_missing]
nan_groups = nan_rows.groupby(pd.Grouper(freq='M')).size()
print(nan_groups)

Fecha
2020-12-31     5
2021-01-31    10
2021-02-28     7
2021-03-31     8
2021-04-30     9
2021-05-31     9
2021-06-30     8
2021-07-31     8
2021-08-31     9
2021-09-30     7
2021-10-31    10
2021-11-30     8
2021-12-31     8
2022-01-31     9
2022-02-28     7
2022-03-31     8
2022-04-30    10
2022-05-31     8
2022-06-30     7
2022-07-31     9
2022-08-31     8
2022-09-30     7
2022-10-31    10
2022-11-30     8
2022-12-31     7
2023-01-31     8
2023-02-28     7
2023-03-31     8
2023-04-30    11
Freq: M, dtype: int64


In [None]:
# Forward-fill gaps with the last known value, but for at most 3 consecutive
# rows; longer gaps stay NaN.
# DataFrame.ffill replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
df_final_fill = df_final.ffill(limit=3)
df_final_fill.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 868 entries, 2020-12-15 to 2023-05-01
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Gas_Price    852 non-null    float64
 1   Gas_Open     852 non-null    float64
 2   Gas_High     852 non-null    float64
 3   Gas_Low      852 non-null    float64
 4   Gas_Vol.     852 non-null    float64
 5   PetBr_Price  852 non-null    float64
 6   PetBr_Open   852 non-null    float64
 7   PetBr_High   852 non-null    float64
 8   PetBr_Low    852 non-null    float64
 9   PetBr_Vol.   852 non-null    float64
 10  PetCr_Price  852 non-null    float64
 11  PetCr_Open   852 non-null    float64
 12  PetCr_High   852 non-null    float64
 13  PetCr_Low    852 non-null    float64
 14  PetCr_Vol.   821 non-null    float64
 15  Ur_Price     676 non-null    float64
 16  Ur_Open      676 non-null    float64
 17  Ur_High      676 non-null    float64
 18  Ur_Low       676 non-null    fl

In [None]:
# Number of rows and columns in the uranium source frame.
df_Ur.shape

(460, 6)

In [None]:
# Count, per calendar month, the days with no uranium price (before filling).
ur_price_missing = df_final['Ur_Price'].isnull()
nan_rows = df_final.loc[ur_price_missing, 'Ur_Price']
nan_groups = nan_rows.groupby(pd.Grouper(freq='M')).size()
print(nan_groups)

Fecha
2020-12-31     5
2021-01-31    12
2021-02-28     9
2021-03-31     8
2021-04-30     9
2021-05-31    11
2021-06-30     8
2021-07-31    10
2021-08-31     9
2021-09-30    12
2021-10-31    31
2021-11-30    30
2021-12-31    31
2022-01-31    25
2022-02-28     9
2022-03-31     8
2022-04-30    10
2022-05-31    23
2022-06-30     9
2022-07-31    11
2022-08-31     8
2022-09-30     9
2022-10-31    10
2022-11-30     9
2022-12-31    10
2023-01-31    11
2023-02-28    10
2023-03-31    31
2023-04-30    30
2023-05-31     1
Freq: M, Name: Ur_Price, dtype: int64


## Export to Excel

In [None]:
# Preview the first 30 rows of the forward-filled frame.
df_final_fill.head(30)

Unnamed: 0,Fecha,Gas_Price,Gas_Open,Gas_High,Gas_Low,Gas_Vol.,PetBr_Price,PetBr_Open,PetBr_High,PetBr_Low,...,PetCr_Vol.,Ur_Price,Ur_Open,Ur_High,Ur_Low,Ur_Vol.,Coal_Price,Coal_Open,Coal_High,Coal_Low
0,15-12-2020,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
1,16-12-2020,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
2,17-12-2020,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
3,18-12-2020,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
4,19-12-2020,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
5,20-12-2020,,,,,,,,,,...,,30.1,30.1,30.1,30.1,,,,,
6,21-12-2020,,,,,,,,,,...,,30.25,30.25,30.25,30.25,,,,,
7,22-12-2020,,,,,,,,,,...,,30.3,30.3,30.3,30.3,,,,,
8,23-12-2020,,,,,,,,,,...,,30.35,30.35,30.35,30.35,,,,,
9,24-12-2020,,,,,,,,,,...,,30.35,30.35,30.35,30.35,,,,,


In [None]:
# Turn the date index back into a column and render it as 'dd-mm-YYYY' text.
df_final_fill = (
    df_final_fill
    .reset_index()
    .assign(Fecha=lambda frame: frame['Fecha'].dt.strftime('%d-%m-%Y'))
)

In [None]:
# Write the forward-filled frame to an Excel workbook; the row index is left
# out because the date is already a regular column.
x = df_final_fill
name = 'RawMaterials.xlsx'
x.to_excel(name, index=False)