In [11]:
import pandas as pd
import os

def format_with_periods(x):
    """Render a number as fixed-point text with 8 decimals, using '.' for every separator.

    The value is first formatted with ',' as the thousands separator and '.'
    as the decimal point; each ',' is then swapped for '.'.  As a result the
    thousands groups and the decimal part are all delimited by periods,
    e.g. 1234.5 -> '1.234.50000000'.
    """
    grouped = format(x, ',.8f')
    return grouped.replace(',', '.')

# Read the semicolon-delimited source file
df = pd.read_csv("data/Company_Share_GBO_unit.csv", delimiter=';')

# Drop rows with missing values
df.dropna(inplace=True)

# Parse 'Year_date' and store it back as 'DD/MM/YYYY' text.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# input layout - confirm it against the raw file.
df['Year_date'] = pd.to_datetime(df['Year_date']).dt.strftime('%d/%m/%Y')

# 'Volume' arrives as text; swap the comma for a period and convert to float
df['Volume'] = df['Volume'].str.replace(',', '.', regex=False).astype(float)

# Columns to round to the nearest whole number and store as integers
# (DataFrame.round rounds to nearest; it is not a ceiling)
columns_to_round_up = ["Location", "Subcategory_ID", "Hierarchy_Level", "Year_text", "Year_minus_2016"]
df[columns_to_round_up] = df[columns_to_round_up].round().astype(int)

# Make units and volume the same: every row whose unit is not already
# 'million litres' is divided by 1,000,000 and relabelled.
# NOTE(review): this treats every other unit as plain litres - verify.
needs_scaling = df['Unit'] != 'million litres'
df.loc[needs_scaling, 'Volume'] /= 1000000
df.loc[needs_scaling, 'Unit'] = 'million litres'

# Round the 'Volume' column to 8 decimal places
df['Volume'] = df['Volume'].round(8)

# Convert each 'Volume' value to fixed 8-decimal text with the formatter
# defined in this cell. `format_with_commas` is not defined anywhere in this
# notebook, so calling it raises NameError on a fresh kernel.
# NOTE(review): confirm that period-separated output is the intended format.
df['Volume'] = df['Volume'].apply(format_with_periods)

# Create the output directory (no error if it already exists)
directory = "clean_data"
os.makedirs(directory, exist_ok=True)

# Save the DataFrame to CSV. decimal=',' only applies to float columns;
# 'Volume' is already text at this point and is written as-is.
df.to_csv(os.path.join(directory, "Company_Share_GBO_unit.csv"), index=False, decimal=',')

  df['Year_date'] = pd.to_datetime(df['Year_date'])


In [None]:
# Load the market-size table and discard rows with any missing value
df = pd.read_csv("data/Market_Sizes.csv", sep=',')
df.dropna(inplace=True)

# Parse 'Year_date' as datetimes, then store it back as 'DD/MM/YYYY' text
df['Year_date'] = pd.to_datetime(df['Year_date']).dt.strftime('%d/%m/%Y')

# Columns to round to the nearest whole number and hold as integers
columns_to_round_up = ["RSP", "Volume"]
df[columns_to_round_up] = df[columns_to_round_up].round().astype(int)

# Show the dtypes of the rounded columns
print(df[columns_to_round_up].dtypes)

# Preview the first rows of the cleaned frame
df.head()



RSP       int32
Volume    int32
dtype: object


  df['Year_date'] = pd.to_datetime(df['Year_date'])


Unnamed: 0,Location,Industry,Subcategory,Hierarchy_Level,Data_Type,Unit,Current_Constant,Currency_Conversion,Year,Year_date,RSP,Volume,Year_minus_2016,Year_minus_2022,Edition
0,1,Alcoholic Drinks,1,3,Total Value RSP,USD million,"Historic Constant 2021 Prices, Forecast Consta...","Historic Fixed 2021 Exchange Rates, Forecast F...",2016,31/12/2016,301,27,4,-2,2022
1,1,Alcoholic Drinks,1,3,Total Value RSP,USD million,"Historic Constant 2021 Prices, Forecast Consta...","Historic Fixed 2021 Exchange Rates, Forecast F...",2017,31/12/2017,318,28,5,-1,2022
2,1,Alcoholic Drinks,1,3,Total Value RSP,USD million,"Historic Constant 2021 Prices, Forecast Consta...","Historic Fixed 2021 Exchange Rates, Forecast F...",2018,31/12/2018,458,40,6,0,2022
3,1,Alcoholic Drinks,1,3,Total Value RSP,USD million,"Historic Constant 2021 Prices, Forecast Consta...","Historic Fixed 2021 Exchange Rates, Forecast F...",2019,31/12/2019,612,51,7,1,2022
4,1,Alcoholic Drinks,1,3,Total Value RSP,USD million,"Historic Constant 2021 Prices, Forecast Consta...","Historic Fixed 2021 Exchange Rates, Forecast F...",2020,31/12/2020,774,62,8,2,2022


In [10]:
import pandas as pd
import os

def format_with_periods(x):
    """Render a number as fixed-point text with 8 decimals, using '.' for every separator.

    The value is first formatted with ',' as the thousands separator and '.'
    as the decimal point; each ',' is then swapped for '.'.  As a result the
    thousands groups and the decimal part are all delimited by periods,
    e.g. 1234.5 -> '1.234.50000000'.
    """
    grouped = format(x, ',.8f')
    return grouped.replace(',', '.')

def capitalize_words(text):
    """Return `text` with each whitespace-separated word capitalised.

    Words are obtained with `str.split()`, so leading/trailing whitespace is
    dropped and runs of whitespace collapse to a single space.  Each word gets
    an upper-case first character and lower-case remainder.
    """
    words = text.split()
    return ' '.join(map(str.capitalize, words))

# Read the CSV file
df = pd.read_csv("data/Channel_Volume.csv", delimiter=',')

# Drop rows with missing values
df.dropna(inplace=True)

# Parse 'Year_date' and store it back as 'DD/MM/YYYY' text.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# input layout - confirm it against the raw file.
df['Year_date'] = pd.to_datetime(df['Year_date']).dt.strftime('%d/%m/%Y')

# 'Volume' arrives as text; swap the comma for a period and convert to float
df['Volume'] = df['Volume'].str.replace(',', '.', regex=False).astype(float)

# Make units and volume the same: every row whose unit is not already
# 'million litres' is divided by 1,000,000 and relabelled.
# NOTE(review): this treats every other unit as plain litres - verify.
needs_scaling = df['Unit'] != 'million litres'
df.loc[needs_scaling, 'Volume'] /= 1000000
df.loc[needs_scaling, 'Unit'] = 'million litres'

# Round the 'Volume' column to 8 decimal places
df['Volume'] = df['Volume'].round(8)

# Convert each 'Volume' value to fixed 8-decimal text with the formatter
# defined in this cell. `format_with_commas` is not defined anywhere in this
# notebook, so calling it raises NameError on a fresh kernel.
# NOTE(review): confirm that period-separated output is the intended format.
df['Volume'] = df['Volume'].apply(format_with_periods)

# Capitalise each word in every object-dtype column
# ('Volume' is text by this point, so it is included as well)
for column in df.select_dtypes(include='object'):
    df[column] = df[column].str.lower().apply(capitalize_words)

# Create the output directory (no error if it already exists)
directory = "clean_data"
os.makedirs(directory, exist_ok=True)

# Save the DataFrame to CSV
df.to_csv(os.path.join(directory, "Channel_Volume.csv"), index=False)

In [None]:
# Load the category and subcategory lookup tables
df_cat = pd.read_csv("data/Categories.csv", delimiter=',')
df_subcat = pd.read_csv("data/Subcategories.csv", delimiter=',')

# Drop rows with missing values from both tables
df_cat.dropna(inplace=True)
df_subcat.dropna(inplace=True)

# Create the output directory (no error if it already exists)
directory = "clean_data"
os.makedirs(directory, exist_ok=True)

# Save both DataFrames to CSV in the output directory
df_cat.to_csv(os.path.join(directory, "Categories.csv"), index=False)
df_subcat.to_csv(os.path.join(directory, "Subcategories.csv"), index=False)

# Preview the cleaned subcategories. Only a cell's last expression is
# displayed, so the preview sits at the end rather than mid-cell.
df_subcat.head()