### Load data 

In [1]:
# Importing pandas library
import pandas as pd 

# Load all datasets into pandas dataframes.
# Forward slashes are used as the path separator because they resolve on
# Windows, macOS and Linux alike; a backslash is a separator on Windows only.
# Every file is decoded as ISO-8859-1.
customers = pd.read_csv('Input Datasets/Customers.csv', encoding='ISO-8859-1')
exchange_rates = pd.read_csv('Input Datasets/Exchange_Rates.csv', encoding='ISO-8859-1')
products = pd.read_csv('Input Datasets/Products.csv', encoding='ISO-8859-1')
sales = pd.read_csv('Input Datasets/Sales.csv', encoding='ISO-8859-1')
stores = pd.read_csv('Input Datasets/Stores.csv', encoding='ISO-8859-1')

### Convert data types where necessary

In [2]:
# Parse the date columns.
# These columns are reduced to plain calendar dates (Python date objects).
for frame, column in (
    (customers, 'Birthday'),
    (exchange_rates, 'Date'),
    (stores, 'Open Date'),
):
    frame[column] = pd.to_datetime(frame[column]).dt.date

# The sales dates stay as full pandas timestamps; they are subtracted from
# each other further down when the missing delivery dates are filled in.
for column in ('Order Date', 'Delivery Date'):
    sales[column] = pd.to_datetime(sales[column])

### Check for missing and duplicate values

In [None]:
# Print the number of missing values per column, one dataset after another
for dataset in (customers, exchange_rates, products, sales, stores):
    print(dataset.isna().sum())

In [None]:
# Print how many fully duplicated rows each dataset contains
for dataset in (customers, exchange_rates, products, sales, stores):
    print(dataset.duplicated().sum())

### Handle missing values

In [None]:
# Handling missing values in the customers dataset
# The filled column is assigned back instead of calling fillna(..., inplace=True)
# on customers['State Code']: an in-place call on a selected column is chained
# assignment, which pandas warns about and which no longer updates the
# DataFrame once Copy-on-Write is active.
customers['State Code'] = customers['State Code'].fillna('Unknown')  # Fill missing State Code with 'Unknown'
customers['Zip Code'] = customers['Zip Code'].fillna(0)  # Fill missing values in 'Zip Code' with 0


# Handling missing values in the sales dataset
# Work on timestamp values. to_datetime leaves an already-converted column as it
# is, and it lets this cell be re-run after 'Delivery Date' has been reduced to
# plain dates by the last line below.
delivery = pd.to_datetime(sales['Delivery Date'])
day = (delivery - sales['Order Date']).dt.days    # Whole days between 'Order Date' and 'Delivery Date' (NaN where the delivery date is missing)
mean_day = day.mean()   # Mean delivery lag in days; missing values are skipped
# Fill each missing delivery date with its order date plus the mean lag.
# The result is assigned back rather than using fillna(..., inplace=True) on the
# selected column, which is chained assignment: pandas warns about it and it no
# longer updates the DataFrame once Copy-on-Write is active.
sales['Delivery Date'] = delivery.fillna(sales['Order Date'] + pd.to_timedelta(mean_day, unit='D'))
sales['Delivery Date'] = sales['Delivery Date'].dt.date  # Convert 'Delivery Date' to date (drops the time-of-day part)


# Handling missing values in the stores dataset
# Fill missing 'Square Meters' with the column mean. The result is assigned back
# instead of using fillna(..., inplace=True) on the selected column, which is
# chained assignment: pandas warns about it and it no longer updates the
# DataFrame once Copy-on-Write is active.
stores['Square Meters'] = stores['Square Meters'].fillna(stores['Square Meters'].mean())

In [None]:
# Print the per-column missing-value counts again, after the fills above
for frame in (customers, exchange_rates, products, sales, stores):
    print(frame.isna().sum())

### Store data in a database

In [7]:
import sqlite3

# Create a database connection.
# A forward slash is used as the path separator so the path resolves on
# Windows, macOS and Linux alike.
conn = sqlite3.connect('Output Storage/data.db')

try:
    # Load all datasets into the database, one table per dataset.
    # if_exists='replace' drops and recreates a table that already exists;
    # index=False keeps the DataFrame index out of the table.
    customers.to_sql('customers', conn, if_exists='replace', index=False)
    exchange_rates.to_sql('exchange_rates', conn, if_exists='replace', index=False)
    products.to_sql('products', conn, if_exists='replace', index=False)
    sales.to_sql('sales', conn, if_exists='replace', index=False)
    stores.to_sql('stores', conn, if_exists='replace', index=False)
finally:
    # Close the database connection even when one of the writes fails
    conn.close()
