### Sales Data Analysis:

The following analysis was performed on sales_data.csv, following the steps from the Data Analysis with Python course on freeCodeCamp:

https://www.freecodecamp.org/learn/data-analysis-with-python/data-analysis-with-python-course/data-analysis-example-a

In [None]:
# import libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
"""Parse Dates: The parse_dates=['Date'] argument tells pandas to automatically convert the values in the Date column into datetime objects. 
This is useful for working with dates, as it allows you to perform operations like filtering by time ranges or extracting components such as year, month, or day."""

# change file path as required:
sales = pd.read_csv(r'Data_Analysis\Data\sales_data.csv', parse_dates=['Date'])

In [None]:
# Preview the first five records to get a feel for the columns and values:
sales.head(n=5)

In [None]:
# Dimensions as (rows, columns) - roughly 113K rows by 18 columns:
(len(sales.index), len(sales.columns))

In [None]:
# Sanity check: print the DataFrame summary and confirm that every column is
# present with the expected data type (Date was requested as a datetime via
# parse_dates when the file was loaded):
sales.info()

In [None]:
# Summary statistics for the frame, computed with describe()'s defaults:
overall_stats = sales.describe()
overall_stats

In [None]:
# Summary statistics restricted to the Unit_Cost column:
unit_cost_column = sales['Unit_Cost']
unit_cost_column.describe()

In [None]:
# Arithmetic mean of the Unit_Cost column:
unit_cost_column = sales['Unit_Cost']
unit_cost_column.mean()

In [None]:
# Median (middle value) of the Unit_Cost column:
unit_cost_column = sales['Unit_Cost']
unit_cost_column.median()

In [None]:
# Histogram showing how the Unit_Cost values are distributed:
hist_ax = sales['Unit_Cost'].plot.hist(figsize=(14, 6))

# Label both axes directly on the Axes object returned by pandas
hist_ax.set_xlabel('Unit Cost')
hist_ax.set_ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
# Horizontal box plot of Unit_Cost (vert=False lays the box on its side):
unit_cost_column = sales['Unit_Cost']
unit_cost_column.plot.box(vert=False, figsize=(14, 6))

# Render the figure
plt.show()

In [None]:
# Density plot (kind='density') of the Unit_Cost column:
ax = sales['Unit_Cost'].plot(kind='density', figsize=(14,6))

# Mark the mean and median with dashed vertical lines:
mean = sales['Unit_Cost'].mean()
median = sales['Unit_Cost'].median()
ax.axvline(mean, color='red', linestyle='--', label='Mean')
ax.axvline(median, color='green', linestyle='--', label='Median')

# Set axis labels. The y-axis of a density plot is the estimated probability
# density, not a count of sales, so it is labelled accordingly:
ax.set_ylabel('Density')
ax.set_xlabel('Unit Cost')

# Add a legend so the two vertical lines can be told apart:
ax.legend()

plt.show()

In [None]:
# Number of sales recorded for each age group:
age_group_counts = sales['Age_Group'].value_counts()
age_group_counts

In [None]:
# Pie chart of each age group's share of sales, labelled with percentages:
age_group_counts = sales['Age_Group'].value_counts()
age_group_counts.plot.pie(figsize=(6, 6), autopct='%1.1f%%')

In [None]:
# Number of sales recorded for each product category:
category_counts = sales['Product_Category'].value_counts()
category_counts

In [None]:
# Pie chart of each product category's share of sales, with percentages:
category_counts = sales['Product_Category'].value_counts()
category_counts.plot.pie(figsize=(6, 6), autopct='%1.1f%%')

In [None]:
# Number of sales recorded for each individual product:
product_counts = sales['Product'].value_counts()
product_counts

In [None]:
# Correlation matrix of the numeric columns only (numeric_only=True).
# Coefficients read as: -1 negative, 0 none, 1 positive.
corr = sales.corr(numeric_only=True)

# Print the full matrix as text:
print(corr)

In [None]:
# Plot the correlation matrix as a colour-coded grid:
fig = plt.figure(figsize=(8,8))

# Pin the colour scale to the full correlation range [-1, 1] so the neutral
# midpoint of the diverging 'RdBu' colormap sits exactly at zero correlation.
# Without vmin/vmax the scale is stretched to the data's own min and max,
# which shifts the midpoint away from zero.
plt.matshow(corr, cmap='RdBu', vmin=-1, vmax=1, fignum=fig.number)
plt.colorbar()  # Adding a color legend

# One tick per column, labelled with the column name on both axes:
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation='vertical')
plt.yticks(tick_positions, corr.columns)

# Show the plot explicitly, as the other plotting cells do, so the cell does
# not echo the tick objects returned by the last call:
plt.show()

In [None]:
# Deep dive: Revenue vs Profit as a scatterplot with a fitted trend line.

# Degree-1 fit of Profit against Revenue
x = sales['Revenue']
y = sales['Profit']
m, b = np.polyfit(x, y, 1)  # slope first, then intercept

# Scatter the raw points, then overlay the fitted line on the same Axes
ax = sales.plot.scatter(x='Revenue', y='Profit', figsize=(6, 6))
ax.plot(x, m * x + b, color='red', label='Line of Best Fit')

# Axis labels and legend
ax.set_xlabel('Revenue')
ax.set_ylabel('Profit')
ax.legend()

# Render the figure
plt.show()

In [None]:
# Deep dive: Cost vs Profit as a scatterplot with a fitted trend line.

# Degree-1 fit of Profit against Cost
x = sales['Cost']
y = sales['Profit']
m, b = np.polyfit(x, y, 1)  # slope first, then intercept

# Scatter the raw points, then overlay the fitted line on the same Axes
ax = sales.plot.scatter(x='Cost', y='Profit', figsize=(6, 6))
ax.plot(x, m * x + b, color='red', label='Line of Best Fit')

# Axis labels and legend
ax.set_xlabel('Cost')
ax.set_ylabel('Profit')
ax.legend()

# Render the figure
plt.show()

In [None]:
# FILTERING - keep only the rows whose State is Kentucky:
in_kentucky = sales['State'].eq('Kentucky')
sales.loc[in_kentucky]

In [None]:
# Mean revenue, rounded to 2 decimal places, for the 'Adults (35-64)' group:
is_adult_group = sales['Age_Group'] == 'Adults (35-64)'
adult_revenue = sales.loc[is_adult_group, 'Revenue']
adult_revenue.mean().round(2)

In [None]:
# Mean revenue, rounded to 2 decimal places, for 'Adults (35-64)' sales made
# in the United States. Both conditions must hold, hence the & operator:
is_adult_group = sales['Age_Group'] == 'Adults (35-64)'
is_united_states = sales['Country'] == 'United States'
adult_usa_revenue = sales.loc[is_adult_group & is_united_states, 'Revenue']
adult_usa_revenue.mean().round(2)

In [None]:
# Increase the Unit_Price of every sale to France or Germany by 10%.
# NOTE: this modifies `sales` in place, so re-running the cell applies another
# 10% on top of the previous increase. Reload the data before re-running.

# Build the row selection once and reuse it for both the update and the check:
france_or_germany = sales['Country'].isin(['France', 'Germany'])

# Multiplying by 1.1 produces fractional prices. Unit_Price is presumably
# loaded as an integer column (confirm with sales.info()); writing floats into
# an integer column relies on implicit upcasting, which recent pandas versions
# deprecate. Converting to float first makes the change explicit and is
# harmless if the column is already float64.
sales['Unit_Price'] = sales['Unit_Price'].astype(float)
sales.loc[france_or_germany, 'Unit_Price'] *= 1.1

# Show the updated prices for the affected rows:
sales.loc[france_or_germany, 'Unit_Price']