In [2]:
import pyodbc
import pandas as pd

import warnings
warnings.filterwarnings('ignore') 

# defining the server and the database
server = '(localdb)\mssqllocaldb' 
database = 'AdventureWorksDW2012'  

# Define the connection string
conn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server}; \
    SERVER='+ server +'; \
    DATABASE ='+ database +';\
    Trusted_Connection=yes;'
)


### Using a TOP 100 filter to limit the amount of data pulled from the exploratory query on 
### [AdventureWorksDW2012].[dbo].[FactInternetSales] for questions  1 - 5.

In [3]:
eda_query = pd.read_sql_query("""SELECT TOP 100 * FROM [AdventureWorksDW2012].[dbo].[FactInternetSales] AS Fis""",conn)
eda_query

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,...,ProductStandardCost,TotalProductCost,SalesAmount,TaxAmt,Freight,CarrierTrackingNumber,CustomerPONumber,OrderDate,DueDate,ShipDate
0,310,20050701,20050713,20050708,21768,1,19,6,SO43697,1,...,2171.2942,2171.2942,3578.2700,286.2616,89.4568,,,2005-07-01,2005-07-13,2005-07-08
1,346,20050701,20050713,20050708,28389,1,39,7,SO43698,1,...,1912.1544,1912.1544,3399.9900,271.9992,84.9998,,,2005-07-01,2005-07-13,2005-07-08
2,346,20050701,20050713,20050708,25863,1,100,1,SO43699,1,...,1912.1544,1912.1544,3399.9900,271.9992,84.9998,,,2005-07-01,2005-07-13,2005-07-08
3,336,20050701,20050713,20050708,14501,1,100,4,SO43700,1,...,413.1463,413.1463,699.0982,55.9279,17.4775,,,2005-07-01,2005-07-13,2005-07-08
4,346,20050701,20050713,20050708,11003,1,6,9,SO43701,1,...,1912.1544,1912.1544,3399.9900,271.9992,84.9998,,,2005-07-01,2005-07-13,2005-07-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,312,20050722,20050803,20050729,16623,1,6,9,SO43792,1,...,2171.2942,2171.2942,3578.2700,286.2616,89.4568,,,2005-07-22,2005-08-03,2005-07-29
96,344,20050722,20050803,20050729,11000,1,6,9,SO43793,1,...,1912.1544,1912.1544,3399.9900,271.9992,84.9998,,,2005-07-22,2005-08-03,2005-07-29
97,347,20050722,20050803,20050729,11029,1,6,9,SO43794,1,...,1912.1544,1912.1544,3399.9900,271.9992,84.9998,,,2005-07-22,2005-08-03,2005-07-29
98,312,20050723,20050804,20050730,27615,1,100,4,SO43795,1,...,2171.2942,2171.2942,3578.2700,286.2616,89.4568,,,2005-07-23,2005-08-04,2005-07-30


### 1. How much tax was collected on valentine’s day in 2008 from internet 
### sales? Lowest/highest tax sales for that day?

In [None]:
# Thought Process:
# Querying table [AdventureWorksDW2012].[dbo].[FactInternetSales]
# Sum() function to calculate total tax collected using OrderDate field
# Date '2008-02-14'
# Using ORDER BY statement to sort tax sales by lowest/hightest
sql_query = pd.read_sql_query("""SELECT FORMAT(SUM(Fis.TaxAmt),'C') AS [Total Tax Collected], 
                                 FORMAT(MIN(Fis.TaxAmt),'C') AS [Minimum Tax Collected],
                                 FORMAT(MAX(Fis.TaxAmt),'C') AS [Maximum Tax Collected], 
                                 COUNT(1) AS [Number of Transactions]
                                 FROM [AdventureWorksDW2012].[dbo].[FactInternetSales] AS Fis
                                 WHERE Fis.OrderDate = '2008-02-14'""", conn)
sql_query

### 2. How much tax is collected per territory from internet sales?

In [None]:
# Thought Process:
# Querying table [AdventureWorksDW2012].[dbo].[FactInternetSales]
# Using SELECT Statement included SalesTerritoryKey to use ORDER BY for grouping totals
# Sum() function to calculate total tax collected using OrderDate field
# Using GROUP BY statement to group totals by their SalesTerritoryKey
# Using ORDER BY statement to sort tax sales by lowest/hightest
sql_query = pd.read_sql_query("""SELECT SalesTerritoryKey AS [Territory ID],
                                 FORMAT(SUM(Fis.TaxAmt),'C') AS [Total Tax Collected],
                                 FORMAT(MIN(Fis.TaxAmt),'C') AS [Minimum Tax Collected],
                                 FORMAT(MAX(Fis.TaxAmt),'C') AS [Maximum Tax Collected], 
                                 COUNT(1) AS [Number of Transactions]
                                 FROM [AdventureWorksDW2012].[dbo].[FactInternetSales] AS Fis
                                 GROUP BY SalesTerritoryKey
                                 ORDER BY SalesTerritoryKey""", conn)
sql_query

### 3. How much tax is collected per territory from internet sales? (only 
### include those territories where the tax amount collect is greater than 10K dollars

In [None]:
# Thought Process:
# Querying table [AdventureWorksDW2012].[dbo].[FactInternetSales]
# Using SELECT Statement included SalesTerritoryKey to use ORDER BY for grouping totals
# Sum() function to calculate total tax collected using OrderDate field
# Using GROUP BY statement to group totals by their SalesTerritoryKey
# Using Having statement to filter results with tax collect greater than 10000 dollars
# Using ORDER BY statement to sort tax sales by lowest/hightest
sql_query = pd.read_sql_query("""SELECT SalesTerritoryKey AS [Territory ID],
                                 FORMAT(SUM(Fis.TaxAmt),'C') AS [Total Tax Collected],
                                 FORMAT(MIN(Fis.TaxAmt),'C') AS [Minimum Tax Collected],
                                 FORMAT(MAX(Fis.TaxAmt),'C') AS [Maximum Tax Collected], 
                                 COUNT(1) AS [Number of Transactions]
                                 FROM [AdventureWorksDW2012].[dbo].[FactInternetSales] AS Fis
                                 GROUP BY SalesTerritoryKey
                                 HAVING SUM(Fis.TaxAmt) > 10000
                                 ORDER BY SalesTerritoryKey""", conn)
sql_query

### 4. Compare the success (total sales Amount) of each promotion between 
### territories using the internet sales.

In [9]:
# Thought Process:
# Querying table [AdventureWorksDW2012].[dbo].[FactInternetSales]
# Using SELECT Statement included SalesTerritoryKey to use ORDER BY for grouping totals
# Sum() function to calculate total tax collected using OrderDate field
# Using GROUP BY statement to group totals by their Promotion Key then by SalesTerritoryKey
# Using ORDER BY statement to sort tax sales by lowest/hightest
sql_query = pd.read_sql_query("""SELECT PromotionKey AS [Promotion Key],
                                 SalesTerritoryKey AS [Territory ID],
                                 FORMAT(SUM(Fis.TaxAmt),'C') AS [Total Tax Collected],
                                 COUNT(1) AS [Number of Transactions]
                                 FROM [AdventureWorksDW2012].[dbo].[FactInternetSales] AS Fis
                                 GROUP BY SalesTerritoryKey, PromotionKey
                                 HAVING SUM(Fis.TaxAmt) > 10000
                                 ORDER BY PromotionKey, SUM(Fis.TaxAmt)""", conn)
sql_query

Unnamed: 0,Promotion Key,Territory ID,Total Tax Collected,Number of Transactions
0,1,6,"$148,268.01",7362
1,1,7,"$195,898.98",5363
2,1,8,"$212,838.83",5423
3,1,10,"$250,280.06",6637
4,1,1,"$270,955.68",8686
5,1,4,"$428,521.59",11855
6,1,9,"$676,147.45",12839
7,2,7,"$14,944.06",188
8,2,8,"$18,337.27",198
9,2,1,"$20,474.04",302


### 5. How many unique customers does adventure Works have in territory #3

In [19]:
# Thought Process:
# Querying table [AdventureWorksDW2012].[dbo].[FactInternetSales]
# Using SELECT Statement including DISTINCT to get all of the uniqueCustomerKey
# Using WHERE statement to filter results only for SalesTerritoryKey  = 3
# Using ORDER BY statement to sort CustomerKey by lowest/hightest
sql_query = pd.read_sql_query("""SELECT Fis.SalesTerritoryKey, COUNT(DISTINCT Fis.CustomerKey) AS [Total Number of Customers]
                                 FROM [AdventureWorksDW2012].[dbo].[FactInternetSales] AS Fis
                                 WHERE Fis.SalesTerritoryKey = 3
                                 GROUP BY Fis.SalesTerritoryKey""", conn)
sql_query

Unnamed: 0,SalesTerritoryKey,Total Number of Customers
0,3,8


### 6. Count the Products that have “Finished Goods” (use 
### [FinishedGoodsFlag] column)


In [7]:
# Using COUNT() Function to get the count for Finished Goods
# SELECT Statemet will get FinishedGoodsFlag column
# Using table [AdventureWorksDW2012].[dbo].[DimProduct]
# Using WHERE  statement FinishedGoodsFlag = 1 

# First Iteration
sql_query = pd.read_sql_query("""SELECT COUNT(1) AS [Count of Finished Goods]
                                 FROM [AdventureWorksDW2012].[dbo].[DimProduct]
                                 WHERE FinishedGoodsFlag = 1""", conn)
sql_query

Unnamed: 0,Count of Finished Goods
0,397


In [6]:
# Second Iteration
# Using CAST to get the 
sql_query = pd.read_sql_query("""SELECT SUM(CAST(FinishedGoodsFlag as int)) AS [Count of Finished Goods]
                                 FROM [AdventureWorksDW2012].[dbo].[DimProduct]""", conn)
sql_query

Unnamed: 0,Unnamed: 1
0,397


In [11]:
# Third Iteration
# Using Case to get the sum
sql_query = pd.read_sql_query("""SELECT SUM(case when [FinishedGoodsFlag] = 1 then 1 else 0 end) AS [Count of Finished Goods]
                                 FROM [AdventureWorksDW2012].[dbo].[DimProduct]""", conn)
sql_query

Unnamed: 0,Count of Finished Goods
0,397
