In [9]:
import sklearn
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import f_oneway as f1, chi2_contingency as chi
from numpy import corrcoef as cce

> #### Reading data from the CSV file

In [7]:
# Load the order records from the CSV file and preview the first five rows
data = pd.read_csv(filepath_or_buffer="Data.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Customer ID,Sales,Datetime,Period,Day,Month,City State
0,0,176558,USB-C Charging Cable,2,11.95,75315,23.9,2019-04-19 08:46:00,8,Friday,Apr,Dallas TX
1,1,176559,Bose SoundSport Headphones,1,99.99,59963,99.99,2019-04-07 22:30:00,22,Sunday,Apr,Boston MA
2,2,176560,Google Phone,1,600.0,25532,600.0,2019-04-12 14:38:00,14,Friday,Apr,Los Angeles CA
3,3,176560,Wired Headphones,1,11.99,25532,11.99,2019-04-12 14:38:00,14,Friday,Apr,Los Angeles CA
4,4,176561,Wired Headphones,1,11.99,89128,11.99,2019-04-30 09:27:00,9,Tuesday,Apr,Los Angeles CA


In [6]:
# Dropping unwanted columns.
# Reassign instead of mutating in place, and ignore an already-missing label,
# so this cell can be re-run without raising KeyError once the column is gone.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.tail()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Customer ID,Sales,Datetime,Period,Day,Month,City State
189080,259353,AAA Batteries (4-pack),3,2.99,254883,8.97,2019-09-17 20:56:00,20,Tuesday,Sep,Los Angeles CA
189081,259354,iPhone,1,700.0,175553,700.0,2019-09-01 16:00:00,16,Sunday,Sep,San Francisco CA
189082,259355,iPhone,1,700.0,133937,700.0,2019-09-23 07:39:00,7,Monday,Sep,San Francisco CA
189083,259356,34in Ultrawide Monitor,1,379.99,189758,379.99,2019-09-19 17:30:00,17,Thursday,Sep,San Francisco CA
189084,259357,USB-C Charging Cable,1,11.95,276924,11.95,2019-09-30 00:18:00,0,Monday,Sep,San Francisco CA


> ### Hypothesis Testing

**Chi Square Test**
* Testing for association between `Days of the week` and `Cities`
* Testing for association between `Months` and `Cities`

  The aim is to find out how much ***what day it is*** or ***what month it is*** influences how customers in each city place orders and make purchases.

##### *What Day it is...*

In [11]:
# Contingency table of order counts: weekdays ("Day") as rows,
# cities ("City State") as columns
dc_cross = pd.crosstab(index=data["Day"], columns=data["City State"])
dc_cross

City State,Atlanta GA,Austin TX,Boston MA,Dallas TX,Los Angeles CA,New York City NY,Portland ME,Portland OR,San Francisco CA,Seattle WA
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Friday,2054,1391,2742,2086,4142,3477,352,1459,6469,2030
Monday,2276,1445,2974,2180,4490,3660,368,1509,6522,2079
Saturday,2090,1427,2812,2101,4160,3561,377,1431,6390,2070
Sunday,2207,1400,2821,2094,4295,3535,321,1399,6363,2181
Thursday,2164,1467,2928,2120,4293,3711,344,1391,6431,2170
Tuesday,2210,1471,3026,2267,4548,3800,386,1622,6715,2231
Wednesday,2107,1437,2901,2120,4247,3585,362,1398,6682,2211


In [12]:
# Chi-square test of independence on the Day x City contingency table
chi(observed=dc_cross)

(67.72264386298475,
 0.09927746987567522,
 54,
 array([[2093.55483513, 1390.99175503, 2799.72080281, 2074.15467118,
         4181.42819367, 3509.90537589,  347.81722506, 1414.68766957,
         6315.03051009, 2074.70896158],
        [2197.50548166, 1460.05824894, 2938.73449507, 2177.14204723,
         4389.0473861 , 3684.18164846,  365.08728879, 1484.93072957,
         6628.58881455, 2177.72385964],
        [2110.89325965, 1402.51168522, 2822.90756009, 2091.33242722,
         4216.05798979, 3538.97374726,  350.69778142, 1426.40384483,
         6367.33039638, 2091.89130814],
        [2126.63367269, 1412.96987069, 2843.95728905, 2106.92697993,
         4247.49609964, 3565.36300606,  353.31284872, 1437.04018828,
         6414.81001666, 2107.49002829],
        [2158.83360393, 1434.36402676, 2887.01840971, 2138.82852685,
         4311.80857815, 3619.34712431,  358.66245339, 1458.79879948,
         6511.93837692, 2139.40010048],
        [2259.26862522, 1501.09468229, 3021.33063966, 2238.3328

***
The first value (67.723) is the `Chi-square value`, followed by the `p-value` (0.099), then the `degrees of freedom` (54), and lastly the expected frequencies as an array. Since all of the expected frequencies are *greater than 5*, the chi2 test results can be trusted. Because the `p-value` is `greater than 0.05`, we fail to reject the null hypothesis of independence. Thus, the results provide **NO statistically significant evidence of a relationship** between the `Day` of the week and how customers in each `City` spend or place orders.
***

#### *What Month it is...*

In [13]:
# Contingency table of order counts: months ("Month") as rows,
# cities ("City State") as columns
mc_cross = pd.crosstab(index=data["Month"], columns=data["City State"])
mc_cross

City State,Atlanta GA,Austin TX,Boston MA,Dallas TX,Los Angeles CA,New York City NY,Portland ME,Portland OR,San Francisco CA,Seattle WA
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Apr,2942,1946,3824,2696,6046,4866,486,1984,8850,2882
Aug,941,649,1244,968,1871,1629,166,656,2832,1005
Dec,1989,1348,2695,2007,3899,3402,315,1332,6019,1978
Feb,936,625,1255,984,1900,1609,177,657,2905,927
Jan,785,529,1051,784,1529,1313,123,502,2366,727
Jul,1146,763,1553,1102,2230,1930,198,721,3531,1119
Jun,1107,715,1390,1063,2203,1775,197,745,3303,1056
Mar,1,1,2,0,2,2,0,2,7,0
May,1330,838,1812,1423,2571,2268,222,947,3940,1243
Nov,1414,869,1919,1368,2746,2355,230,936,4315,1421


In [14]:
# Chi-square test of independence on the Month x City contingency table
chi(observed=mc_cross)

(124.35997227970886,
 0.04322847287220186,
 99,
 array([[2.91812876e+03, 1.93885203e+03, 3.90242742e+03, 2.89108758e+03,
         5.82833831e+03, 4.89232746e+03, 4.84809583e+02, 1.97188089e+03,
         8.80228778e+03, 2.89186019e+03],
        [9.55690763e+02, 6.34976429e+02, 1.27804979e+03, 9.46834746e+02,
         1.90878798e+03, 1.60224327e+03, 1.58775736e+02, 6.45793421e+02,
         2.88276009e+03, 9.47087775e+02],
        [1.99623594e+03, 1.32633150e+03, 2.66957578e+03, 1.97773759e+03,
         3.98705450e+03, 3.34674742e+03, 3.31648941e+02, 1.34892591e+03,
         6.02147631e+03, 1.97826611e+03],
        [9.56809371e+02, 6.35719650e+02, 1.27954571e+03, 9.47942989e+02,
         1.91102216e+03, 1.60411865e+03, 1.58961578e+02, 6.46549303e+02,
         2.88613428e+03, 9.48196314e+02],
        [7.75754671e+02, 5.15423973e+02, 1.03742040e+03, 7.68566052e+02,
         1.54940410e+03, 1.30057520e+03, 1.28881667e+02, 5.24204358e+02,
         2.33999814e+03, 7.68771441e+02],
        [1.1

***
The `Chi-square value` this time is (124.360), followed by the `p-value` (0.043), then the `degrees of freedom` (99), and lastly the expected frequencies as an array. Unlike the `Day` test, not all of the expected frequencies are *greater than 5*: the `Mar` row of the cross count holds only 17 orders in total, so its expected frequencies are all below 5, and this chi2 result should be treated with caution. Taken at face value, we can reject the null hypothesis as the `p-value` is `less than 0.05` (though only marginally). Thus, the results suggest that there is **A relationship** between the `Month` of the year and how customers in each `City` spend or place orders.
***

**Oneway - ANOVA Test**

Testing for association between a numerical series and a categorical series with at least three(3) categories
* Testing for association between `Sales` and `Day` of the week.
  Testing to find out how much influence ***what day it is*** has on Sales

In [15]:
# Build one sub-frame per weekday by filtering rows on the "Day" column
sun, mon, tue, wed, thu, fri, sat = [
    data[data["Day"] == weekday]
    for weekday in (
        "Sunday", "Monday", "Tuesday", "Wednesday",
        "Thursday", "Friday", "Saturday",
    )
]

In [16]:
# One-way ANOVA on "Sales" across the seven weekday sub-frames
f1(*[day_frame['Sales'] for day_frame in (sun, mon, tue, wed, thu, fri, sat)])

F_onewayResult(statistic=1.4012440524909306, pvalue=0.20975131788753928)

***
`p_value` here is (0.210), which is greater than 0.05, so we fail to reject the null hypothesis. The `Oneway-ANOVA Test` is consistent with the outcome of the `Chi Squared Test`: there is **NO statistically significant relationship** between what `Day` it is and how customers spend or place orders.
***  

**Correlation CoEfficient Test**

Testing the strength of association between two numerical series 
* Testing for association between `Sales` and `Period` of day.
* Testing for association between `Sales` and `Price Each`

  Testing to find out how much influence ***what time it is*** or ***what price per item*** has on sales
  
  A `positive cce` indicates a **directly proportional relationship**, while a `negative cce` indicates an **inversely proportional relationship**. The strength of either outcome is what we're trying to determine.
  
  
* cce of `+/- 0.75 to 1` = ***Strong*** 
* cce of `+/- 0.5 to 0.74` = ***Moderate***
* cce of `+/- 0.3 to 0.49` = ***Weak***

In [20]:
# Correlation matrix for "Sales" against "Period";
# the off-diagonal entries hold the coefficient of interest
cce(x=data["Sales"], y=data["Period"])

array([[1.      , 0.001287],
       [0.001287, 1.      ]])

***
With a `cce` of (0.001287), the result falls well below even the ***Weak*** band of our strength criteria. It is in line with the null results of the `Chi Square Test` and `Oneway-ANOVA test` for the day of the week: there is **NO linear relationship/association** between the time of day (`Period`) and `Sales`.
***

In [24]:
# Correlation matrix for "Sales" against "Price Each";
# the off-diagonal entries hold the coefficient of interest
cce(x=data["Sales"], y=data["Price Each"])

array([[1.       , 0.9990597],
       [0.9990597, 1.       ]])

***
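With a `cce` of (0.9990597), this is a **Positive Correlation**. The result indicates a **STRONG *directly proportional relationship*** between `Sales` and `Price Each`. This is largely expected, since in the rows shown above `Sales` equals `Quantity Ordered` multiplied by `Price Each`.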
***

In [25]:
# Correlation matrix for "Sales" against "Quantity Ordered";
# the off-diagonal entries hold the coefficient of interest
cce(x=data["Sales"], y=data["Quantity Ordered"])

array([[ 1.        , -0.13958014],
       [-0.13958014,  1.        ]])

***
With a `cce` of (-0.13958014), this is a **Negative Correlation**. The result indicates a **WEAK *inversely proportional relationship*** between `Sales` and `Quantity Ordered`; at this magnitude it falls below even the ***Weak*** band of the criteria above.
This doesn't convincingly explain the similarity between our charts `Monthly Sales Trend` and `Monthly Orders Trend` as I hoped it would. 
***


* Not Enough Data points with which to carry out *conclusive* **Predictive Analysis**