In [1]:
import pandas as pd
import psycopg2
from getpass import getpass 
#import math
from scipy import stats

In [2]:
# Database password. It is left empty here; uncomment the getpass prompt
# instead of editing the literal if the server requires a real password, so
# that no credential is ever saved in the notebook.
#pa = getpass('please enter the db admin password:')
pa = ''
# Pass the connection parameters as keywords rather than %-formatting them
# into a DSN string: psycopg2 then quotes each value itself, so a password
# containing spaces, quotes or backslashes cannot corrupt the DSN.
conn = psycopg2.connect(dbname='hhc_db1', user='postgres', password=pa)
cur = conn.cursor()

In [3]:
# Pull the zone name and dwell time (seconds, per the column name) for every
# row whose zone name starts with "Victoria" or "Regal Theater".
zoneDwellSql = '''select 
                current_zone_name, 
                dwell_time_in_sec          
                from mexia_ssi_data
                where current_zone_name like ('Victoria%')
                or current_zone_name like ('Regal Theater%')
                ;'''
cur.execute(zoneDwellSql)

In [4]:
# Materialise the query result as a DataFrame, labelling the columns with the
# names reported by the cursor (first field of each description entry).
fetchedRows = cur.fetchall()
columnNames = [columnInfo[0] for columnInfo in cur.description]
rawExtract = pd.DataFrame(fetchedRows, columns=columnNames)

In [5]:
def removeCoordinates(x):
    """Strip a trailing compass-direction token from a zone name.

    If the last whitespace-separated token of ``x`` is one of N, S, E, W,
    NE, NW, SE or SW, that token is removed. In every case the result has
    surrounding whitespace stripped.

    Parameters
    ----------
    x : str
        Zone name, e.g. ``"Victoria's Secret NE"``.

    Returns
    -------
    str
        The zone name without the trailing direction token. Values that
        have no ``split`` method (e.g. ``None`` from a NULL column) are
        returned unchanged instead of raising.
    """
    try:
        tokens = x.split()
    except AttributeError:
        # Not a string (NULL / NaN): nothing to strip.
        return x

    # Empty or whitespace-only names have no last token to inspect.
    if not tokens:
        return x.strip()

    lastToken = tokens[-1]
    if lastToken in ('N', 'S', 'E', 'W', 'NE', 'NW', 'SE', 'SW'):
        # Drop trailing whitespace first so the slice removes the direction
        # token itself rather than the whitespace that follows it.
        return x.rstrip()[:-len(lastToken)].strip()
    return x.strip()

In [6]:
# Derive a grouping key by stripping the trailing compass direction from each
# zone name, so the sub-zones of one store share a single label.
rawExtract['current_zone_grouped'] = rawExtract['current_zone_name'].map(removeCoordinates)


In [None]:
#Null hypothesis
# The time spent by the shoppers at Victoria's Secret is equal 
# to the time spent at the Regal Movie Theater

#Alternative hypothesis
# The time spent is not equal between the two stores.

#Confidence level = 95%
#alpha (significance level) = 0.05

In [7]:
# Split the extract into one frame per store using the grouped zone label.
isRegal = rawExtract['current_zone_grouped'] == 'Regal Theater'
isVictoriasSecret = rawExtract['current_zone_grouped'] == "Victoria's Secret"

regal = rawExtract.loc[isRegal]
vs = rawExtract.loc[isVictoriasSecret]

In [8]:
#Regal Theater
n1 = regal.shape[0]
x1 = regal['dwell_time_in_sec'].mean()
s1 = regal['dwell_time_in_sec'].var()

#Victoria's Secret
n2 = vs.shape[0]
x2 = vs['dwell_time_in_sec'].mean()
s2 = vs['dwell_time_in_sec'].var()

k = int(n1) + int(n2) -2
 

In [9]:
# Two-sample t-test on dwell time; equal_var=False selects Welch's test, which
# does not assume the two stores share a common variance.
regalDwellSeconds = regal['dwell_time_in_sec'].values
vsDwellSeconds = vs['dwell_time_in_sec'].values

stats.ttest_ind(regalDwellSeconds, vsDwellSeconds, equal_var=False)

Ttest_indResult(statistic=22.505994882595544, pvalue=4.4876645886373825e-112)

<h2> histograms </h2>

In [10]:
import plotly.plotly as py
import plotly.graph_objs as go

In [11]:
# Plotly API key for the account below. Left blank here; prompt for it with
#   plotlyPass = getpass('please enter the ploty API secret key:')
# rather than saving a real key in the notebook.
plotlyPass = ''
py.sign_in('CJ_Foley', plotlyPass)

In [13]:
# Semi-transparent histogram traces built directly from the raw dwell times.
regalTrace = go.Histogram(x=regal['dwell_time_in_sec'], opacity=0.75)
vsTrace = go.Histogram(x=vs['dwell_time_in_sec'], opacity=0.75)

# NOTE(review): only the Regal trace is plotted; vsTrace is built but never
# added, even though the layout uses overlay mode.
dataSets = [regalTrace]
chartLayout = go.Layout(barmode='overlay')

# NOTE(review): the saved output shows Plotly rejecting this figure for having
# too many points; plotting pre-aggregated counts would avoid that limit.
fig = go.Figure(data=dataSets, layout=chartLayout)
py.iplot(fig)


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




In [14]:
# Count Regal Theater rows per distinct dwell time. After the groupby the
# remaining 'current_zone_name' column holds the row count for each group.
regalHistData = (
    regal
    .groupby(['current_zone_grouped', 'dwell_time_in_sec'])
    .count()
    .reset_index()
    .loc[:, ['dwell_time_in_sec', 'current_zone_name']]
)
regalHistData.columns = ['Time Spent', 'Visitor Count']

In [15]:
# Count Victoria's Secret rows per distinct dwell time. After the groupby the
# remaining 'current_zone_name' column holds the row count for each group.
vsHistData = (
    vs
    .groupby(['current_zone_grouped', 'dwell_time_in_sec'])
    .count()
    .reset_index()
    .loc[:, ['dwell_time_in_sec', 'current_zone_name']]
)
vsHistData.columns = ['Time Spent', 'Visitor Count']

<h2> all data </h2>

In [19]:
# Grouped bar chart of visitor counts per dwell time for both stores, over
# the full range of dwell times.
trace1 = go.Bar(
    x=regalHistData['Time Spent'],
    y=regalHistData['Visitor Count'],
    name='Regal Theater')

# Legend label corrected from "Victori's Secret".
trace2 = go.Bar(
    x=vsHistData['Time Spent'],
    y=vsHistData['Visitor Count'],
    name="Victoria's Secret")

data = [trace1,trace2]

# Title and axis labels added so the figure stands alone, matching the
# range-restricted charts elsewhere in this notebook.
layout = go.Layout(
    title='''Distribution of Visitors for Victoria's Secret and Regal Theaters<br>All Dwell Times''',
    xaxis=dict(title='Time Spent'),
    yaxis=dict(title='Number of Visitors'),
    barmode='group')

# NOTE(review): every chart in this notebook uploads under the same filename,
# so each one replaces the previous chart in the Plotly account.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Time Spent Histogram')

In [41]:
#between 1 and 150 minutes (dwell time is in seconds: 60 s to 9000 s)
# The previous bounds (> 0 and < 900000000) kept essentially every row, which
# contradicted both this comment and the chart title. Bounds are exclusive,
# like the other range-restricted charts in this notebook.

regalData = regalHistData.loc[(regalHistData['Time Spent'] > 60) & (regalHistData['Time Spent'] < 9000)].copy()
vsData = vsHistData.loc[(vsHistData['Time Spent'] > 60) & (vsHistData['Time Spent'] < 9000)].copy()


trace1 = go.Bar(
    x=regalData['Time Spent'],
    y=regalData['Visitor Count'],
    name='Regal Theater')

# Legend label corrected from "Victori's Secret".
trace2 = go.Bar(
    x=vsData['Time Spent'],
    y=vsData['Visitor Count'],
    name="Victoria's Secret")

data = [trace1,trace2]

layout = go.Layout(
    title='''Distribution of Visitors for Victoria's Secret and Regal Theaters<br>One to One-Hundred and Fifty Minutes''',
    xaxis=dict(title='Time Spent'),
    yaxis=dict(title='Number of Visitors'),
    barmode='group')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Time Spent Histogram')

In [42]:
#between 1 and 10 minutes (dwell time is in seconds: 60 s to 600 s, exclusive)
# The comment previously said 30 minutes, but the filter and the chart title
# both cover one to ten minutes.

regalData = regalHistData.loc[(regalHistData['Time Spent'] > 60) & (regalHistData['Time Spent'] < 600)].copy()
vsData = vsHistData.loc[(vsHistData['Time Spent'] > 60) & (vsHistData['Time Spent'] < 600)].copy()


trace1 = go.Bar(
    x=regalData['Time Spent'],
    y=regalData['Visitor Count'],
    name='Regal Theater')

# Legend label corrected from "Victori's Secret".
trace2 = go.Bar(
    x=vsData['Time Spent'],
    y=vsData['Visitor Count'],
    name="Victoria's Secret")

data = [trace1,trace2]

layout = go.Layout(
    title='''Distribution of Visitors for Victoria's Secret and Regal Theaters<br>One to Ten Minutes''',
    xaxis=dict(title='Time Spent'),
    yaxis=dict(title='Number of Visitors'),
    barmode='group')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Time Spent Histogram')

In [40]:
#between 90 minutes and 150 minutes (dwell time is in seconds: 5400 s to 9000 s, exclusive)

regalData = regalHistData.loc[(regalHistData['Time Spent'] > 5400) & (regalHistData['Time Spent'] < 9000)].copy()
vsData = vsHistData.loc[(vsHistData['Time Spent'] > 5400) & (vsHistData['Time Spent'] < 9000)].copy()


trace1 = go.Bar(
    x=regalData['Time Spent'],
    y=regalData['Visitor Count'],
    name='Regal Theater')

# Legend label corrected from "Victori's Secret".
trace2 = go.Bar(
    x=vsData['Time Spent'],
    y=vsData['Visitor Count'],
    name="Victoria's Secret")

data = [trace1,trace2]

# Title restored (it was commented out) so this chart is labelled like the
# other range-restricted charts in this notebook.
layout = go.Layout(
    title='''Distribution of Visitors for Victoria's Secret and Regal Theaters<br>Ninety to One-hundred and Fifty Minutes''',
    xaxis=dict(title='Time Spent'),
    yaxis=dict(title='Number of Visitors'),
    barmode='group')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Time Spent Histogram')

In [17]:
# Show every row except the last five (same as head(-5)).
# NOTE(review): confirm a short preview such as head(5) was not intended.
vsHistData.iloc[:-5]

Unnamed: 0,Time Spent,Visitor Count
0,0,14
1,10,41429
2,20,11913
3,30,7580
4,40,6309
5,50,4565
6,60,5065
7,70,2608
8,80,2108
9,90,2467


In [None]:
# `tTest` was referenced without ever being assigned, which raises NameError
# on a fresh kernel. Bind it to the Welch t-test result before displaying it.
tTest = stats.ttest_ind(regal['dwell_time_in_sec'].values,
                        vs['dwell_time_in_sec'].values,
                        equal_var=False)
tTest

In [None]:
# Function-call form of print works on both Python 2 and Python 3; the bare
# `print k` statement is a SyntaxError on Python 3.
print(k)

In [None]:
import scipy

In [None]:
import math

# Scratch check of the math import: displays the square root of 9 (3.0).
math.sqrt(9)

In [None]:
# Variance of Regal Theater dwell times (same value stored in s1 earlier).
regal.loc[:, 'dwell_time_in_sec'].var()

In [None]:
# One-column boolean frame: True where the grouped zone label is Regal Theater.
rawExtract[['current_zone_grouped']].eq('Regal Theater')

In [None]:
# Mean dwell time for each grouped zone label.
rawExtract.groupby('current_zone_grouped')['dwell_time_in_sec'].agg('mean')

In [None]:
import numpy as np,statsmodels.stats.api as sms