In [1]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Disable row truncation: with max_rows set to None pandas displays every row
# of a DataFrame/Series instead of eliding the middle.
pd.options.display.max_rows = None

In [2]:
# Name of the CSV file that the read_csv call below loads
# (a relative path, so it is resolved against the current working directory).
filename = "KTUS.csv"

In [3]:
# Column names assigned to the CSV columns, in file order.
raw_columns = [
    'Station_ID',
    'Date_Time',
    'air_temp_set_1_F',
    'dew_point_temperature_set_1_F',
    'precip_accum_one_hour_set_1_Inches',
    'dew_point_temperature_set_1d_F',
]

# Load the file into a DataFrame: the first 7 rows are skipped, the names above
# are applied to the columns, and Date_Time is used as the index.
df = pd.read_csv(filename, sep=',', skiprows=7, names=raw_columns, index_col='Date_Time')

# Give the measurement columns shorter names (Station_ID keeps its name;
# the index is not renamed).
df = df.rename(columns={
    'air_temp_set_1_F': 'Air_Temp_F',
    'dew_point_temperature_set_1_F': 'Dewpoint_Temp_F',
    'precip_accum_one_hour_set_1_Inches': 'Precip_1Hour_Accum_Inches',
    'dew_point_temperature_set_1d_F': 'Dewpoint_Temp_set2_F',
})

# Convert the index values to datetimes so .hour etc. are available on it.
df.index = pd.to_datetime(df.index)

In [4]:
# Pull the hour of each index timestamp and the columns of interest out of the
# DataFrame as plain Python lists (.tolist() returns a list, not a numpy array).
Hour = df.index.hour.tolist()
Station_ID = df['Station_ID'].values.tolist()
Air_Temp_F = df['Air_Temp_F'].values.tolist()
Precip_1Hour_Accum_Inches = df['Precip_1Hour_Accum_Inches'].values.tolist()
# NOTE(review): Dewpoint_Temp_F is filled from the Dewpoint_Temp_set2_F column,
# not from the Dewpoint_Temp_F column -- confirm this is intentional.
Dewpoint_Temp_F = df['Dewpoint_Temp_set2_F'].values.tolist()

In [5]:
# Basic descriptive statistics of Air_Temp_F, printed one per line in this
# order: minimum, maximum, mean, standard deviation, skewness, kurtosis.
# np.std is called with its default ddof=0 and stats.kurtosis with its default
# Fisher definition (a normal distribution gives 0.0).
for summarize in (min, max, np.mean, np.std, stats.skew, stats.kurtosis):
    print(summarize(Air_Temp_F))

68.0
107.6
85.41520746432492
8.875432869627222
0.40619592098807733
-0.582210333914496


In [6]:
# Start with an empty list that will collect the positions (list indices)
# of the readings of interest.
ilist = []

# Walk the temperature and hour lists in lockstep and keep the position of
# every reading whose Air_Temp_F value is above 85 and whose hour is strictly
# between 6 and 18. Several conditions can be combined with `and` / `or`.
for i, (temp_f, hour) in enumerate(zip(Air_Temp_F, Hour)):
    if not (temp_f > 85 and 6 < hour < 18):
        continue
    ilist.append(i)

# The length of ilist is the number of readings that met the criteria;
# print it next to the total number of readings.
print(len(ilist),' of ',len(Air_Temp_F))

786  of  9110


In [7]:
# Same selection pattern as the explicit for loop, written as a single
# list comprehension.
# NOTE(review): the temperature cutoff here is 75, whereas the loop that
# builds `ilist` uses 85, so ilist2 is a different (larger) selection than
# ilist -- confirm which threshold is intended.
ilist2 = [
    position
    for position, (temp_f, hour) in enumerate(zip(Air_Temp_F, Hour))
    if temp_f > 75 and 6 < hour < 18
]
print(len(ilist2),' of ',len(Air_Temp_F))

3558  of  9110


In [8]:
# Build the subset of temperatures that met the criteria: look up the
# Air_Temp_F value at every position stored in ilist (the list produced by
# the for loop, not ilist2).
subset = [Air_Temp_F[position] for position in ilist]

In [9]:
# Draw a comparison sample from the full Air_Temp_F list, the same size as
# `subset`. choice() samples with replacement by default, so a reading may be
# drawn more than once, and the pool includes the readings that are in `subset`.
# A seeded generator is used so the sample -- and the t-test that consumes it --
# is identical on every Restart & Run All instead of changing run to run.
SEED = 42
rng = np.random.default_rng(SEED)
random_sample = rng.choice(Air_Temp_F, size=len(subset))

In [11]:
# Perform an independent two-sample t-test (pooled variance, equal_var=True)
# comparing `subset` with the random sample. `subset` is built from ilist,
# i.e. the readings with Air_Temp_F > 85 in the selected hours.
t_result = stats.ttest_ind(subset, random_sample, equal_var=True)
statistic, pvalue = t_result.statistic, t_result.pvalue
print('statistic',statistic,'pvalue',pvalue)

statistic 11.811618620616914 pvalue 6.623899453845563e-31
