# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Read the measurements CSV into the working frame and preview the first rows
measurements_path = './Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_path)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Summarize each column's dtype and non-null count for the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  19550 non-null  object 
 1   date     19550 non-null  object 
 2   prcp     18103 non-null  float64
 3   tobs     19550 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 611.1+ KB


In [4]:
# Parse the string 'date' column into datetimes and store the result as a new
# 'Date' column (the original 'date' column is left in place for now)
parsed_dates = pd.to_datetime(df['date'])
df['Date'] = parsed_dates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  object        
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
 4   Date     19550 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 763.8+ KB


In [5]:
# Preview the frame re-indexed by the parsed 'Date' column.
# NOTE(review): set_index returns a new DataFrame and the result is not assigned
# here, so this cell only displays the re-indexed view; `df` itself keeps its
# original integer index and its 'Date' column, which the later month filters
# read as a regular column.
df.set_index(['Date'])

Unnamed: 0_level_0,station,date,prcp,tobs
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65
2010-01-02,USC00519397,2010-01-02,0.00,63
2010-01-03,USC00519397,2010-01-03,0.00,74
2010-01-04,USC00519397,2010-01-04,0.00,76
2010-01-06,USC00519397,2010-01-06,,73
...,...,...,...,...
2017-08-19,USC00516128,2017-08-19,0.09,71
2017-08-20,USC00516128,2017-08-20,,78
2017-08-21,USC00516128,2017-08-21,0.56,76
2017-08-22,USC00516128,2017-08-22,0.50,76


In [6]:
# Drop the original string 'date' column, keeping the parsed 'Date' column.
# `columns=` is used instead of a positional axis because the positional axis
# argument of DataFrame.drop was deprecated in pandas 1.3 and removed in 2.0.
clean_df = df.drop(columns="date")
clean_df

Unnamed: 0,station,prcp,tobs,Date
0,USC00519397,0.08,65,2010-01-01
1,USC00519397,0.00,63,2010-01-02
2,USC00519397,0.00,74,2010-01-03
3,USC00519397,0.00,76,2010-01-04
4,USC00519397,,73,2010-01-06
...,...,...,...,...
19545,USC00516128,0.09,71,2017-08-19
19546,USC00516128,,78,2017-08-20
19547,USC00516128,0.56,76,2017-08-21
19548,USC00516128,0.50,76,2017-08-22


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [58]:
# Split the cleaned observations into June-only and December-only frames
# (all years together), using the month name of each parsed date.
month_names = clean_df['Date'].dt.month_name()
june_df = clean_df[month_names == "June"]
dec_df = clean_df[month_names == "December"]
display(june_df, dec_df)

Unnamed: 0,station,prcp,tobs,Date
133,USC00519397,0.00,78,2010-06-01
134,USC00519397,0.01,76,2010-06-02
135,USC00519397,0.00,78,2010-06-03
136,USC00519397,0.00,76,2010-06-04
137,USC00519397,0.00,77,2010-06-05
...,...,...,...,...
19492,USC00516128,0.02,79,2017-06-26
19493,USC00516128,0.10,74,2017-06-27
19494,USC00516128,0.02,74,2017-06-28
19495,USC00516128,0.04,76,2017-06-29


Unnamed: 0,station,prcp,tobs,Date
305,USC00519397,0.04,76,2010-12-01
306,USC00519397,0.00,74,2010-12-03
307,USC00519397,0.00,74,2010-12-04
308,USC00519397,0.00,64,2010-12-06
309,USC00519397,0.00,64,2010-12-07
...,...,...,...,...
19323,USC00516128,0.14,71,2016-12-27
19324,USC00516128,0.14,71,2016-12-28
19325,USC00516128,1.03,69,2016-12-29
19326,USC00516128,2.37,65,2016-12-30


In [59]:
# Mean of every June temperature observation in the filtered frame
jun_avg_temp = june_df.loc[:, "tobs"].mean()
jun_avg_temp


74.94411764705882

In [60]:
# Mean of every December temperature observation in the filtered frame
dec_avg_temp = dec_df.loc[:, "tobs"].mean()
dec_avg_temp


71.04152933421226

In [80]:
# Keep only the observation date and temperature for each month's sample
temp_columns = ["Date", "tobs"]
jun_temp = june_df[temp_columns]
dec_temp = dec_df[temp_columns]
dec_temp

Unnamed: 0,Date,tobs
305,2010-12-01,76
306,2010-12-03,74
307,2010-12-04,74
308,2010-12-06,64
309,2010-12-07,64
...,...,...
19323,2016-12-27,71
19324,2016-12-28,71
19325,2016-12-29,69
19326,2016-12-30,65


In [86]:
# Run an unpaired (independent two-sample) t-test on June vs. December temperatures.
# equal_var=False selects Welch's t-test, which does not assume equal variances.
# ttest_ind is called through the `stats` module imported from scipy earlier in
# the notebook; the bare name `ttest_ind` is never imported and raises NameError
# on a fresh kernel.
t, p = stats.ttest_ind(jun_temp['tobs'], dec_temp['tobs'], equal_var=False)
display(t, p)

31.355036920962423

4.193529835915755e-187

### Analysis

In [89]:
# Summary statistics (count, mean, std, quartiles, min/max) for the June observations
june_df.describe()


Unnamed: 0,prcp,tobs
count,1574.0,1700.0
mean,0.13636,74.944118
std,0.335731,3.257417
min,0.0,64.0
25%,0.0,73.0
50%,0.02,75.0
75%,0.12,77.0
max,4.43,85.0


In [90]:
# Summary statistics (count, mean, std, quartiles, min/max) for the December observations
dec_df.describe()

Unnamed: 0,prcp,tobs
count,1405.0,1517.0
mean,0.216819,71.041529
std,0.541399,3.74592
min,0.0,56.0
25%,0.0,69.0
50%,0.03,71.0
75%,0.15,74.0
max,6.42,83.0


The analysis shows that in June the minimum recorded temperature was 64, the maximum was 85, and the average was about 74.9.
For December the minimum was 56, the maximum was 83, and the average was about 71.0.

A paired t-test is not appropriate here because the two datasets differ in size (1,700 June records versus 1,517 December records), so the observations cannot be matched one-to-one; an unpaired t-test is used instead.