# Bonus: Temperature Analysis I

In [71]:
import pandas as pd
import scipy.stats as stats
from datetime import datetime as dt

In [72]:
# "tobs" is "temperature observations"
# Read the Hawaii measurements CSV into a DataFrame.
df_temperature = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
# Show 20 randomly sampled rows as a quick sanity check of the contents.
df_temperature.sample(n=20)

Unnamed: 0,station,date,prcp,tobs
9917,USC00519523,2011-02-07,0.95,67
11174,USC00519523,2014-08-25,0.0,78
7016,USC00514830,2015-08-29,0.0,85
19525,USC00516128,2017-07-29,0.3,77
18873,USC00516128,2015-09-01,0.07,77
5189,USC00513117,2016-11-30,0.05,74
17595,USC00516128,2011-11-21,1.15,72
3304,USC00513117,2011-08-07,0.01,76
2000,USC00519397,2015-08-26,0.62,77
15605,USC00511918,2011-10-26,0.01,69


In [73]:
# Inspect the column dtypes and non-null counts.
# The `date` column is loaded as `object`, so it needs to be converted
# to a datetime dtype before any date-based work.
df_temperature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  19550 non-null  object 
 1   date     19550 non-null  object 
 2   prcp     18103 non-null  float64
 3   tobs     19550 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 611.1+ KB


In [74]:
# Cast the `date` column from object to datetime64[ns]; all other columns keep their dtype.
df_temperature = df_temperature.astype({'date': 'datetime64[ns]'})
# Re-check the dtypes to confirm the conversion took effect.
df_temperature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [75]:
# Derive a numeric `month` column from `date` so that the same calendar month
# can be compared across all years.
df_temperature = df_temperature.assign(month=df_temperature['date'].dt.month)
df_temperature

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.00,63,1
2,USC00519397,2010-01-03,0.00,74,1
3,USC00519397,2010-01-04,0.00,76,1
4,USC00519397,2010-01-06,,73,1
...,...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71,8
19546,USC00516128,2017-08-20,,78,8
19547,USC00516128,2017-08-21,0.56,76,8
19548,USC00516128,2017-08-22,0.50,76,8


In [85]:
# Select every row whose month number is 6 (June), across all years,
# and keep the temperature observations as a Series.
df_june = df_temperature.loc[df_temperature['month'].eq(6)]
june_temp = df_june['tobs']
june_temp

133      78
134      76
135      78
136      76
137      77
         ..
19492    79
19493    74
19494    74
19495    76
19496    75
Name: tobs, Length: 1700, dtype: int64

In [86]:
# Summary statistics (count, mean, std, min, quartiles, max) of the June temperatures.
june_temp.describe()

count    1700.000000
mean       74.944118
std         3.257417
min        64.000000
25%        73.000000
50%        75.000000
75%        77.000000
max        85.000000
Name: tobs, dtype: float64

In [114]:
# Select every row whose month number is 12 (December), across all years,
# and keep the temperature observations as a Series.
df_december = df_temperature.loc[df_temperature['month'].eq(12)]
december_temp = df_december['tobs']
december_temp

305      76
306      74
307      74
308      64
309      64
         ..
19323    71
19324    71
19325    69
19326    65
19327    65
Name: tobs, Length: 1517, dtype: int64

In [79]:
# Summary statistics (count, mean, std, min, quartiles, max) of the December temperatures.
december_temp.describe()

count    1517.000000
mean       71.041529
std         3.745920
min        56.000000
25%        69.000000
50%        71.000000
75%        74.000000
max        83.000000
Name: tobs, dtype: float64

In [58]:
# Use the `date` values as the DataFrame index (in place).
# drop=False keeps `date` as a regular column as well, so at this point
# the frame carries the dates both in the index and in a column.
df_temperature.set_index('date', drop=False, inplace=True)
df_temperature

Unnamed: 0_level_0,station,date,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65,1
2010-01-02,USC00519397,2010-01-02,0.00,63,1
2010-01-03,USC00519397,2010-01-03,0.00,74,1
2010-01-04,USC00519397,2010-01-04,0.00,76,1
2010-01-06,USC00519397,2010-01-06,,73,1
...,...,...,...,...,...
2017-08-19,USC00516128,2017-08-19,0.09,71,8
2017-08-20,USC00516128,2017-08-20,,78,8
2017-08-21,USC00516128,2017-08-21,0.56,76,8
2017-08-22,USC00516128,2017-08-22,0.50,76,8


In [59]:
# Remove the `date` column in place.
# NOTE: this raises KeyError if the column is already gone (e.g. when the cell is run twice).
df_temperature.drop(columns='date', inplace=True)

In [60]:
# Display the frame to confirm the `date` column has been removed.
df_temperature

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,0.08,65,1
2010-01-02,USC00519397,0.00,63,1
2010-01-03,USC00519397,0.00,74,1
2010-01-04,USC00519397,0.00,76,1
2010-01-06,USC00519397,,73,1
...,...,...,...,...
2017-08-19,USC00516128,0.09,71,8
2017-08-20,USC00516128,,78,8
2017-08-21,USC00516128,0.56,76,8
2017-08-22,USC00516128,0.50,76,8


### Compare June and December data across all years 

In [105]:
# Check how many data points each month has; this informs which kind of t-test to run.
# June has more observations than December, so the samples cannot be matched
# one-to-one and an unpaired (independent) t-test is used.
june_temp.count()

1700

In [106]:
# Number of December temperature observations, for comparison with the June count.
december_temp.count()

1517

In [107]:
# Average June temperature across all years and stations in the data set.
june_temp.mean()

74.94411764705882

In [108]:
# Average December temperature across all years and stations in the data set.
december_temp.mean()

71.04152933421226

### Analysis

In [111]:
#What are the hypotheses of an unpaired t-test?
#The null hypothesis (H0) states that the two population means are equal, i.e. there is no difference between the June and December mean temperatures.
#The alternative hypothesis (H1) states that the two population means are different,
#and that the observed difference is unlikely to be caused by sampling error or chance.

### Considerations

In [112]:
# Unpaired (independent) two-sample t-test: compares the mean June temperature
# with the mean December temperature. The two samples have different numbers of
# data points, so they are treated as independent groups rather than matched pairs.
# If the p-value is smaller than 0.05, we reject the null hypothesis and conclude
# that the difference between the two group means is statistically significant.
# NOTE(review): ttest_ind is called with its defaults; per the SciPy docs the default
# assumes equal population variances. Confirm that is intended, or pass
# equal_var=False to run Welch's t-test instead.
stats.ttest_ind(june_temp, december_temp)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

In [None]:
# Close Session
# No cell shown in this notebook creates a `session` object (the data is read
# from a CSV with pandas), so an unconditional `session.close()` raises
# NameError on a fresh Restart & Run All. Only close a session if one exists.
if 'session' in globals():
    session.close()