# *Reshaping Data for Analysis*

In [None]:
import pandas as pd

# Load the long-format readings and tidy them in a single chain:
#   1. rename the generic 'value' column to 'temp_C',
#   2. parse the 'date' strings into datetimes,
#   3. derive a Fahrenheit column from the Celsius one.
long_df = (
    pd.read_csv('/content/long_data.csv')
    .rename(columns={'value': 'temp_C'})
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        temp_F=lambda frame: frame['temp_C'] * 9 / 5 + 32,
    )
)
long_df.head()


Unnamed: 0,attributes,datatype,date,station,temp_C,temp_F
0,",,H,0700",TMAX,2018-10-01,GHCND:USC00280907,21.1,69.98
1,",,H,0700",TMIN,2018-10-01,GHCND:USC00280907,8.9,48.02
2,",,H,0700",TOBS,2018-10-01,GHCND:USC00280907,13.9,57.02
3,",,H,0700",TMAX,2018-10-02,GHCND:USC00280907,23.9,75.02
4,",,H,0700",TMIN,2018-10-02,GHCND:USC00280907,13.9,57.02


In [None]:
# Transposing swaps rows and columns: each of the first five records
# becomes a column and each original column becomes a row.
long_df.head().transpose()


Unnamed: 0,0,1,2,3,4
attributes,",,H,0700",",,H,0700",",,H,0700",",,H,0700",",,H,0700"
datatype,TMAX,TMIN,TOBS,TMAX,TMIN
date,2018-10-01 00:00:00,2018-10-01 00:00:00,2018-10-01 00:00:00,2018-10-02 00:00:00,2018-10-02 00:00:00
station,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907
temp_C,21.1,8.9,13.9,23.9,13.9
temp_F,69.98,48.02,57.02,75.02,57.02


In [None]:
# pivot() reshapes long -> wide: every unique 'date' becomes a row label,
# every unique 'datatype' becomes a column, and the cells hold 'temp_C'.
# Each (date, datatype) pair must be unique, otherwise pivot() raises.
pivoted_df = long_df.pivot(
    index='date',
    columns='datatype',
    values='temp_C',
)
pivoted_df.head()

datatype,TMAX,TMIN,TOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-01,21.1,8.9,13.9
2018-10-02,23.9,13.9,17.2
2018-10-03,25.0,15.6,16.1
2018-10-04,22.8,11.7,11.7
2018-10-05,23.3,11.7,18.9


In [None]:
# Summary statistics per measurement type; after pivoting, each datatype
# (TMAX, TMIN, TOBS) is its own column, so describe() reports them side by side.
pivoted_df.describe()


datatype,TMAX,TMIN,TOBS
count,31.0,31.0,31.0
mean,16.829032,7.56129,10.022581
std,5.714962,6.513252,6.59655
min,7.8,-1.1,-1.1
25%,12.75,2.5,5.55
50%,16.1,6.7,8.3
75%,21.95,13.6,16.1
max,26.7,17.8,21.7


In [None]:
# Passing a list to `values` pivots several columns at once. The resulting
# columns form a two-level hierarchy: value column on top, 'datatype' below.
pivoted_df = long_df.pivot(
    index='date',
    columns='datatype',
    values=['temp_C', 'temp_F'],
)
pivoted_df.head()

Unnamed: 0_level_0,temp_C,temp_C,temp_C,temp_F,temp_F,temp_F
datatype,TMAX,TMIN,TOBS,TMAX,TMIN,TOBS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2018-10-01,21.1,8.9,13.9,69.98,48.02,57.02
2018-10-02,23.9,13.9,17.2,75.02,57.02,62.96
2018-10-03,25.0,15.6,16.1,77.0,60.08,60.98
2018-10-04,22.8,11.7,11.7,73.04,53.06,53.06
2018-10-05,23.3,11.7,18.9,73.94,53.06,66.02


In [None]:
# Drill into the hierarchical columns: take the 'temp_C' sub-frame first,
# then pick its 'TMAX' column, which yields a Series named 'TMAX'.
pivoted_df['temp_C'].loc[:, 'TMAX'].head()

date
2018-10-01    21.1
2018-10-02    23.9
2018-10-03    25.0
2018-10-04    22.8
2018-10-05    23.3
Name: TMAX, dtype: float64

In [None]:
# Build a two-level row index (date, datatype) from existing columns;
# set_index returns a new frame and leaves long_df untouched.
multi_index_df = long_df.set_index(keys=['date', 'datatype'])
multi_index_df.index

MultiIndex([('2018-10-01', 'TMAX'),
            ('2018-10-01', 'TMIN'),
            ('2018-10-01', 'TOBS'),
            ('2018-10-02', 'TMAX'),
            ('2018-10-02', 'TMIN'),
            ('2018-10-02', 'TOBS'),
            ('2018-10-03', 'TMAX'),
            ('2018-10-03', 'TMIN'),
            ('2018-10-03', 'TOBS'),
            ('2018-10-04', 'TMAX'),
            ('2018-10-04', 'TMIN'),
            ('2018-10-04', 'TOBS'),
            ('2018-10-05', 'TMAX'),
            ('2018-10-05', 'TMIN'),
            ('2018-10-05', 'TOBS'),
            ('2018-10-06', 'TMAX'),
            ('2018-10-06', 'TMIN'),
            ('2018-10-06', 'TOBS'),
            ('2018-10-07', 'TMAX'),
            ('2018-10-07', 'TMIN'),
            ('2018-10-07', 'TOBS'),
            ('2018-10-08', 'TMAX'),
            ('2018-10-08', 'TMIN'),
            ('2018-10-08', 'TOBS'),
            ('2018-10-09', 'TMAX'),
            ('2018-10-09', 'TMIN'),
            ('2018-10-09', 'TOBS'),
            ('2018-10-10', '

In [None]:
# Preview the frame: 'date' and 'datatype' now appear as index levels
# rather than as regular columns.
multi_index_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,attributes,station,temp_C,temp_F
date,datatype,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-10-01,TMAX,",,H,0700",GHCND:USC00280907,21.1,69.98
2018-10-01,TMIN,",,H,0700",GHCND:USC00280907,8.9,48.02
2018-10-01,TOBS,",,H,0700",GHCND:USC00280907,13.9,57.02
2018-10-02,TMAX,",,H,0700",GHCND:USC00280907,23.9,75.02
2018-10-02,TMIN,",,H,0700",GHCND:USC00280907,13.9,57.02


In [None]:
# Add a single TAVG reading for one date only, so that unstacking later
# produces missing values for TAVG on every other date.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement. The new row's date is given as a
# Timestamp so the 'date' column keeps its datetime64 dtype instead of
# degrading to object (mixed Timestamp/str), which would make sorting fragile.
tavg_row = pd.DataFrame([{
    'datatype': 'TAVG',
    'date': pd.Timestamp('2018-10-01'),
    'temp_C': 10,
    'temp_F': 50,
}])
extra_data = (
    pd.concat([long_df, tavg_row])  # columns absent from tavg_row become NaN
    .set_index(['date', 'datatype'])
    .sort_index()
)
extra_data.head(8)

  extra_data = long_df.append([{'datatype' : 'TAVG', 'date' : '2018-10-01', 'temp_C' : 10, 'temp_F' : 50}]).set_index(['date' , 'datatype']).sort_index()
  extra_data = long_df.append([{'datatype' : 'TAVG', 'date' : '2018-10-01', 'temp_C' : 10, 'temp_F' : 50}]).set_index(['date' , 'datatype']).sort_index()


Unnamed: 0_level_0,Unnamed: 1_level_0,attributes,station,temp_C,temp_F
date,datatype,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-10-01,TAVG,,,10.0,50.0
2018-10-01,TMAX,",,H,0700",GHCND:USC00280907,21.1,69.98
2018-10-01,TMIN,",,H,0700",GHCND:USC00280907,8.9,48.02
2018-10-01,TOBS,",,H,0700",GHCND:USC00280907,13.9,57.02
2018-10-02,TMAX,",,H,0700",GHCND:USC00280907,23.9,75.02
2018-10-02,TMIN,",,H,0700",GHCND:USC00280907,13.9,57.02
2018-10-02,TOBS,",,H,0700",GHCND:USC00280907,17.2,62.96
2018-10-03,TMAX,",,H,0700",GHCND:USC00280907,25.0,77.0


In [None]:
# Move the innermost index level ('datatype') into the columns. Dates that
# have no TAVG row end up with NaN in every TAVG column.
extra_data.unstack(level='datatype').head()

Unnamed: 0_level_0,attributes,attributes,attributes,attributes,station,station,station,station,temp_C,temp_C,temp_C,temp_C,temp_F,temp_F,temp_F,temp_F
datatype,TAVG,TMAX,TMIN,TOBS,TAVG,TMAX,TMIN,TOBS,TAVG,TMAX,TMIN,TOBS,TAVG,TMAX,TMIN,TOBS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2018-10-01,,",,H,0700",",,H,0700",",,H,0700",,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,10.0,21.1,8.9,13.9,50.0,69.98,48.02,57.02
2018-10-02,,",,H,0700",",,H,0700",",,H,0700",,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,,23.9,13.9,17.2,,75.02,57.02,62.96
2018-10-03,,",,H,0700",",,H,0700",",,H,0700",,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,,25.0,15.6,16.1,,77.0,60.08,60.98
2018-10-04,,",,H,0700",",,H,0700",",,H,0700",,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,,22.8,11.7,11.7,,73.04,53.06,53.06
2018-10-05,,",,H,0700",",,H,0700",",,H,0700",,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,,23.3,11.7,18.9,,73.94,53.06,66.02


In [None]:
# Same unstack, but holes created by the reshape are filled with -40.
# Note that fill_value applies to every column (including the string
# columns 'attributes' and 'station'), and that it only fills index
# combinations that were absent: the TAVG row that does exist for
# 2018-10-01 keeps the NaNs it already had in 'attributes'/'station'.
extra_data.unstack(level='datatype', fill_value=-40).head()


Unnamed: 0_level_0,attributes,attributes,attributes,attributes,station,station,station,station,temp_C,temp_C,temp_C,temp_C,temp_F,temp_F,temp_F,temp_F
datatype,TAVG,TMAX,TMIN,TOBS,TAVG,TMAX,TMIN,TOBS,TAVG,TMAX,TMIN,TOBS,TAVG,TMAX,TMIN,TOBS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2018-10-01,,",,H,0700",",,H,0700",",,H,0700",,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,10.0,21.1,8.9,13.9,50.0,69.98,48.02,57.02
2018-10-02,-40.0,",,H,0700",",,H,0700",",,H,0700",-40.0,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,-40.0,23.9,13.9,17.2,-40.0,75.02,57.02,62.96
2018-10-03,-40.0,",,H,0700",",,H,0700",",,H,0700",-40.0,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,-40.0,25.0,15.6,16.1,-40.0,77.0,60.08,60.98
2018-10-04,-40.0,",,H,0700",",,H,0700",",,H,0700",-40.0,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,-40.0,22.8,11.7,11.7,-40.0,73.04,53.06,53.06
2018-10-05,-40.0,",,H,0700",",,H,0700",",,H,0700",-40.0,GHCND:USC00280907,GHCND:USC00280907,GHCND:USC00280907,-40.0,23.3,11.7,18.9,-40.0,73.94,53.06,66.02


In [None]:
# Load the wide-format version of the data: one row per date with a
# separate column for each measurement (TMAX, TMIN, TOBS).
wide_df = pd.read_csv(filepath_or_buffer='/content/wide_data.csv')
wide_df.head()

Unnamed: 0,date,TMAX,TMIN,TOBS
0,2018-10-01,21.1,8.9,13.9
1,2018-10-02,23.9,13.9,17.2
2,2018-10-03,25.0,15.6,16.1
3,2018-10-04,22.8,11.7,11.7
4,2018-10-05,23.3,11.7,18.9


In [None]:
# melt() reshapes wide -> long: 'date' stays as the identifier, while the
# three measurement columns are folded into a (measurement, temp_C) pair.
melted_df = wide_df.melt(
    id_vars='date',                       # column(s) kept as-is on every row
    value_vars=['TMAX', 'TMIN', 'TOBS'],  # columns to fold into rows
    var_name='measurement',               # holds the former column name
    value_name='temp_C',                  # holds the former cell value
)
melted_df.head()

Unnamed: 0,date,measurement,temp_C
0,2018-10-01,TMAX,21.1
1,2018-10-02,TMAX,23.9
2,2018-10-03,TMAX,25.0
3,2018-10-04,TMAX,22.8
4,2018-10-05,TMAX,23.3


In [None]:
# Function form of melt: pd.melt(frame, ...) is equivalent to calling
# frame.melt(...) with the same keyword arguments.
pd.melt(
    wide_df,
    id_vars='date',
    value_vars=['TMAX', 'TMIN', 'TOBS'],
    var_name='measurement',
    value_name='temp_C',
).head()

Unnamed: 0,date,measurement,temp_C
0,2018-10-01,TMAX,21.1
1,2018-10-02,TMAX,23.9
2,2018-10-03,TMAX,25.0
3,2018-10-04,TMAX,22.8
4,2018-10-05,TMAX,23.3


In [None]:
# Move 'date' from a regular column into the index so that stack() can
# later pair each date with its measurement columns.
# The result is rebound instead of using inplace=True: pandas discourages
# inplace (no performance benefit, breaks method chaining).
# NOTE: run this once per fresh load of wide_df; afterwards 'date' is no
# longer a column, so cells that melt on id_vars='date' must run before it.
wide_df = wide_df.set_index('date')
wide_df.head()


Unnamed: 0_level_0,TMAX,TMIN,TOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-01,21.1,8.9,13.9
2018-10-02,23.9,13.9,17.2
2018-10-03,25.0,15.6,16.1
2018-10-04,22.8,11.7,11.7
2018-10-05,23.3,11.7,18.9


In [None]:
# stack() pivots the column labels (TMAX, TMIN, TOBS) into an inner index
# level, producing a Series indexed by (date, measurement).
stacked_series = wide_df.stack()
stacked_series.head()

date            
2018-10-01  TMAX    21.1
            TMIN     8.9
            TOBS    13.9
2018-10-02  TMAX    23.9
            TMIN    13.9
dtype: float64

In [None]:
# Promote the stacked Series to a one-column DataFrame; the argument
# supplies the name of that single column.
stacked_df = stacked_series.to_frame(name='values')
stacked_df

Unnamed: 0_level_0,Unnamed: 1_level_0,values
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-10-01,TMAX,21.1
2018-10-01,TMIN,8.9
2018-10-01,TOBS,13.9
2018-10-02,TMAX,23.9
2018-10-02,TMIN,13.9
...,...,...
2018-10-30,TMIN,2.2
2018-10-30,TOBS,5.0
2018-10-31,TMAX,12.2
2018-10-31,TMIN,0.0


In [None]:
# Inspect the two-level (date, measurement) MultiIndex created by stack().
stacked_df.index

MultiIndex([('2018-10-01', 'TMAX'),
            ('2018-10-01', 'TMIN'),
            ('2018-10-01', 'TOBS'),
            ('2018-10-02', 'TMAX'),
            ('2018-10-02', 'TMIN'),
            ('2018-10-02', 'TOBS'),
            ('2018-10-03', 'TMAX'),
            ('2018-10-03', 'TMIN'),
            ('2018-10-03', 'TOBS'),
            ('2018-10-04', 'TMAX'),
            ('2018-10-04', 'TMIN'),
            ('2018-10-04', 'TOBS'),
            ('2018-10-05', 'TMAX'),
            ('2018-10-05', 'TMIN'),
            ('2018-10-05', 'TOBS'),
            ('2018-10-06', 'TMAX'),
            ('2018-10-06', 'TMIN'),
            ('2018-10-06', 'TOBS'),
            ('2018-10-07', 'TMAX'),
            ('2018-10-07', 'TMIN'),
            ('2018-10-07', 'TOBS'),
            ('2018-10-08', 'TMAX'),
            ('2018-10-08', 'TMIN'),
            ('2018-10-08', 'TOBS'),
            ('2018-10-09', 'TMAX'),
            ('2018-10-09', 'TMIN'),
            ('2018-10-09', 'TOBS'),
            ('2018-10-10', '

In [None]:
# The inner level came from the column labels, so it has no name yet (None).
stacked_df.index.names

FrozenList(['date', None])

In [None]:
# Name both index levels in place: the outer level stays 'date' and the
# previously unnamed inner level becomes 'datatype'.
stacked_df.index.names = ['date', 'datatype']
stacked_df.index.names

FrozenList(['date', 'datatype'])