# INTRODUCTION TO PANDAS

### PART II

In [None]:
import numpy  as np
import pandas as pd

## 2.5 Read an Excel file into a pandas DataFrame

##### > Read Excel File

In [None]:
pd.read_excel('Test.xlsx')

##### > Skip rows at the beginning (0-indexed)

In [None]:
pd.read_excel('Test.xlsx', skiprows = 4)

##### > Select a subset of columns

In [None]:
pd.read_excel('Test.xlsx', skiprows = 4, usecols = 'E:J') # Not supported in old versions

In [None]:
pd.read_excel('Test.xlsx', skiprows = 4,  usecols = [4,5,6,7,8,9])

In [None]:
columns = list(range(4,10))
pd.read_excel('Test.xlsx', skiprows = 4,  usecols = columns)

##### > Skip rows starting from the end (0-indexed)

In [None]:
pd.read_excel('Test.xlsx', skiprows = 4, usecols = 'E:J', skipfooter = 0) # Test with skipfooter = 5

##### > Column to use as the row labels of the DataFrame (0-indexed)

In [None]:
df = pd.read_excel('Test.xlsx', skiprows = 4, skipfooter = 0, usecols = 'E:J', index_col = 0)
df

In [None]:
df.index

In [None]:
df.columns

##### > Row to use for the column labels of the parsed DataFrame (0-indexed)

In [None]:
df = pd.read_excel('Test.xlsx', skiprows = 4, skipfooter = 0, usecols = 'E:J', index_col = 0, header = 1)
df

In [None]:
df.index

In [None]:
df.columns

##### > List of column names to use

In [None]:
# Replace the labels read from the sheet with our own list of column names.
columnNames = ['Row/Column', 'COL1', 'COL2', 'COL3', 'COL4', 'COL5']
pd.read_excel(
    'Test.xlsx',
    usecols='E:J',
    skiprows=4,
    skipfooter=0,
    header=1,
    index_col=0,
    names=columnNames,
)

##### > Any column of the DataFrame can be set also as index using set_index()

In [None]:
# Return a re-indexed copy of df that uses column 'C4' as the row index.
# drop = True removes 'C4' from the columns, append = False replaces the
# current index instead of extending it, and inplace = False leaves df
# itself unchanged (the result is only displayed).
df.set_index(['C4'], drop = True, append = False, inplace = False) 
# Variant: keep 'C4' as a column, add it as an extra index level and modify df in place.
# df.set_index(['C4'], drop = False, append = True, inplace = True)

#    EXAMPLE                                      

In [None]:
#! type data_AEMET.csv

##### > Read data (from a csv file)

In [None]:
pd.read_csv('data_AEMET.csv').head()

##### > Separate fields (tab-delimited file)

In [None]:
pd.read_csv('data_AEMET.csv', delimiter= '\t').head()

##### > Select columns [fecha nombre prec presMax presMin sol tmax tmed tmin velmedia]

In [None]:
pd.read_csv('data_AEMET.csv', delimiter= '\t', usecols = [1, 3, 4, 5, 6, 9, 10, 11, 12, 13]).head()

##### > Select dates as index

In [None]:
pd.read_csv('data_AEMET.csv', delimiter= '\t', usecols = [1, 3, 4, 5, 6, 9, 10, 11, 12, 13],
             index_col = 0).head()

##### > Modify column labels 
names = [ 'DATE', 'NAME', 'PRECIP', 'PMAX', 'PMIN', 'SOL', 'TMAX', 'TMED', 'TMIN', 'VMED']

In [None]:
pd.read_csv('data_AEMET.csv', delimiter= '\t', usecols = [1, 3, 4, 5, 6, 9, 10, 11, 12, 13],
             index_col = 0,
             names = [ 'DATE', 'NAME', 'PRECIP', 'PMAX', 'PMIN', 'SOL', 'TMAX', 'TMED', 'TMIN', 'VMED']).head()

##### > Remove first row

In [None]:
# Final load of the AEMET data: tab-separated file, ten selected columns
# relabelled with the names below, the first of them (DATE) used as the index.
# Because explicit names are supplied, the first line of the file is skipped.
source_columns = [1, 3, 4, 5, 6, 9, 10, 11, 12, 13]
column_labels = ['DATE', 'NAME', 'PRECIP', 'PMAX', 'PMIN', 'SOL', 'TMAX', 'TMED', 'TMIN', 'VMED']
df = pd.read_csv(
    'data_AEMET.csv',
    sep='\t',
    usecols=source_columns,
    names=column_labels,
    index_col=0,
    skiprows=1,
)
df.head()

## 2.6 Exploring the data

##### > Data types of each column

In [None]:
df.dtypes

##### > Index

In [None]:
df.index

##### > Columns

In [None]:
df.columns

##### > Show the first 10 rows

In [None]:
df.head(10)

##### >  Show the last 10 rows

In [None]:
df.tail(10)

##### > Data frame info

In [None]:
df.info()

##### > Data frame shape

In [None]:
df.shape

##### > Total number of empty elements in each column

In [None]:
df.isnull().sum()

##### > Total number of non-empty elements in each column

In [None]:
df.count(axis = 0)

## 2.7 Some statistical data

In [None]:
df.describe()

##### > Fill Na/NaN elements (empty elements)

In [None]:
df.head()

In [None]:
df.fillna(0).head()

In [None]:
df.head() # Changes are not saved in DataFrame

In [None]:
# inplace option
df.fillna(0, inplace = True)
df.head()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

##### > Single Statistical data

In [None]:
# Column-wise mean restricted to the numeric columns. df also holds the NAME
# column; since pandas 2.0 `numeric_only` defaults to False, so calling
# mean() without it raises TypeError when a column holds text.
df.mean(numeric_only=True)

## 2.8 Sort Data

In [None]:
df.head()

##### > Sort data frame by index labels

In [None]:
df.sort_index(ascending = False).head()

In [None]:
df.head() # inplace = False !

In [None]:
# Inplace
df.sort_index( ascending = True, inplace = True)
df.head()

##### > Sort rows by the values of any column

In [None]:
df.sort_values('TMAX', ascending = False).head()

## 2.9 Data filtering

##### > By value ranges

In [None]:
df[df['TMAX']> 37].head()

In [None]:
test = df[df['TMAX']> 37]
test.shape

In [None]:
# 37< TMAX < 40
# Keep only the rows whose TMAX lies strictly between 37 and 40.
df[df['TMAX'].gt(37) & df['TMAX'].lt(40)].head()

In [None]:
# Store the rows with 37 < TMAX < 40 and report how many there are (rows, columns).
test = df.loc[df['TMAX'].gt(37) & df['TMAX'].lt(40)]
test.shape

## 2.10 Plotting Data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

##### > Simple Plot

In [None]:
# Select Asturias Airport
# TODO (exercise): complete the assignment below. df_1 is not defined anywhere
# else, and the plotting cells that follow read df_1, so they raise NameError
# until this line is filled in and uncommented.
#df_1 =

In [None]:
# Line plot of the three temperature columns against the index, labelled
# through the Axes object returned by DataFrame.plot().
temperature_columns = ['TMAX', 'TMIN', 'TMED']
ax = df_1[temperature_columns].plot()
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (C)')
ax.set_title('TEMPERATURE vs DATE')

ax.grid()

In [None]:
# Same temperature plot with a fixed y-range, saved to disk.
# The figure and axes are created once and handed to pandas via `ax=`:
# DataFrame.plot() opens its own figure when no axes are given, so a
# separate plt.figure() call would only leave an empty extra figure behind.
temperatures = df_1[['TMAX', 'TMIN', 'TMED']]
fig, ax = plt.subplots()
temperatures.plot(ax=ax)
ax.set_ylim(0, 40)
ax.set_ylabel('Temperature (C)')
ax.set_xlabel('Date')  # was 'DATA'; the x axis shows dates, as the title says
ax.set_title('TEMPERATURE vs DATE')
ax.grid()
fig.savefig('fig2.png', dpi=300)

##### > Bar plots

In [None]:
# Preview the rows that the bar plots below draw from. No plt.figure() call
# is needed here: nothing is plotted in this cell, and DataFrame.plot()
# creates its own figure, so the call only produced an empty figure.
df_1.head()

In [None]:
df_1.loc[['2017-01-01', '2017-01-02'], ['TMAX', 'TMIN', 'TMED']].plot(kind = 'bar', stacked = False)

In [None]:
df_1.loc[['2017-01-01', '2017-01-02'],['TMAX', 'TMIN', 'TMED']].plot(kind = 'bar', stacked = True)