In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Datasets/amazon_fires.csv', encoding='ISO-8859-1')

df.head()

Unnamed: 0,ano,mes,estado,numero,encontro
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


In [3]:
len(df['estado'].unique())

23

### Renaming Columns

In [4]:
new_cols = {'ano': 'year',
           'estado': 'state',
           'mes': 'month',
           'numero': 'number_of_fires',
           'encontro': 'date'}

df.rename(columns=new_cols, inplace=True)

In [5]:
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


In [6]:
# How many years of data do we have
df['year'].unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], dtype=int64)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
year               6454 non-null int64
month              6454 non-null object
state              6454 non-null object
number_of_fires    6322 non-null object
date               6454 non-null object
dtypes: int64(1), object(4)
memory usage: 252.2+ KB


In [8]:
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


### Rearranging Columns

In [9]:
new_order = [4, 1, 0, 2, 3]
df = df[df.columns[new_order]]

In [10]:
df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0 Fires
1,1/1/1999,Janeiro,1999,Acre,0 Fires
2,1/1/2000,Janeiro,2000,Acre,0 Fires
3,1/1/2001,Janeiro,2001,Acre,0 Fires
4,1/1/2002,Janeiro,2002,Acre,0 Fires


### Determing if a column contain numeric data

In [11]:
df['number_of_fires'].str.isnumeric()

0       False
1       False
2       False
3       False
4       False
        ...  
6449     True
6450     True
6451     True
6452     True
6453     True
Name: number_of_fires, Length: 6454, dtype: object

In [12]:
df.tail()

Unnamed: 0,date,month,year,state,number_of_fires
6449,1/1/2012,Dezembro,2012,Tocantins,128
6450,1/1/2013,Dezembro,2013,Tocantins,85
6451,1/1/2014,Dezembro,2014,Tocantins,223
6452,1/1/2015,Dezembro,2015,Tocantins,373
6453,1/1/2016,Dezembro,2016,Tocantins,119


In [13]:
df[df['number_of_fires'].astype(str).str.isdigit()].tail()

Unnamed: 0,date,month,year,state,number_of_fires
6449,1/1/2012,Dezembro,2012,Tocantins,128
6450,1/1/2013,Dezembro,2013,Tocantins,85
6451,1/1/2014,Dezembro,2014,Tocantins,223
6452,1/1/2015,Dezembro,2015,Tocantins,373
6453,1/1/2016,Dezembro,2016,Tocantins,119


In [14]:
df['number_of_fires'].isnull().sum()

132

In [15]:
df['number_of_fires'] = df['number_of_fires'].str.strip(' Fires')

In [16]:
df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0
1,1/1/1999,Janeiro,1999,Acre,0
2,1/1/2000,Janeiro,2000,Acre,0
3,1/1/2001,Janeiro,2001,Acre,0
4,1/1/2002,Janeiro,2002,Acre,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
date               6454 non-null object
month              6454 non-null object
year               6454 non-null int64
state              6454 non-null object
number_of_fires    6322 non-null object
dtypes: int64(1), object(4)
memory usage: 252.2+ KB


In [18]:
df['number_of_fires'] = df['number_of_fires'].str.replace('', '0').astype(float)

df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0.0
1,1/1/1999,Janeiro,1999,Acre,0.0
2,1/1/2000,Janeiro,2000,Acre,0.0
3,1/1/2001,Janeiro,2001,Acre,0.0
4,1/1/2002,Janeiro,2002,Acre,0.0


### Handling missing data

In [19]:
df = pd.read_csv('Datasets/amazon_fires.csv', encoding='ISO-8859-1')
new_cols = {'ano': 'year',
           'estado': 'state',
           'mes': 'month',
           'numero': 'number_of_fires',
           'encontro': 'date'}

df.rename(columns=new_cols, inplace=True)
df['number_of_fires'] = df['number_of_fires'].str.strip(' Fires')
df_copy = df.copy()
df_copy.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0,1/1/1998
1,1999,Janeiro,Acre,0,1/1/1999
2,2000,Janeiro,Acre,0,1/1/2000
3,2001,Janeiro,Acre,0,1/1/2001
4,2002,Janeiro,Acre,0,1/1/2002


In [20]:
df.isnull().sum()

year                 0
month                0
state                0
number_of_fires    132
date                 0
dtype: int64

In [21]:
df = df.dropna()

In [22]:
df.reset_index()

Unnamed: 0,index,year,month,state,number_of_fires,date
0,0,1998,Janeiro,Acre,0,1/1/1998
1,1,1999,Janeiro,Acre,0,1/1/1999
2,2,2000,Janeiro,Acre,0,1/1/2000
3,3,2001,Janeiro,Acre,0,1/1/2001
4,4,2002,Janeiro,Acre,0,1/1/2002
...,...,...,...,...,...,...
6317,6449,2012,Dezembro,Tocantins,128,1/1/2012
6318,6450,2013,Dezembro,Tocantins,85,1/1/2013
6319,6451,2014,Dezembro,Tocantins,223,1/1/2014
6320,6452,2015,Dezembro,Tocantins,373,1/1/2015


In [23]:
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0,1/1/1998
1,1999,Janeiro,Acre,0,1/1/1999
2,2000,Janeiro,Acre,0,1/1/2000
3,2001,Janeiro,Acre,0,1/1/2001
4,2002,Janeiro,Acre,0,1/1/2002


In [24]:
df = pd.read_csv('Datasets/amazon_fires.csv', encoding='ISO-8859-1')
new_cols = {'ano': 'year',
           'estado': 'state',
           'mes': 'month',
           'numero': 'number_of_fires',
           'encontro': 'date'}

df.rename(columns=new_cols, inplace=True)
df['number_of_fires'] = df['number_of_fires'].str.strip(' Fires')
df_copy = df.copy()
df_copy.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0,1/1/1998
1,1999,Janeiro,Acre,0,1/1/1999
2,2000,Janeiro,Acre,0,1/1/2000
3,2001,Janeiro,Acre,0,1/1/2001
4,2002,Janeiro,Acre,0,1/1/2002


In [25]:
df.isnull().sum()

year                 0
month                0
state                0
number_of_fires    132
date                 0
dtype: int64

In [26]:
df['number_of_fires'].fillna(0).head()

0    0
1    0
2    0
3    0
4    0
Name: number_of_fires, dtype: object

In [27]:
df['number_of_fires'] = df['number_of_fires'].fillna(method='backfill')

In [28]:
df.iloc[444]

year                   2002
month              Novembro
state               alagoas
number_of_fires          17
date               1/1/2002
Name: 444, dtype: object

In [30]:
df['month'].unique()

array(['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
       'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'],
      dtype=object)

In [31]:
month_translations = {
    'Janeiro': 'January', 'Fevereiro': 'February', 'Março': 'March', 'Abril': 'April', 'Maio': 'May', 'Junho': 'June', 'Julho': 'July',
       'Agosto': 'August', 'Setembro': 'September', 'Outubro': 'October', 'Novembro': 'November', 'Dezembro': 'December'
}

df['month'] = df['month'].map(month_translations)

In [32]:
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,January,Acre,0,1/1/1998
1,1999,January,Acre,0,1/1/1999
2,2000,January,Acre,0,1/1/2000
3,2001,January,Acre,0,1/1/2001
4,2002,January,Acre,0,1/1/2002


In [38]:
df['state'] = df['state'].str.title()

In [39]:
df['state'].unique()

array(['Acre', 'Alagoas', 'Amapa', 'Amazonas', 'Bahia', 'Ceara',
       'Distrito Federal', 'Espirito Santo', 'Goias', 'Maranhao',
       'Mato Grosso', 'Minas Gerais', 'Pará', 'Paraiba', 'Pernambuco',
       'Piau', 'Rio', 'Rondonia', 'Roraima', 'Santa Catarina',
       'Sao Paulo', 'Sergipe', 'Tocantins'], dtype=object)

In [40]:
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,January,Acre,0,1/1/1998
1,1999,January,Acre,0,1/1/1999
2,2000,January,Acre,0,1/1/2000
3,2001,January,Acre,0,1/1/2001
4,2002,January,Acre,0,1/1/2002


In [44]:
df.drop(df.index[[0, 1]], inplace=True)
df.reset_index()

Unnamed: 0,index,year,month,state,number_of_fires,date
0,3,2001,January,Acre,0,1/1/2001
1,4,2002,January,Acre,0,1/1/2002
2,5,2003,January,Acre,10,1/1/2003
3,6,2004,January,Acre,0,1/1/2004
4,7,2005,January,Acre,12,1/1/2005
...,...,...,...,...,...,...
6446,6449,2012,December,Tocantins,128,1/1/2012
6447,6450,2013,December,Tocantins,85,1/1/2013
6448,6451,2014,December,Tocantins,223,1/1/2014
6449,6452,2015,December,Tocantins,373,1/1/2015
