In [2]:
import pandas as pd

# Raw Amazon fires dataset (CSV) hosted on GitHub.
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv"

# Parse the CSV into a DataFrame, decoding the bytes as ISO-8859-1.
df = pd.read_csv(filepath_or_buffer=file_name, encoding="ISO-8859-1")

# Preview the last five rows.
df.tail(5)

Unnamed: 0,ano,mes,estado,numero,encontro
6449,2012,Dezembro,Tocantins,128,1/1/2012
6450,2013,Dezembro,Tocantins,85,1/1/2013
6451,2014,Dezembro,Tocantins,223,1/1/2014
6452,2015,Dezembro,Tocantins,373,1/1/2015
6453,2016,Dezembro,Tocantins,119,1/1/2016


In [3]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,ano,mes,estado,numero,encontro
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


### Renaming Columns

In [6]:
# Translation table: original Portuguese column name -> English column name.
new_columns = {
    'ano': 'year',
    'mes': 'month',
    'estado': 'state',
    'numero': 'number_of_fires',
    'encontro': 'date',
}

# rename() returns a new DataFrame; rebinding df (rather than using
# inplace=True) keeps the step chainable and avoids mutating the frame
# object that earlier cells displayed.
df = df.rename(columns=new_columns)

In [10]:
# Rename a single column. rename() hands back a new DataFrame, which is bound
# to new_trail here; df itself is left as it was.
new_trail = df.rename({'state': 'states'}, axis='columns')
new_trail

Unnamed: 0,year,month,states,number_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002
...,...,...,...,...,...
6449,2012,Dezembro,Tocantins,128,1/1/2012
6450,2013,Dezembro,Tocantins,85,1/1/2013
6451,2014,Dezembro,Tocantins,223,1/1/2014
6452,2015,Dezembro,Tocantins,373,1/1/2015


In [11]:
# Preview the first five rows of df (its column is still called 'state').
df.head(5)

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


In [8]:
# Which distinct years does the data cover?
df.loc[:, 'year'].unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], dtype=int64)

# Re-arranging columns

In [12]:
# Column positions are zero-based, counted from the left.
# Target layout: date first, month second, year third, then the rest.
# NOTE: the positions refer to the *current* layout, so running this cell a
# second time permutes the already re-ordered columns again.
new_order = [4, 1, 0, 2, 3]
df = df.iloc[:, new_order]
df.head(5)

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0 Fires
1,1/1/1999,Janeiro,1999,Acre,0 Fires
2,1/1/2000,Janeiro,2000,Acre,0 Fires
3,1/1/2001,Janeiro,2001,Acre,0 Fires
4,1/1/2002,Janeiro,2002,Acre,0 Fires


In [13]:
# The column is not numeric, so let's find out why:
# flag, row by row, whether the text consists only of numeric characters.
df.loc[:, 'number_of_fires'].str.isnumeric()

0       False
1       False
2       False
3       False
4       False
        ...  
6449     True
6450     True
6451     True
6452     True
6453     True
Name: number_of_fires, Length: 6454, dtype: object

### Strip - Return a copy of the string with leading and trailing characters removed. If chars is omitted or None, whitespace characters are removed. If given and not None, chars must be a string; it is treated as a set of characters (not as a prefix or suffix), and any of those characters are stripped from both ends of the string this method is called on.



In [21]:
# Replace the column with a cleaned version of itself.
# str.strip(" Fires") would treat its argument as a set of characters
# (' ', 'F', 'i', 'r', 'e', 's') and peel any of them off both ends, so the
# literal " Fires" text is removed explicitly and only surrounding whitespace
# is stripped afterwards.
df['number_of_fires'] = (
    df['number_of_fires']
    .str.replace(" Fires", "", regex=False)
    .str.strip()
)
df.head()

Unnamed: 0,date,month,year,state,number_of_fires
0,1/1/1998,Janeiro,1998,Acre,0
1,1/1/1999,Janeiro,1999,Acre,0
2,1/1/2000,Janeiro,2000,Acre,0
3,1/1/2001,Janeiro,2001,Acre,0
4,1/1/2002,Janeiro,2002,Acre,0


In [23]:
# We need to convert our number_of_fires column to a float data type.
# Series.replace('', '0') swaps only whole cells that are exactly the empty
# string. The .str.replace('', '0') form must not be used for this: an empty
# pattern matches between every character, turning e.g. '128' into '0102080'
# and silently corrupting the counts. Missing (NaN) cells are left as NaN.
df["number_of_fires"] = df["number_of_fires"].replace('', '0').astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             6454 non-null   object 
 1   month            6454 non-null   object 
 2   year             6454 non-null   int64  
 3   state            6454 non-null   object 
 4   number_of_fires  6322 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 252.2+ KB


In [24]:
# Let's reload our dataframe
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv"
df = pd.read_csv(file_name, encoding="ISO-8859-1")

# Portuguese column name -> English column name.
new_columns = {'ano': 'year',
               'estado': 'state',
               'mes': 'month',
               'numero': 'number_of_fires',
               'encontro': 'date'}
# Rebind instead of inplace=True so the step returns a fresh frame.
df = df.rename(columns=new_columns)

# Remove the literal " Fires" text, then trim whitespace. (str.strip(" Fires")
# would strip a *set* of characters from both ends, not the suffix.)
df['number_of_fires'] = (
    df['number_of_fires']
    .str.replace(" Fires", "", regex=False)
    .str.strip()
)

# Creating a true copy of our dataframe
df_copy = df.copy()
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0,1/1/1998
1,1999,Janeiro,Acre,0,1/1/1999
2,2000,Janeiro,Acre,0,1/1/2000
3,2001,Janeiro,Acre,0,1/1/2001
4,2002,Janeiro,Acre,0,1/1/2002


In [25]:
# Count the missing values in each column.

df.isna().sum()

year                 0
month                0
state                0
number_of_fires    132
date                 0
dtype: int64

In [26]:
# Rows holding Null or NaN (not a number) values are easy to discard.

# Drop every row that contains a NaN, then renumber the rows from 0.
# reset_index() without drop=True keeps the previous row labels in a new
# 'index' column.
df = df.dropna().reset_index()
df.head(5)

Unnamed: 0,index,year,month,state,number_of_fires,date
0,0,1998,Janeiro,Acre,0,1/1/1998
1,1,1999,Janeiro,Acre,0,1/1/1999
2,2,2000,Janeiro,Acre,0,1/1/2000
3,3,2001,Janeiro,Acre,0,1/1/2001
4,4,2002,Janeiro,Acre,0,1/1/2002


In [27]:
# Confirm that no missing values remain in any column.

df.isna().sum()

index              0
year               0
month              0
state              0
number_of_fires    0
date               0
dtype: int64

In [28]:
# Alright so it worked, now let's reload the data and look at a few other methods of dealing with NaN or Null values

# Let's reload our dataframe
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv"
df = pd.read_csv(file_name, encoding="ISO-8859-1")

# Portuguese column name -> English column name.
new_columns = {'ano': 'year',
               'estado': 'state',
               'mes': 'month',
               'numero': 'number_of_fires',
               'encontro': 'date'}
# Rebind instead of inplace=True so the step returns a fresh frame.
df = df.rename(columns=new_columns)

# Remove the literal " Fires" text, then trim whitespace. (str.strip(" Fires")
# would strip a *set* of characters from both ends, not the suffix.)
df['number_of_fires'] = (
    df['number_of_fires']
    .str.replace(" Fires", "", regex=False)
    .str.strip()
)

# Keep an independent copy of the pre-processed frame.
df_copy = df.copy()
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,Janeiro,Acre,0,1/1/1998
1,1999,Janeiro,Acre,0,1/1/1999
2,2000,Janeiro,Acre,0,1/1/2000
3,2001,Janeiro,Acre,0,1/1/2001
4,2002,Janeiro,Acre,0,1/1/2002


In [29]:
# Build a boolean mask that is True wherever number_of_fires is missing.

df.loc[:, 'number_of_fires'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
6449    False
6450    False
6451    False
6452    False
6453    False
Name: number_of_fires, Length: 6454, dtype: bool

In [30]:
# Use the mask to list the first ten rows whose number_of_fires is missing.
df.loc[df['number_of_fires'].isna()].head(10)

Unnamed: 0,year,month,state,number_of_fires,date
68,2006,Abril,Acre,,1/1/2006
110,2008,Junho,Acre,,1/1/2008
127,2005,Julho,Acre,,1/1/2005
206,2004,Novembro,Acre,,1/1/2004
217,2015,Novembro,Acre,,1/1/2015
444,2002,Novembro,alagoas,,1/1/2002
522,2001,Março,Amapa,,1/1/2001
550,2009,Abril,Amapa,,1/1/2009
614,2013,Julho,Amapa,,1/1/2013
642,2001,Setembro,Amapa,,1/1/2001


In [31]:
# Preview the column with every missing value replaced by 0
# (the result is only displayed, not stored back into df).

df['number_of_fires'].fillna(value=0).head(5)

0    0
1    0
2    0
3    0
4    0
Name: number_of_fires, dtype: object

In [32]:
# Let's try forward filling: each missing value takes the last valid value
# above it. Series.ffill() is the supported spelling; fillna(method='ffill')
# is deprecated in recent pandas releases.
df['number_of_fires'].ffill().head(70)

0     0
1     0
2     0
3     0
4     0
     ..
65    1
66    2
67    1
68    1
69    0
Name: number_of_fires, Length: 70, dtype: object

In [34]:
# View index 444 to see how it changes
# Homework, change 444 using ffill and backfill to see how it changes
# Back fill: each missing value takes the next valid value below it.
# Series.bfill() replaces the deprecated fillna(method='backfill') spelling.
f = df['number_of_fires'].bfill()
f.iloc[444]

'17'

In [35]:
# Inspect the row at position 445, i.e. the row right after position 444.
df.iloc[445, :]

year                   2003
month              Novembro
state               alagoas
number_of_fires          17
date               1/1/2003
Name: 445, dtype: object

In [36]:
# let's make the assumption that blank values are 0 fires

# Restore our original pre-processed dataframe from the backup.
# Take a fresh copy instead of binding df to the same object: with a plain
# `df = df_copy`, every later column assignment on df would also change
# df_copy and the backup would be lost.
df = df_copy.copy()

# replace all missing or NaN values with 0
df['number_of_fires'] = df['number_of_fires'].fillna(0)

In [37]:
# List the distinct month names present in the data.
df.loc[:, 'month'].unique()

array(['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
       'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'],
      dtype=object)

In [38]:
# Let's convert our Portuguese month names to English

month_translations = {
    'Janeiro': 'January',
    'Fevereiro': 'February',
    'Março': 'March',
    'Abril': 'April',
    'Maio': 'May',
    'Junho': 'June',
    'Julho': 'July',
    'Agosto': 'August',
    'Setembro': 'September',
    'Outubro': 'October',
    'Novembro': 'November',
    'Dezembro': 'December',
}

# Series.map() yields NaN for any value that is not a key of the dict, so a
# second run of this cell (when the names are already English) would blank
# the whole column. Falling back to the existing value keeps the cell safe
# to re-run and leaves unexpected month names visible instead of erasing them.
df["month"] = df["month"].map(month_translations).fillna(df["month"])
df.head()

Unnamed: 0,year,month,state,number_of_fires,date
0,1998,January,Acre,0,1/1/1998
1,1999,January,Acre,0,1/1/1999
2,2000,January,Acre,0,1/1/2000
3,2001,January,Acre,0,1/1/2001
4,2002,January,Acre,0,1/1/2002


In [39]:
# Normalise capitalisation of the state names (first letter of each word
# upper-case), then list the distinct values that result.
df["state"] = df["state"].str.title()
df["state"].unique()

array(['Acre', 'Alagoas', 'Amapa', 'Amazonas', 'Bahia', 'Ceara',
       'Distrito Federal', 'Espirito Santo', 'Goias', 'Maranhao',
       'Mato Grosso', 'Minas Gerais', 'Pará', 'Paraiba', 'Pernambuco',
       'Piau', 'Rio', 'Rondonia', 'Roraima', 'Santa Catarina',
       'Sao Paulo', 'Sergipe', 'Tocantins'], dtype=object)

In [40]:
# Replacement table: current state name -> new name.
# 'Pernambuco' was listed twice; in a dict literal the last entry wins, so
# only 'kantamato' ever took effect. The shadowed 'Peppiase' entry is dropped
# to make that explicit.
base = {
    'Acre': 'accra',
    'Alagoas': 'Anloga',
    'Amapa': 'Apam',
    'Amazonas': 'Amazon',
    'Bahia': 'Briwa',
    'Ceara': 'Setwi',
    'Distrito Federal': 'Denhrah',
    'Espirito Santo': 'Santa Maria',
    'Goias': 'Gome',
    'Maranhao': 'Mallam',
    'Mato Grosso': 'goosu',
    'Minas Gerais': 'Ganikofe',
    'Pará': 'paraku',
    'Paraiba': 'papaye',
    'Pernambuco': 'kantamato',
    'Piau': 'perdu',
    'Rio': 'Ricco',
    'Rondonia': 'Ricchoco',
    'Roraima': 'Somalia',
    'Santa Catarina': 'Santase',
    'Sao Paulo': 'Paloma',
    'Sergipe': 'Sagakofe',
    'Tocantins': 'Takwa',
}

# Series.map() yields NaN for values missing from the dict; fall back to the
# existing value so re-running the cell (or an unlisted state) does not blank
# the column.
df["state"] = df["state"].map(base).fillna(df["state"])
df.head()


Unnamed: 0,year,month,state,number_of_fires,date
0,1998,January,accra,0,1/1/1998
1,1999,January,accra,0,1/1/1999
2,2000,January,accra,0,1/1/2000
3,2001,January,accra,0,1/1/2001
4,2002,January,accra,0,1/1/2002


In [41]:
# Some replacement names are lower-case (e.g. 'accra', 'goosu'), so title-case
# the column once more and list the distinct values.
df["state"] = df["state"].str.title()
df["state"].unique()

array(['Accra', 'Anloga', 'Apam', 'Amazon', 'Briwa', 'Setwi', 'Denhrah',
       'Santa Maria', 'Gome', 'Mallam', 'Goosu', 'Ganikofe', 'Paraku',
       'Papaye', 'Kantamato', 'Perdu', 'Ricco', 'Ricchoco', 'Somalia',
       'Santase', 'Paloma', 'Sagakofe', 'Takwa'], dtype=object)