# Libraries

In [2]:
import pandas as pd

# Reading Data

In [17]:
# Build the column-name -> values mapping used to populate a test DataFrame.
# Each column is wrapped in a pd.Series, so its values carry a default 0..11 index.

month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
rainfall_cm = [1.65, 1.25, 1.94, 2.75, 3.14, 3.65, 5.05, 1.50, 1.33, 0.07, 0.5, 2.30]

# Sample canned data: twelve month names and twelve rainfall values.
data = {
    'Month': pd.Series(month_names),
    'Rainfall [cm]': pd.Series(rainfall_cm),
}
type(data)

dict

In [19]:
# Create a DataFrame from the static dict built above; each dict key becomes a column name.

df = pd.DataFrame(data)
df

Unnamed: 0,Month,Rainfall [cm]
0,January,1.65
1,February,1.25
2,March,1.94
3,April,2.75
4,May,3.14
5,June,3.65
6,July,5.05
7,August,1.5
8,September,1.33
9,October,0.07


In [20]:
# Create the DataFrame from a .csv file instead of in-memory data.
# 'data.csv' is a relative path; this assignment rebinds df, replacing the earlier frame.

df = pd.read_csv('data.csv')
df

Unnamed: 0,Month,Rainfall [cm],Temperature [F]
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,,


In [21]:
# Create the DataFrame by reading data from a .json file.
# This rebinds df again, so the frame loaded from the CSV is no longer referenced.
df = pd.read_json('data.json')
df

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,,


# Cleaning Data

In [22]:
# Fill the missing values (NaN) with zeros.
# The result is stored under a new name (df_zero); df is not reassigned here.
df_zero = df.fillna(0)
df_zero

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,0.0,0.0


In [23]:
# Display df again to compare it with df_zero: the missing values are still present in df.
df

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,,


In [31]:
# Permanently replace the missing values in df with zeros by rebinding df
# to the filled frame.
df = df.fillna(0)
df

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,0.0,0.0


In [37]:
# Reload the data from the .json file so df contains its missing values again
# (an earlier cell replaced them with zeros).
df = pd.read_json('data.json')
df

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,,


In [42]:
# Zeros skew the analysis, so instead of filling the gaps with zeros we remove
# every row that has missing data.
# `axis` is passed by keyword: dropna() only accepts keyword arguments since
# pandas 2.0, so the positional form dropna(0) raises a TypeError there.
# The result is stored under a new name (df_clean); df is not reassigned here.

df_clean = df.dropna(axis=0)
df_clean

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
10,November,0.5,32.0


In [47]:
# Count the rows that contain at least one missing value (NaN).
# isnull() flags every missing cell, any(axis=1) collapses the flags to one
# boolean per row, and sum() counts the flagged rows -- this replaces the
# row-by-row iterrows() loop with a single vectorized expression.
# int() keeps `count` a plain Python integer, as it was with the manual counter.
count = int(df.isnull().any(axis=1).sum())

print("The number of missing rows (NaNs) is",count, ".")


The number of missing rows (NaNs) is 1 .


In [50]:
# Permanently drop the rows with missing data by rebinding df to the cleaned frame.
# `axis` is passed by keyword because the positional form dropna(0) raises a
# TypeError on pandas >= 2.0; reassignment gives the same df as inplace=True
# without mutating the frame object in place.
df = df.dropna(axis=0)
df

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
10,November,0.5,32.0


# Performing Statistical Analysis

In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Rainfall,Temperature
count,11.0,11.0
mean,2.235,17.754545
std,1.413936,12.193553
min,0.5,1.0
25%,1.275,6.5
50%,1.94,20.0
75%,2.75,27.5
max,5.5,33.0


In [52]:
# Column-wise mean of the numeric columns.
# numeric_only=True skips the string 'Month' column; without it pandas >= 2.0
# raises a TypeError instead of silently ignoring non-numeric columns.
df.mean(numeric_only=True)

Rainfall        2.235000
Temperature    17.754545
dtype: float64

In [53]:
# Column-wise standard deviation of the numeric columns.
# numeric_only=True skips the string 'Month' column; without it pandas >= 2.0
# raises a TypeError instead of silently ignoring non-numeric columns.
df.std(numeric_only=True)

Rainfall        1.413936
Temperature    12.193553
dtype: float64

In [54]:
# Column-wise median of the numeric columns.
# numeric_only=True skips the string 'Month' column; without it pandas >= 2.0
# raises a TypeError instead of silently ignoring non-numeric columns.
df.median(numeric_only=True)

Rainfall        1.94
Temperature    20.00
dtype: float64

# Data Subset

In [58]:
# Index a single column by its label; the result is a Series named after the column.
df['Temperature']

0      3.0
1     10.0
2     15.0
3     20.0
4     25.0
5     24.0
6     30.0
7      1.0
8     33.0
10    32.0
11     2.3
Name: Temperature, dtype: float64

In [59]:
# Same single-column selection by label, this time for the 'Rainfall' column.
df['Rainfall']

0     1.650
1     1.250
2     1.940
3     2.750
4     2.750
5     3.645
6     5.500
7     1.000
8     1.300
10    0.500
11    2.300
Name: Rainfall, dtype: float64

In [60]:
# Index multiple columns at once by passing a list of column labels.
df[['Month','Temperature']]

Unnamed: 0,Month,Temperature
0,January,3.0
1,February,10.0
2,March,15.0
3,April,20.0
4,May,25.0
5,June,24.0
6,July,30.0
7,August,1.0
8,September,33.0
10,November,32.0


In [61]:
# Select a row by its integer position using iloc (position 2 is the third row).
df.iloc[2]

Month          March
Rainfall        1.94
Temperature     15.0
Name: 2, dtype: object

In [65]:
df.iloc[-1] # last row: negative positions count from the end

Month          December
Rainfall            2.3
Temperature         2.3
Name: 11, dtype: object

In [70]:
# Take the rainfall values of the first three rows; their mean is computed in the next cell.
rainfall_column = df['Rainfall']
rainfall = rainfall_column[:3]
rainfall

0    1.65
1    1.25
2    1.94
Name: Rainfall, dtype: float64

In [71]:
# Report the average of the rainfall values held in `rainfall`.
mean_rainfall = rainfall.mean()
print("\nThe mean rainfall : ", mean_rainfall)


The mean rainfall :  1.6133333333333333


In [73]:
# Choose a row by value: use the month names as the index so that rows can be
# looked up by label with .loc.
# The index is built from the 'Month' Series itself, and df is rebound to the
# re-indexed frame.
index = df['Month']
df = df.set_index(index)
df.loc['March']

Month          March
Rainfall        1.94
Temperature     15.0
Name: March, dtype: object

In [74]:
# Look up another row by its index label (relies on the month-name index set in the previous cell).
df.loc['July']

Month          July
Rainfall        5.5
Temperature    30.0
Name: July, dtype: object