# What is Data Science?
Data science or data analytics is a process of analyzing large set of data points to get answers on questions related to that data set.
- Pandas is a python module that makes data science easy and effective.
- Process of cleaning messy data is called *data munging* or *data wrangling*.

In [3]:
import pandas as pd
df = pd.read_csv(r"nyc_weather.csv")
print(f"Printing the whole dataset:")
df

Printing the whole dataset:


Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,1/1/2016,38,23,52,30.03,10,8.0,0,5,,281
1,1/2/2016,36,18,46,30.02,10,7.0,0,3,,275
2,1/3/2016,40,21,47,29.86,10,8.0,0,1,,277
3,1/4/2016,25,9,44,30.05,10,9.0,0,3,,345
4,1/5/2016,20,-3,41,30.57,10,5.0,0,0,,333
5,1/6/2016,33,4,35,30.5,10,4.0,0,0,,259
6,1/7/2016,39,11,33,30.28,10,2.0,0,3,,293
7,1/8/2016,39,29,64,30.2,10,4.0,0,8,,79
8,1/9/2016,44,38,77,30.16,9,8.0,T,8,Rain,76
9,1/10/2016,50,46,71,29.59,4,,1.8,7,Rain,109


In [4]:
print(f"Max temperature is: {df['Temperature'].max()}")

print(f"Date on which it rained:\n{df['EST'][df['Events']=='Rain']}")

# Data cleaning (blanks become 0)
df.fillna(0, inplace=True)

print(f"Average wind speed: {df['WindSpeedMPH'].mean()}")

Max temperature is: 50
Date on which it rained:
8      1/9/2016
9     1/10/2016
15    1/16/2016
26    1/27/2016
Name: EST, dtype: object
Average wind speed: 6.225806451612903


# Dataframe
Dataframe is a main object in Pandas. It is used to represent data with rows and columns (tabular or excel spreadsheet like data)

In [5]:
weatherData = {
    'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],
    'temperature': [32,35,28,24,32,31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow','Snow','Rain', 'Sunny']
}
df = pd.DataFrame(weatherData)
# print(df)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [42]:
df = pd.read_csv("weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [43]:
rows, columns = df.shape
print(f"Shape: {rows}x{columns}")

Shape: 6x4


# Rows

In [44]:
print(df.head(3))   # By default df.head() = df.head(5) first five records
print(df.tail(2))   # By default df.tail() = df.taid(5) last five records
print(df[1:3])      # [1:3] second and third (indexing)

        day  temperature  windspeed  event
0  1/1/2017           32          6   Rain
1  1/2/2017           35          7  Sunny
2  1/3/2017           28          2   Snow
        day  temperature  windspeed  event
4  1/5/2017           32          4   Rain
5  1/6/2017           31          2  Sunny
        day  temperature  windspeed  event
1  1/2/2017           35          7  Sunny
2  1/3/2017           28          2   Snow


# Columns

In [45]:
print(df.columns, "\n")
print(df['day'], "\n") # or df.day
print(type(df.day), "\n")
print(df[['day', 'temperature']])

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object') 

0    1/1/2017
1    1/2/2017
2    1/3/2017
3    1/4/2017
4    1/5/2017
5    1/6/2017
Name: day, dtype: object 

<class 'pandas.core.series.Series'> 

        day  temperature
0  1/1/2017           32
1  1/2/2017           35
2  1/3/2017           28
3  1/4/2017           24
4  1/5/2017           32
5  1/6/2017           31


# Operations

In [46]:
print(f"Max temperature: {df.temperature.max()}")
print(f"When temperature >=32:\n{df[df['temperature']>=32]}\n")
print(f"Dates when temperature was max:\n{df['day'][df['temperature'] == df['temperature'].max()]}")
print(f"Details when temperature was max:\n{df[df['temperature'] == df['temperature'].max()]}\n")
print(f"Standard deviation: {df.temperature.std()}\n")
print(f"Max: {df['event'].max()}")
print(f"Decription:\n{df.describe()}")

Max temperature: 35
When temperature >=32:
        day  temperature  windspeed  event
0  1/1/2017           32          6   Rain
1  1/2/2017           35          7  Sunny
4  1/5/2017           32          4   Rain

Dates when temperature was max:
1    1/2/2017
Name: day, dtype: object
Details when temperature was max:
        day  temperature  windspeed  event
1  1/2/2017           35          7  Sunny

Standard deviation: 3.8297084310253524

Max: Sunny
Decription:
       temperature  windspeed
count     6.000000   6.000000
mean     30.333333   4.666667
std       3.829708   2.338090
min      24.000000   2.000000
25%      28.750000   2.500000
50%      31.500000   5.000000
75%      32.000000   6.750000
max      35.000000   7.000000


# Set Index

In [54]:
print(f"{df}\n")
df1 = df.set_index('day')
print(f"{df1}\n")
print(f"{df1.index}\n")
print(f"{df1.loc['1/2/2017']}\n")

        day  temperature  windspeed  event
0  1/1/2017           32          6   Rain
1  1/2/2017           35          7  Sunny
2  1/3/2017           28          2   Snow
3  1/4/2017           24          7   Snow
4  1/5/2017           32          4   Rain
5  1/6/2017           31          2  Sunny

          temperature  windspeed  event
day                                    
1/1/2017           32          6   Rain
1/2/2017           35          7  Sunny
1/3/2017           28          2   Snow
1/4/2017           24          7   Snow
1/5/2017           32          4   Rain
1/6/2017           31          2  Sunny

Index(['1/1/2017', '1/2/2017', '1/3/2017', '1/4/2017', '1/5/2017', '1/6/2017'], dtype='object', name='day')

temperature       35
windspeed          7
event          Sunny
Name: 1/2/2017, dtype: object



In [63]:
df1.reset_index(inplace=True)
print(df1,"\n")
df1.set_index('event', inplace=True)
print(df1, "\n")
print(df1.loc['Snow'])

   event  index       day  temperature  windspeed
0   Rain      0  1/1/2017           32          6
1  Sunny      1  1/2/2017           35          7
2   Snow      2  1/3/2017           28          2
3   Snow      3  1/4/2017           24          7
4   Rain      4  1/5/2017           32          4
5  Sunny      5  1/6/2017           31          2 

       index       day  temperature  windspeed
event                                         
Rain       0  1/1/2017           32          6
Sunny      1  1/2/2017           35          7
Snow       2  1/3/2017           28          2
Snow       3  1/4/2017           24          7
Rain       4  1/5/2017           32          4
Sunny      5  1/6/2017           31          2 

       index       day  temperature  windspeed
event                                         
Snow       2  1/3/2017           28          2
Snow       3  1/4/2017           24          7


# Different ways to create dataframe

## Using CSV

In [64]:
df = pd.read_csv(r"weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


## Using Excel

In [65]:
df=pd.read_excel("weather_data.xlsx","Sheet1")
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,35,7,Sunny
2,2017-01-03,28,2,Snow
3,2017-01-04,24,7,Snow
4,2017-01-05,32,4,Rain
5,2017-01-06,31,2,Sunny


## Using Dictionary

In [66]:
import pandas as pd
weather_data = {
    'day': ['1/1/2017','1/2/2017','1/3/2017'],
    'temperature': [32,35,28],
    'windspeed': [6,7,2],
    'event': ['Rain', 'Sunny', 'Snow']
}
df = pd.DataFrame(weather_data)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow


## Using list of tuples

In [67]:
weather_data = [
    ('1/1/2017',32,6,'Rain'),
    ('1/2/2017',35,7,'Sunny'),
    ('1/3/2017',28,2,'Snow')
]
df = pd.DataFrame(data=weather_data, columns=['day','temperature','windspeed','event'])
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow


## Using list of dictionary

In [68]:
weather_data = [
    {'day': '1/1/2017', 'temperature': 32, 'windspeed': 6, 'event': 'Rain'},
    {'day': '1/2/2017', 'temperature': 35, 'windspeed': 7, 'event': 'Sunny'},
    {'day': '1/3/2017', 'temperature': 28, 'windspeed': 2, 'event': 'Snow'},
    
]
df = pd.DataFrame(data=weather_data, columns=['day','temperature','windspeed','event'])
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
