<a href="https://colab.research.google.com/github/CelesTech03/Data-Analytics-Winter-2022/blob/main/Python/Day_5_Intro_to_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro to Pandas

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn
# Seed NumPy's global random generator so the randn() values drawn later in
# the notebook are the same on every run.
np.random.seed(123)

In [2]:
# A pandas Series pairs each data point with a label from its index.
# The columns of a DataFrame are Series.
# Sample ingredients for building a Series: index labels, a plain list,
# the same values as a NumPy array, and a label -> value dict.
my_data = [10, 20, 30, 40]
labels = list('abcd')
arr = np.asarray(my_data)
d = dict(zip(labels, my_data))

In [3]:
# Echo the dict built in the previous cell to confirm its label -> value pairs.
d

{'a': 10, 'b': 20, 'c': 30, 'd': 40}

In [4]:
# Build a Series from the plain list, using `labels` as the index so each
# value is addressed by 'a'..'d' rather than by position.
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
d    40
dtype: int64

In [5]:
# Build a Series of ten years, labelled by city (the city names are the index).
cities = [
    'NYC', 'Los Angeles', 'Chicago', 'San Diego', 'San Francisco',
    'Orlando', 'Miami', 'Philly', 'Tokyo', 'Paris',
]
years = [
    1964, 2000, 1985, 2001, 2004,
    2021, 2003, 1965, 1954, 1967,
]
pd.Series(years, index=cities)

NYC              1964
Los Angeles      2000
Chicago          1985
San Diego        2001
San Francisco    2004
Orlando          2021
Miami            2003
Philly           1965
Tokyo            1954
Paris            1967
dtype: int64

In [6]:
# Swap the roles: the years become the index labels and the city names become
# the values, so the dtype changes from int64 to object (strings).
pd.Series(data=cities, index=years)

1964              NYC
2000      Los Angeles
1985          Chicago
2001        San Diego
2004    San Francisco
2021          Orlando
2003            Miami
1965           Philly
1954            Tokyo
1967            Paris
dtype: object

# Data Frames

In [7]:
# Build a 5 x 4 DataFrame of random draws from randn().
# Rows are labelled A-E (the index); each column W, X, Y, Z is itself a Series.
data = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [8]:
# Display the full 5 x 4 frame; the row labels A-E are the index, not a column.
data

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
D,1.49139,-0.638902,-0.443982,-0.434351
E,2.20593,2.186786,1.004054,0.386186


In [9]:
# Selecting a single column with [] returns a pandas Series (note the
# "Name: Y" footer in the output), not a DataFrame, despite the df_ prefix.
df_1 = data['Y']
df_1

A    0.282978
B   -2.426679
C   -0.678886
D   -0.443982
E    1.004054
Name: Y, dtype: float64

In [10]:
# Same single-column selection, displayed directly without storing the result.
data['W']

A   -1.085631
B   -0.578600
C    1.265936
D    1.491390
E    2.205930
Name: W, dtype: float64

In [11]:
# To select multiple columns, pass a list of column names (note the double
# brackets). The result is a DataFrame rather than a Series.
data[['W','X']]

Unnamed: 0,W,X
A,-1.085631,0.997345
B,-0.5786,1.651437
C,1.265936,-0.86674
D,1.49139,-0.638902
E,2.20593,2.186786


In [12]:
# We can create new columns in our DataFrame by assigning to them as if they already existed.
# data.drop('new', axis=1, inplace=True) would drop the column from the DataFrame again.
# axis=1 tells pandas to look at the columns; inplace=True tells pandas to modify the existing DataFrame.
data['new'] = data['W'] + data['Y']
data

Unnamed: 0,W,X,Y,Z,new
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408
E,2.20593,2.186786,1.004054,0.386186,3.209984


In [13]:
# We can create a new DataFrame by assigning a subset of columns to a new variable.
# Selecting with a list of column names keeps only those columns, so data2
# contains just W and X instead of being another name for the whole of `data`.
data2 = data[['W', 'X']]

In [14]:
# Drop row 'E' (axis=0 targets the row labels). Without inplace=True the call
# returns a new DataFrame and leaves `data` itself unchanged, which is why
# row E still appears in the later cells.
data.drop('E', axis = 0)

Unnamed: 0,W,X,Y,Z,new
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408


In [15]:
# We use loc and iloc to select rows.
# loc selects by index label: here the row labelled 'C', returned as a Series
# whose index is the column names.
data.loc['C']

W      1.265936
X     -0.866740
Y     -0.678886
Z     -0.094709
new    0.587050
Name: C, dtype: float64

In [16]:
# iloc selects by 0-based integer position: position 2 is the same row 'C'
# that loc returned by label.
data.iloc[2]

W      1.265936
X     -0.866740
Y     -0.678886
Z     -0.094709
new    0.587050
Name: C, dtype: float64

In [17]:
# Comparing the whole frame to a scalar returns a same-shaped DataFrame of
# booleans (True where the value is positive).
data > 0

Unnamed: 0,W,X,Y,Z,new
A,False,True,True,False,False
B,False,True,False,False,False
C,True,False,False,False,True
D,True,False,False,False,True
E,True,True,True,True,True


In [18]:
# Often you won't filter the entire DataFrame - you'll want to filter rows based on a specific column.
# A boolean Series inside [] keeps only the rows where it is True (here, Z < 0).
data[data['Z'] < 0]

Unnamed: 0,W,X,Y,Z,new
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408


In [19]:
# Same boolean-mask filter, keeping only the rows where column W is positive.
data[data['W'] > 0]

Unnamed: 0,W,X,Y,Z,new
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408
E,2.20593,2.186786,1.004054,0.386186,3.209984


In [20]:
# RENAMING COLUMNS
# Sometimes data is messy - column names don't make sense, or are even missing.
# Data professionals will very often need to work on modifying their data.
# NOTE: the keys below are placeholder names that match none of the columns of
# `data` (W, X, Y, Z, new). rename() skips labels it cannot find by default
# (errors='ignore'), so both calls leave the column names unchanged.
# NOTE(review): 'Unnamed : 0' looks like an attempt to rename the row-label
# header shown in the rendered outputs; that is the index, not a column -
# confirm the intent.
data = data.rename(columns = {'Unnamed : 0' : 'newName1', 'oldName2' : 'newName2'})
# Or rename the existing DataFrame in place (rather than assigning a renamed copy)
data.rename(columns = {'oldName1' : 'newName1', 'oldName2' : 'newName2'}, inplace = True)