In [1]:
# Pandas is an Open Source Data Analysis library for Python. 

In [2]:
# Pandas has two main data structures: Series (One dimensional) and DataFrame (Two Dimensional).

In [7]:
#importing pandas.
import pandas as pd

# Pandas Series:

In [4]:
# A pandas Series is like a Python list, except that every element is associated with an index label.

In [5]:
#Ex:

In [9]:
# Build a Series from a plain Python list of five integers.
s1 = pd.Series(data=[1, 2, 3, 4, 5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [10]:
# When no index is supplied, a Series automatically gets integer indexes starting at 0.

In [11]:
s1[1]

2

In [12]:
#To provide custom indexes:

In [13]:
# A dictionary is passed here: its keys become the index labels.
s1 = pd.Series(dict(A=1, B=2, C=3, D=4, E=5))
s1

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [15]:
s1['B']

2

In [16]:
#to check all indexes:

In [19]:
s1.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [21]:
s1.values

array([1, 2, 3, 4, 5], dtype=int64)

In [22]:
s1.index = [0,1,2,3,4]

In [23]:
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [24]:
s1.index = ['A','B','C','D','E']

In [25]:
s1

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [26]:
# Creating pandas series using numpy arrays

In [27]:
import numpy as np

In [60]:
# Labels and values are kept in two separate NumPy arrays.
npkey = np.array(list('ABCDE'))
npvalue = np.arange(1, 6)

In [61]:
series2 = pd.Series(data=npvalue, index = npkey)

In [62]:
series2

A    1
B    2
C    3
D    4
E    5
dtype: int32

In [63]:
# Combining two Series

In [64]:
s3 = pd.Series({'F':6,'G':8})

In [65]:
# Series.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat returns a new Series holding the items of series2 followed by
# the items of s3.
series2 = pd.concat([series2, s3])
series2

A    1
B    2
C    3
D    4
E    5
F    6
G    8
dtype: int64

In [66]:
# Delete an item by its index label.
# drop() returns a new Series, so the result is assigned back to series2
# rather than mutating the existing object with inplace=True.
series2 = series2.drop(['G'])

In [67]:
series2

A    1
B    2
C    3
D    4
E    5
F    6
dtype: int64

In [68]:
#multiply using numpy

In [72]:
# Multiply every element by 3 using NumPy's element-wise multiply.
# NOTE: the result overwrites series2, so re-running this cell multiplies again.
series2 = np.multiply(series2,3)
series2

A     9
B    18
C    27
D    36
E    45
F    54
dtype: int64

In [73]:
# Divide every element by 2 using NumPy's element-wise divide.
# NOTE: the result overwrites series2, so re-running this cell divides again.
series2 = np.divide(series2,2)
series2

A     4.5
B     9.0
C    13.5
D    18.0
E    22.5
F    27.0
dtype: float64

# Pandas DataFrame:

In [75]:
# A pandas DataFrame is a two-dimensional labeled data structure (rows and columns, like a table
# in a relational database). Different columns can have different data types.

In [80]:
#importing data using URL:
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')

In [82]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [84]:
#importing data from a local file
# The relative path is resolved against the current working directory.
# NOTE: this rebinds iris, replacing the frame that was loaded from the URL.
iris = pd.read_csv('iris.csv')

In [85]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [101]:
# Data can be imported from a variety of sources such as Excel, CSV, MySQL, etc.

In [102]:
# creating data frame using dictionary:

In [103]:
# Column name -> list of values; each list holds six entries, one per row.
my_dict = dict(
    Name=list("ABCDEF"),
    Height=[20, 25, 36, 5, 38, 26],
    Weight=[75, 60, 86, 56, 45, 56],
)

In [105]:
df = pd.DataFrame(my_dict)
df

Unnamed: 0,Height,Name,Weight
0,20,A,75
1,25,B,60
2,36,C,86
3,5,D,56
4,38,E,45
5,26,F,56


In [106]:
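# Since Python 3.7, dictionaries preserve insertion order, and current pandas keeps the columns in that order. Older versions did not guarantee this, which is presumably why the columns in the output above appear sorted alphabetically.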

In [110]:
#Giving index
df = pd.DataFrame(my_dict,index=[1,2,3,4,5,6])
df

Unnamed: 0,Height,Name,Weight
1,20,A,75
2,25,B,60
3,36,C,86
4,5,D,56
5,38,E,45
6,26,F,56


In [111]:
# Index labels need not be numbers; they can even be of mixed types.
df = pd.DataFrame(my_dict,index=[1,2,"DEF",4.0,5,"ABC"])
df

Unnamed: 0,Height,Name,Weight
1,20,A,75
2,25,B,60
DEF,36,C,86
4.0,5,D,56
5,38,E,45
ABC,26,F,56


In [113]:
# Every value within one column shares a single dtype; different columns may have different dtypes.
df.dtypes

Height     int64
Name      object
Weight     int64
dtype: object

In [115]:
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')

In [116]:
#first 5 rows:

In [117]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [119]:
#last five rows:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [120]:
iris.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [121]:
#column names:

In [122]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [125]:
#mean value of a column
iris.sepal_length.mean()

5.843333333333335

In [133]:
#or
iris['sepal_length'].mean()

5.843333333333335

In [134]:
#max value of a column
iris.sepal_length.max()

7.9000000000000004

In [135]:
#min value of a column
iris.sepal_length.min()

4.2999999999999998

In [138]:
#Find unique values in a column:
iris.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [139]:
#selecting certain columns only

In [142]:
# Select a subset of columns with a list of names. Unlike
# pd.DataFrame(iris, columns=[...]), which silently fills a misspelled column
# with NaN, this raises a KeyError when a requested column does not exist.
df2 = iris[["sepal_length", "sepal_width", "species"]]
df2.head()

Unnamed: 0,sepal_length,sepal_width,species
0,5.1,3.5,setosa
1,4.9,3.0,setosa
2,4.7,3.2,setosa
3,4.6,3.1,setosa
4,5.0,3.6,setosa


In [144]:
# Delete a column. `columns=` removes columns and `index=` removes rows;
# passing the axis positionally (drop('sepal_width', 1)) is rejected by
# pandas >= 2.0.
df2 = df2.drop(columns='sepal_width')
df2.head()

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa


In [145]:
# Delete the row whose index label equals 2. The axis is given by keyword
# (`index=`) because pandas >= 2.0 no longer accepts it positionally.
df2 = df2.drop(index=2)
df2.head()

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
3,4.6,setosa
4,5.0,setosa
5,5.4,setosa


In [152]:
# Rebuild df2 from iris so the previously dropped column and row are back.
# Selecting with a list of names raises a KeyError for a misspelled column
# instead of silently creating an all-NaN one.
df2 = iris[["sepal_length", "sepal_width", "species"]]
df2.head()

Unnamed: 0,sepal_length,sepal_width,species
0,5.1,3.5,setosa
1,4.9,3.0,setosa
2,4.7,3.2,setosa
3,4.6,3.1,setosa
4,5.0,3.6,setosa


In [153]:
# Drop multiple columns by passing a list of column names. The keyword form
# is required because pandas >= 2.0 no longer accepts a positional axis.
df2 = df2.drop(columns=['sepal_length', 'sepal_width'])

In [154]:
df2.head()

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


In [155]:
# Drop multiple rows by passing a list of index labels. The keyword form
# is required because pandas >= 2.0 no longer accepts a positional axis.
df2 = df2.drop(index=[0, 1, 2])
df2.head()

Unnamed: 0,species
3,setosa
4,setosa
5,setosa
6,setosa
7,setosa


In [156]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [158]:
# Multiplication: scale every sepal_length value by 10 and store it back.
iris['sepal_length'] = iris['sepal_length'] * 10
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,51.0,3.5,1.4,0.2,setosa
1,49.0,3.0,1.4,0.2,setosa
2,47.0,3.2,1.3,0.2,setosa
3,46.0,3.1,1.5,0.2,setosa
4,50.0,3.6,1.4,0.2,setosa


In [159]:
# Division: divide every sepal_length value by 10 and store it back.
iris['sepal_length'] = iris['sepal_length'] / 10
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [160]:
# Addition: add 10 to every sepal_length value and store it back.
iris['sepal_length'] = iris['sepal_length'] + 10
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,15.1,3.5,1.4,0.2,setosa
1,14.9,3.0,1.4,0.2,setosa
2,14.7,3.2,1.3,0.2,setosa
3,14.6,3.1,1.5,0.2,setosa
4,15.0,3.6,1.4,0.2,setosa


In [161]:
# Subtraction: subtract 10 from every sepal_length value and store it back.
iris['sepal_length'] = iris['sepal_length'] - 10
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
