# Pandas and Matplotlib tutorial

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Let's start with Pandas Series

In [2]:
# Build a Series from a NumPy array of 10 random integers drawn from [1, 100)
a = np.random.randint(low=1, high=100, size=10)
s = pd.Series(data=a)
# Show the full Series (index + values), then only the underlying array
print(s, s.values, sep='\n')

0     2
1    73
2    21
3    19
4    88
5    21
6    86
7    37
8    65
9    80
dtype: int64
[ 2 73 21 19 88 21 86 37 65 80]


In [3]:
# Attach custom integer labels, drawn at random from [400, 500), in place of
# the default 0..9 index. The labels are sampled independently, so repeats are possible.
custom_labels = np.random.randint(400, 500, 10)
s = pd.Series(a, index=custom_labels)
print(s)

408     2
499    73
477    21
441    19
423    88
469    21
444    86
475    37
483    65
402    80
dtype: int64


In [4]:
# Build a Series straight from a dict: keys become the index labels,
# values become the data
dictionary = dict(h=456, g=392, c=980)
series_from_dict = pd.Series(dictionary)
print(series_from_dict)

c    980
g    392
h    456
dtype: int64


In [9]:
# Accessing elements in a Series
# Two ways: by index label (plain [] or .loc) and by integer position (.iloc)

pop_dict = {'Uttar Pradesh': 38332521,'Karnataka': 26448193,'Haryana': 19651127}
pop = pd.Series(pop_dict)

# Label-based lookup: the index labels are the state names
print(pop['Karnataka'])

# Position-based lookup: use .iloc explicitly. Plain pop[1] on a Series with
# string labels relies on an integer-position fallback that pandas has
# deprecated; .iloc states the intent and keeps working.
print(pop.iloc[1])

{'Karnataka': 26448193, 'Uttar Pradesh': 38332521, 'Haryana': 19651127}
26448193
26448193


In [None]:
# Retrieving a range of elements from a Series labelled 'a'..'j'

s = pd.Series(range(10), index=list('abcdefghij'))

# First 3 elements, selected two ways:
#   by position - the end of the slice is excluded
print(s.iloc[:3])
#   by label - the end label 'c' is included
print(s.loc[:'c'])

# Last element only (a one-element Series, not a scalar)
print(s.iloc[-1:])

# Everything except the last element
print(s.iloc[:-1])

Let's work with the Iris dataset

In [None]:
# Load the Iris dataset from a CSV file located next to the notebook.
# Alternative: read it from the UCI repository instead (read there with
# header=None, so the columns would get integer names rather than labels):
# filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# data = pd.read_csv(filename, sep=',', header=None)
iris_csv_path = 'Iris.csv'
data = pd.read_csv(iris_csv_path)

In [None]:
# Basic dataset statistics
# describe() returns a new DataFrame of summary statistics (count, mean, std,
# min, quartiles, max); by default only the numeric columns are summarised.
data.describe()

In [None]:
# Info about datatypes and missing values
# info() prints each column's dtype and non-null count; a non-null count
# lower than the number of rows indicates missing values in that column.
data.info()

In [None]:
# Removing null values
# dropna() returns a new DataFrame without the rows that contain missing values.
# The result is only displayed here, not assigned, so `data` itself is unchanged.
data.dropna().head()

In [None]:
# Collect the DataFrame's column labels into a plain Python list
cols = data.columns.tolist()
print(cols)

In [None]:
# Slicing a DataFrame by row position: keep the first n rows
n = 10
first_n = data.iloc[:n]
print(first_n)

In [None]:
# Position-based indexing with .iloc (integer positions, not index labels)
# returns the nth row as a Series: positions are 0-based, hence n-1
# (n is defined in the slicing cell above)
data.iloc[n-1]

In [None]:
# returns a range of rows and columns, selected by position
# .iloc slices exclude the end: rows at positions 2..9, columns at positions 1..2
data.iloc[2:10, 1:3]

In [None]:
# Add a new column (the scalar 0 is broadcast to every row)
data['newCol'] = 0
print(data.head())

# Drop the new column.
# drop() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back; otherwise 'newCol' would still be present
# in the preview below and in every later cell.
data = data.drop('newCol', axis='columns')
data.head()

In [None]:
# Applying a function to every value of a column

sepal_lengths = data['SepalLengthCm']
mean = sepal_lengths.mean()


def do(x):
    """Return x shifted by the mean sepal length, i.e. x - mean."""
    return x - mean


# Series.apply calls `do` on each element and collects the results in a new Series
mean_sepal = sepal_lengths.apply(do)
print(mean_sepal.head())

In [None]:
# Sort rows by petal length, smallest first (ascending order is the default).
# sort_values returns a sorted copy, so `data` keeps its original row order.
data.sort_values(by='PetalLengthCm').head()

In [None]:
# Finding the number of data points in each class
# value_counts() returns a Series mapping each distinct Species value to the
# number of rows that have it, ordered from most to least frequent.
data['Species'].value_counts()

In [None]:
# Cumulative sum of the value counts
# Running total over the per-class counts, in the order value_counts() returns
# them; the last entry equals the total number of (non-null) Species values.
data['Species'].value_counts().cumsum()

In [None]:
# dropping duplicate values
# drop_duplicates() returns a new DataFrame keeping the first occurrence of each
# fully identical row. The result is only displayed here, so `data` is unchanged.
data.drop_duplicates().head()

In [None]:
# Removing a column from the dataset: drop() hands back a new frame without
# 'PetalLengthCm'; only that copy is previewed, `data` still has the column.
# (Rows can be removed the same way by passing index labels instead.)
data.drop(columns='PetalLengthCm').head()

Pivot tables

In [None]:
# Creating a pivot table that sums SepalLengthCm within each Species.
# The aggregation is given as the string 'sum' rather than np.sum: recent
# pandas versions warn when a NumPy reduction is passed as aggfunc, and the
# string alias selects the same built-in group-wise sum without the warning.
data.pivot_table(values='SepalLengthCm', index='Species', aggfunc='sum')

In [None]:
# Creating a pivot table with the mean SepalLengthCm of each Species.
# The aggregation is given as the string 'mean' rather than np.mean: recent
# pandas versions warn when a NumPy reduction is passed as aggfunc, and the
# string alias selects the same built-in group-wise mean without the warning.
data.pivot_table(values='SepalLengthCm', index='Species', aggfunc='mean')

# Visualizations

In [None]:
# Overlay one sepal-length vs. sepal-width scatter per species on a shared Axes.
# Each entry is (value in the Species column, marker colour, legend label).
species_styles = [
    ('Iris-setosa', 'red', 'Setosa'),
    ('Iris-versicolor', 'green', 'Versicolor'),
    ('Iris-virginica', 'blue', 'Virginica'),
]

ax = None  # the first call creates the Axes; later calls draw onto the same one
for species_name, colour, legend_label in species_styles:
    ax = data[data.Species == species_name].plot.scatter(
        x='SepalLengthCm',
        y='SepalWidthCm',
        color=colour,
        label=legend_label,
        ax=ax,
    )

ax.set_title("Scatter plot between sepal length and sepal width")
plt.show()

# Exercises

1. Plot a histogram of each feature column, separately for each species (the target label)
2. Get unique values of each column
3. Filter rows based on column values
4. Scatter plot between Petal length and Petal width for each Species
5. Make a new column with sum of the other columns
6. Save a DataFrame as a CSV file and as an Excel file