In [2]:
# The pandas library essentially has three data structures:
# 1. Series
# 2. DataFrame
# 3. Panel

In [4]:
import pandas as pd
import numpy as np
pd.Series(np.random.randn(5))
# The random.randn parameter is part of the NumPy package and it
# generates random numbers. The series function creates a pandas series
# that consists of an index, which is the first column, and the second
# column consists of random values. At the bottom of the output is the
# datatype of the series.

0   -2.819768
1    0.126497
2    2.023404
3    1.647794
4   -0.674231
dtype: float64

In [5]:
# The index of the series can be customized by calling the following:
pd.Series(np.random.randn(5), index = ['a', 'b', 'c', 'd', 'e'])

a   -0.262882
b   -0.131162
c    1.703044
d   -0.700702
e    0.356426
dtype: float64

In [6]:
# A series can be derived from a Python dict too:
d = {'A':10, 'B':20, 'C':30}
pd.Series(d)

A    10
B    20
C    30
dtype: int64

In [7]:
# DataFrame is a 2D data structure with columns that can be of different
# datatypes. It can be seen as a table. A DataFrame can be formed from
# the following data structures:
# A NumPy array
# Lists
# Dicts
# Series
# A 2D NumPy array

In [8]:
# A DataFrame can be created from a dict of series by calling the following
# commands:
d = {'c1': pd.Series(['A', 'B', 'C']), 'c2': pd.Series([1, 2, 3, 4])}
df = pd.DataFrame(d)
df

Unnamed: 0,c1,c2
0,A,1
1,B,2
2,C,3
3,,4


In [9]:
# The DataFrame can be created using a dict of lists too:
d = {'c1': ['A', 'B', 'C', 'D'], 'c2': [1, 2.0, 3.0, 4.0]}
df = pd.DataFrame(d)
df

Unnamed: 0,c1,c2
0,A,1.0
1,B,2.0
2,C,3.0
3,D,4.0


In [13]:
# A Panel is a data structure that handles 3D data. The following command
# is an example of panel data:
d = {'Item1' : pd.DataFrame(np.random.randn(4,3)), 'Item2' : pd.DataFrame(np.random.randn(4, 2))}
pd.Panel(d)
# The preceding command shows that there are 2 DataFrames
# represented by two items. There are four rows represented by four major
# axes and three columns represented by three minor axes.

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2

In [14]:
# To write a data to the .csv file, the following to_csv function can be used:
d = {'c1': pd.Series(['A', 'B', 'C']), 'c2': pd.Series([1, 2., 3., 4.])}
df = pd.DataFrame(d)
df.to_csv('sample_data.csv')

In [15]:
# To read the data from a JSON file, Python's standard json package can
# be used. The following commands help in reading the file:
# import json
# json_data =
# open('Data/Student_Weight_Status_Category_Reporting_Results__Beginning_2010.json')
# data = json.load(json_data)
# json_data.close()
# In the preceding command, the open() function opens a connection to the
# file. The json.load() function loads the data into Python. The
# json_data.close() function closes the connection to the file.
# The pandas library also provides a function to read the JSON file, which
# can be accessed using pd.read_json() .

In [16]:
# Database
# To read data from a database, the following function can be used:
# >>> pd.read_sql_table(table_name, con)
# The preceding command generates a DataFrame. If a table name and an
# SQLAlchemy engine are given, they return a DataFrame. This function
# does not support the DBAPI connection. The following are the description
# of the parameters used:
# table_name : This refers to the name
# con : This refers to the SQLAlchemy of the SQL table in a database
# engine
# The following command reads SQL query into a DataFrame:
# >>> pd.read_sql_query(sql, con)
# The following are the description of the parameters used:
# sql : This refers to the SQL query that is to be executed
# con : This refers to the SQLAlchemy engine

In [17]:
# to check if the location column has missing
# value, the following command can be utilized:
# >>> d['Location 1'].isnull()
# 0 False
# 1 False
# 2 False
# 3 False
# 4 False
# 5 False
# 6 False

In [18]:
# To remove the rows, execute the following command:
# >>> d = d['Location 1'].dropna()
# To remove all the rows with an instance of missing values, use the
# following command:
# >>> d = d.dropna(how='any')

In [22]:
df2 = pd.DataFrame(np.random.randn(5, 3), index = ['a0', 'a10', 'a20', 'a30', 'a40'], columns=['X', 'Y', 'Z'])

In [23]:
df2

Unnamed: 0,X,Y,Z
a0,0.297167,-0.78629,1.245026
a10,-0.488045,-0.354873,0.521944
a20,-0.714129,1.66985,0.286258
a30,1.940126,1.208121,1.465436
a40,-0.968899,-1.27811,1.626933


In [24]:
df2 = df2.reindex(['a0', 'a1', 'a10', 'a11', 'a20', 'a21', 'a30', 'a31', 'a40', 'a41'])
df2

Unnamed: 0,X,Y,Z
a0,0.297167,-0.78629,1.245026
a1,,,
a10,-0.488045,-0.354873,0.521944
a11,,,
a20,-0.714129,1.66985,0.286258
a21,,,
a30,1.940126,1.208121,1.465436
a31,,,
a40,-0.968899,-1.27811,1.626933
a41,,,
