In [1]:
#!pip install pandas
import pandas as pd
import numpy as np

# Pandas Series
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index. 
 The basic method to create a Series is to call:
pandas.Series(data, index=index)
 Here, data can be many different things:
a Python dict
an ndarray
a scalar value (like 5)

In [2]:
# Series from a dict: the explicit index picks and orders the labels.
# A label that is not a key of the dict ("d") is filled with NaN.
d = {"a": 0.0, "b": 1.0, "c": 2.0}
s = pd.Series(data=d, index=list("bcda"))
print(s)

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


In [3]:
# A Series supports dict-style access by label.
# Read the value stored under label "a".
print(s["a"])
# Overwrite the value stored under label "d".
s["d"] = 5
print(s)


0.0
b    1.0
c    2.0
d    5.0
a    0.0
dtype: float64


In [4]:
# Series from NumPy arrays.
# s: five uniform random values with explicit string labels.
s = pd.Series(data=np.random.rand(5), index=list("abcde"))
# t: five standard-normal values; with no index given, labels default to 0..4.
t = pd.Series(data=np.random.randn(5))
print(s)
print(t)

a    0.978440
b    0.882476
c    0.030387
d    0.674967
e    0.764174
dtype: float64
0    0.449555
1   -0.361424
2   -0.892210
3   -0.181097
4    0.302087
dtype: float64


In [5]:
# A Series behaves like an ndarray: NumPy functions accept it directly.
print(np.mean(s))
# Boolean-mask selection: keep only the entries of t above its own mean.
t[t.gt(np.mean(t))]

0.6660886182970035


0    0.449555
4    0.302087
dtype: float64

In [6]:
# Series from a scalar: the value is repeated once for every index label.
pd.Series(5.0, index=list("abcde"))

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

# Pandas DataFrame- DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
Like Series, DataFrame accepts many different kinds of input:
1. Dict of 1D ndarrays, lists, dicts, or Series
2. 2-D numpy.ndarray
3. A Series
4. Another DataFrame
5. Reading files from disk 
For the first four methods, the syntax is:
pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)

In [7]:
# Method 1: DataFrame from a dictionary (the dict keys become column labels).

# 1. Dictionary of 1-D arrays, with explicit row labels.
df1 = pd.DataFrame(
    data={'a': np.array([1, 2]), 'b': np.array([3, 4])},
    index=['first', 'second'],
)
print(df1)

# 2. Dictionary of lists, with the same row labels.
df2 = pd.DataFrame(
    data={'a': [1, 2], 'b': [3, 4]},
    index=['first', 'second'],
)
print(df2)

# 3. Dictionary of dictionaries: the inner keys become row labels, and
#    row/column pairs with no value are filled with NaN.
df3 = pd.DataFrame(data={'a': {1: 5, 2: 7}, 'b': {0: 7}})
print(df3)
 

        a  b
first   1  3
second  2  4
        a  b
first   1  3
second  2  4
     a    b
1  5.0  NaN
2  7.0  NaN
0  NaN  7.0


In [8]:
# Method 2: Pandas DataFrame from a 2-D NumPy array.
# Rows are labelled via `index`, columns via `columns`, and `dtype` casts
# every value to float64.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
df4 = pd.DataFrame(
    np.array([[1, 2], [2, 3]]),
    index=['first', 'second'],
    columns=['a', 'b'],
    dtype=float,
)
print(df4)

          a    b
first   1.0  2.0
second  2.0  3.0


In [9]:
# Method 3: DataFrame from a Series.
# The Series labels become the row index and its values fill a single column
# labelled 0.
df5 = pd.DataFrame(data=pd.Series({'a': 1, 'b': 'jasmeet'}))
print(df5)

         0
a        1
b  jasmeet


In [10]:
# Method 4: build new pandas objects from existing DataFrames.

# Select the column at position 1 of df1 (all rows); the result is a Series.
df6 = df1.iloc[:, 1]
print(df6)

# Combine objects with concatenate.
# Syntax: pandas.concat(objs, axis=0, keys=None)
data = [df1["a"], df2["a"]]
headers = ["df1", "df2"]

# axis=0 stacks the pieces row-wise; `keys` adds an outer index level.
df7 = pd.concat(objs=data, keys=headers, axis=0)
print(df7)

# axis=1 places the pieces side by side; `keys` become the column labels.
df8 = pd.concat(objs=data, keys=headers, axis=1)
print(df8)

first     3
second    4
Name: b, dtype: int32
df1  first     1
     second    2
df2  first     1
     second    2
Name: a, dtype: int64
        df1  df2
first     1    1
second    2    2


# Reading Files From Disk- The pandas I/O API is a set of top level reader functions accessed like pandas.read_csv() that generally return a pandas object. The corresponding writer functions are object methods that are accessed like DataFrame.to_csv(). 
read_csv is the most common method used to read datasets in Data Analytics and Machine learning experiments.
The most commonly used read_csv parameters are:
1. filepath_or_buffer: various
2. sep: str, defaults to ',' for read_csv(), \t for read_table()
3. header: int, default 'infer'
4. names: array-like, default None
5. usecols: list-like or callable, default None
6. skiprows: list-like or integer, default None
7. nrows: int, default None
#1 is compulsory; the rest are optional

In [11]:
# Read the Titanic dataset into a DataFrame.
# NOTE: this is an absolute, machine-specific path; point it at wherever
# titanic.csv is stored locally.
df=pd.read_csv('C:/Machine Learning/ML_Datasets/titanic.csv')
print(df)
# TODO: record the dataset's public download location here. The previous
# comment linked to a local Jupyter server URL with an embedded access token,
# which is neither a usable download source nor safe to share.

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [12]:
# read_csv with optional arguments:
#   skiprows=10       skip the first 10 lines of the file
#   nrows=100         read at most 100 rows
#   header=None       do not take column names from the file
#   usecols=[0, 1, 3] keep only the columns at positions 0, 1 and 3
#   names             label the kept columns a, b and c
df9 = pd.read_csv(
    'C:/Machine Learning/ML_Datasets/titanic.csv',
    skiprows=10,
    nrows=100,
    header=None,
    usecols=[0, 1, 3],
    names=['a', 'b', 'c'],
)
print(df9)

      a  b                                    c
0    10  1  Nasser, Mrs. Nicholas (Adele Achem)
1    11  1      Sandstrom, Miss. Marguerite Rut
2    12  1             Bonnell, Miss. Elizabeth
3    13  0       Saundercock, Mr. William Henry
4    14  0          Andersson, Mr. Anders Johan
..  ... ..                                  ...
95  105  0       Gustafsson, Mr. Anders Vilhelm
96  106  0                Mionoff, Mr. Stoytcho
97  107  1     Salkjelsvik, Miss. Anna Kristine
98  108  1               Moss, Mr. Albert Johan
99  109  0                      Rekic, Mr. Tido

[100 rows x 3 columns]


# to_csv- The Series and DataFrame objects have an instance method to_csv which allows storing the contents of the object as a comma-separated-values file. 
The function takes a number of arguments. Only the first is required.
1. path_or_buf: A string path to the file to write or a file object. 
2. sep : Field delimiter for the output file (default “,”)
3. columns: Columns to write (default None)
4. header: Whether to write out the column names (default True)
5. index: whether to write row (index) names (default True)
6. encoding: a string representing the encoding to use if the contents are non-ASCII, for Python versions prior to 3
#1 is compulsory; the rest are optional

In [14]:
# Write df9 to a CSV file; index=False leaves the row labels out of the file.
df9.to_csv(path_or_buf='C:/Machine Learning/ML_Datasets/jas1.csv', index=False)

# Pandas DataFrame Attributes- Each DataFrame exposes attributes that can be used in analysis


In [15]:
#1. at - access a single value by row label and column label.
# Row label 1 is the second row here, because the index starts at 0.
df.at[1, "Name"]

'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

In [16]:
#2. iat - access a single value by integer row and column position.
# Positions are 0-based: row position 1, column position 3.
df.iat[1, 3]

'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

In [17]:
#3. axes - list holding the row index followed by the column index.
df.axes

[RangeIndex(start=0, stop=891, step=1),
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object')]

In [18]:
#4. columns - the column labels of the DataFrame.
print(df.columns)
# The labels are iterable, e.g. to build an upper-cased list of names.
[label.upper() for label in df.columns]

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


['PASSENGERID',
 'SURVIVED',
 'PCLASS',
 'NAME',
 'SEX',
 'AGE',
 'SIBSP',
 'PARCH',
 'TICKET',
 'FARE',
 'CABIN',
 'EMBARKED']

In [19]:
#5. index - the row labels of the DataFrame.
df.index

RangeIndex(start=0, stop=891, step=1)

In [20]:
#6. dtypes - the data type of each column.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [21]:
#7. iloc - purely position-based (0-based integer) indexing.
# The row at position 1, i.e. the second row.
print(df.iloc[1])
# All rows, columns at positions 0, 1 and 2.
print(df.iloc[:, 0:3])

PassengerId                                                    2
Survived                                                       1
Pclass                                                         1
Name           Cumings, Mrs. John Bradley (Florence Briggs Th...
Sex                                                       female
Age                                                           38
SibSp                                                          1
Parch                                                          0
Ticket                                                  PC 17599
Fare                                                     71.2833
Cabin                                                        C85
Embarked                                                       C
Name: 1, dtype: object
     PassengerId  Survived  Pclass
0              1         0       3
1              2         1       1
2              3         1       3
3              4         1       1
4              5      

In [22]:
#8. loc - access rows and columns by label(s) or a boolean array.
# Label slices include the end label, so :100 returns labels 0 through 100.
df.loc[:100, "Survived"]


0      0
1      1
2      1
3      1
4      0
      ..
96     0
97     1
98     1
99     0
100    0
Name: Survived, Length: 101, dtype: int64

In [23]:
#9. ndim, shape, size - number of dimensions, (rows, columns) tuple, and
# total number of elements in the DataFrame, printed one per line.
print(df.ndim, df.shape, df.size, sep="\n")

2
(891, 12)
10692


In [24]:
#10. values - NumPy representation of the data.
# Whole DataFrame as a 2-D array.
print(df.values)
# A single column as a 1-D array.
df["Age"].values

[[1 0 3 ... 7.25 nan 'S']
 [2 1 1 ... 71.2833 'C85' 'C']
 [3 1 3 ... 7.925 nan 'S']
 ...
 [889 0 3 ... 23.45 nan 'S']
 [890 1 1 ... 30.0 'C148' 'C']
 [891 0 3 ... 7.75 nan 'Q']]


array([22.  , 38.  , 26.  , 35.  , 35.  ,   nan, 54.  ,  2.  , 27.  ,
       14.  ,  4.  , 58.  , 20.  , 39.  , 14.  , 55.  ,  2.  ,   nan,
       31.  ,   nan, 35.  , 34.  , 15.  , 28.  ,  8.  , 38.  ,   nan,
       19.  ,   nan,   nan, 40.  ,   nan,   nan, 66.  , 28.  , 42.  ,
         nan, 21.  , 18.  , 14.  , 40.  , 27.  ,   nan,  3.  , 19.  ,
         nan,   nan,   nan,   nan, 18.  ,  7.  , 21.  , 49.  , 29.  ,
       65.  ,   nan, 21.  , 28.5 ,  5.  , 11.  , 22.  , 38.  , 45.  ,
        4.  ,   nan,   nan, 29.  , 19.  , 17.  , 26.  , 32.  , 16.  ,
       21.  , 26.  , 32.  , 25.  ,   nan,   nan,  0.83, 30.  , 22.  ,
       29.  ,   nan, 28.  , 17.  , 33.  , 16.  ,   nan, 23.  , 24.  ,
       29.  , 20.  , 46.  , 26.  , 59.  ,   nan, 71.  , 23.  , 34.  ,
       34.  , 28.  ,   nan, 21.  , 33.  , 37.  , 28.  , 21.  ,   nan,
       38.  ,   nan, 47.  , 14.5 , 22.  , 20.  , 17.  , 21.  , 70.5 ,
       29.  , 24.  ,  2.  , 21.  ,   nan, 32.5 , 32.5 , 54.  , 12.  ,
         nan, 24.  ,

# Indexing and Iteration Functions- Along with at, iat, loc, and iloc, which are used for indexing, the following methods are used for inspecting and iterating over a DataFrame. The most common functions are explained below

In [None]:
#1. head - return the first n rows (n defaults to 5).
df.head()

In [None]:
#2. tail - return the last n rows (n defaults to 5); here the last 10.
df.tail(n=10)

In [None]:
#3. insert - insert a column into the DataFrame at a given position.
# Syntax: DataFrame.insert(loc, column, value, allow_duplicates=False)
# Appends a constant column 'Ones' after the last existing column.
# insert() modifies df in place and raises ValueError when the column already
# exists, so the guard keeps this cell safe to run more than once.
if 'Ones' not in df.columns:
    df.insert(len(df.columns), 'Ones', 1)
df.head()

In [None]:
#4. items - iterate over (column name, Series) pairs.
# Each pair is unpacked into the column label `a` and its column data `b`.
for a, b in df.items():
    print(a, b)


In [None]:
#5. items (formerly iteritems) - iterate over (column name, Series) pairs.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in pandas 2.0;
# DataFrame.items() yields the same pairs. Here each pair is kept as one tuple.
for a in df.items():
    print(a)

In [None]:
#6. iterrows - iterate over the rows as (index, Series) pairs.
# Each pair is printed as a single tuple.
for a in df.iterrows():
    print(a)

In [None]:
#7. keys - the index for a Series, the column labels for a DataFrame.
print(df.keys())
# Pick the label at position 1 of the column labels.
output_features = df.keys()[1]
print(output_features)

In [None]:
#8. where - keep values where the condition is True, replace the others.
# df11 holds 0..9 laid out as 5 rows x 2 columns.
df11 = pd.DataFrame(data=np.arange(10).reshape(5, 2), columns=['A', 'B'])
# Multiples of 3 are kept; every other value is replaced by its negation.
df11.where(df11 % 3 == 0, -df11)

# Binary Operations- Pandas provides a set of methods that perform element-wise operations


In [None]:
#1. add - DataFrame.add(other, axis='columns', level=None, fill_value=None)
# Element-wise sum of the 'Age' and 'Ones' columns, stored back into 'Age'.
df["Age"] = df["Age"].add(df["Ones"])
df.head()

In [None]:
#2. subtract - DataFrame.sub(other[, axis, level, fill_value])
# Subtract the scalar 1 from every 'Age' value and store the result back.
df["Age"] = df["Age"].sub(1, axis=0)
df.head()

In [None]:
#3. multiplication - DataFrame.mul(other[, axis, level, fill_value])
#4. division - DataFrame.div(other[, axis, level, fill_value])

df12 = pd.DataFrame(data={'angles': [0, 3, 4], 'degrees': [360, 180, 360]})
# Double 'angles' and then halve it again: the values come back unchanged,
# but div() returns floats.
df12['angles'] = df12['angles'].mul(2).div(2)
print(df12)

In [None]:
#5. dot - matrix multiplication; for two Series this is their inner product.
df12["angles"].dot(df12["degrees"])

In [None]:
#6. mod - element-wise modulo of 'degrees' by 'angles'.
df12["degrees"].mod(df12["angles"])

In [None]:
#7. pow - element-wise exponentiation; here every 'degrees' value is squared.
df12["degrees"].pow(2)

In [None]:
#8. Greater than - DataFrame.gt(other[, axis, level])
# Similarly, lt, ge, le, ne and eq give less than, greater than or equal to,
# less than or equal to, not equal to, and equal to respectively.
df12["degrees"].gt(200)