# Pandas Tutorial

In [1]:
import pandas as pd
# check pandas version
print(pd.__version__)


2.2.1


# Pandas Series (create, manipulate, querry, delete)

In [2]:
# Creating a series from a list
listA=[0,1,2,3,4]
seriesA=pd.Series(listA)

print(seriesA)


0    0
1    1
2    2
3    3
4    4
dtype: int64


In [3]:
# Resetting the index of a series
order= [1,2,3,4,5]
seriesB =pd.Series(listA, index=order) 
print(seriesB)


1    0
2    1
3    2
4    3
5    4
dtype: int64


In [6]:
# using numpy with pandas
import numpy as np
n=np.random.randn(5) # creating a random NDarray

print(n)


[ 0.8321988   0.22844108 -0.03754953 -1.11759963  0.47515751]


In [7]:
# using numpy with pandas
import numpy as np
n=np.random.randn(5) # creating a random NDarray
index= ["a", "b", "c", "d", "e"]
seriesB= pd.Series(n, index=index)
print(seriesB)



a    1.724941
b   -0.459080
c    0.639798
d   -1.051069
e    0.629558
dtype: float64


In [8]:
# Creating a series from a python dictionary
dictA={"a":1, "b":2, "c":3, "d":4, "e":5} # keys and values
seriesC= pd.Series(dictA)

print(seriesC)


a    1
b    2
c    3
d    4
e    5
dtype: int64


In [9]:
# Modifying the index of series in Pandas
print(seriesA) # from above


0    0
1    1
2    2
3    3
4    4
dtype: int64


In [10]:
# Modifying the index of series in Pandas
print(seriesA)
seriesA.index= ["A", "B", "C", "D", "E"]
print(seriesA)


0    0
1    1
2    2
3    3
4    4
dtype: int64
A    0
B    1
C    2
D    3
E    4
dtype: int64


# Basic tricks in Pandas

# Slicing

In [11]:
# slicing-cutting some portion of a series
print(seriesA)


A    0
B    1
C    2
D    3
E    4
dtype: int64


In [12]:
print(seriesA[:3]) # slices upto to the third index


A    0
B    1
C    2
dtype: int64


In [13]:
print(seriesA[:-1]) # slices upto -1 from last index


A    0
B    1
C    2
D    3
dtype: int64


In [14]:
print(seriesA[:-2]) # slices upto -2 from last index


A    0
B    1
C    2
dtype: int64


In [16]:
print(seriesA[2:]) # slices from index 2 upto last index


C    2
D    3
E    4
dtype: int64


In [17]:
print(seriesA[-2:]) # slices from index 2 upto last index


D    3
E    4
dtype: int64


In [18]:
print(seriesA[:]) # selects all indices


A    0
B    1
C    2
D    3
E    4
dtype: int64


In [19]:
print(seriesA[:3]) # slices from first index upto third index


A    0
B    1
C    2
dtype: int64


# Appending:

In [25]:
print(seriesA)

print("\n",seriesC)


A    0
B    1
C    2
D    3
E    4
dtype: int64

 a    1
b    2
c    3
d    4
e    5
dtype: int64


In [29]:
seriesD= seriesA.append(seriesC)
print(seriesD)


AttributeError: 'Series' object has no attribute 'append'

In [None]:
# The error "AttributeError: 'Series' object has no attribute 'append'" occurs because you're trying to use the append() method on a pandas Series object, which is not a valid operation.

# In pandas, you cannot directly append one Series to another using the append() method. The append() method is used for concatenating pandas objects along the row axis (axis=0).

# To concatenate two Series objects, you can use one of the following methods:

#1. Use the pd.concat() function:#

import pandas as pd

seriesA = pd.Series([1, 2, 3])
seriesB = pd.Series([4, 5, 6])

seriesD = pd.concat([seriesA, seriesB], ignore_index=True)
print(seriesD)

#2. Perform element-wise operations (like addition, subtraction, etc.) on the Series objects:#

seriesA = pd.Series([1, 2, 3])
seriesB = pd.Series([4, 5, 6])

seriesD = seriesA.add(seriesB, fill_value=0)
print(seriesD)

# 3. Convert the Series objects to lists, concatenate the lists, and then create a new Series:#
seriesA = pd.Series([1, 2, 3])
seriesB = pd.Series([4, 5, 6])

listD = list(seriesA) + list(seriesB)
seriesD = pd.Series(listD)
print(seriesD)

# These methods will allow you to combine or concatenate two or more Series objects in pandas. Choose the method that best suits your use case.


In [31]:
print(seriesA)

print("\n",seriesC)


A    0
B    1
C    2
D    3
E    4
dtype: int64

 a    1
b    2
c    3
d    4
e    5
dtype: int64


In [34]:
# Trying again:

# instead of this : seriesD= seriesA.append(seriesC); we do the following
seriesD = pd.concat([seriesA, seriesC])
print(seriesD)


A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64


Dropping one character/index after appending/concatenating series...

In [35]:
print(seriesD.drop("e"))


A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64


# Series Operations in Pandas

In [37]:
array1= [0,1,2,3,4,5,6,7]
array2=[6,7,8,9,4,5]

print(array1)
print("\n", array2)


[0, 1, 2, 3, 4, 5, 6, 7]

 [6, 7, 8, 9, 4, 5]


In [38]:
seriesE=pd.Series(array2)
print(seriesE)


0    6
1    7
2    8
3    9
4    4
5    5
dtype: int64


In [39]:
seriesF=pd.Series(array1)
print(seriesF)


0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
dtype: int64


In [40]:
# performing Addition of two pandas series
print(seriesE.add(seriesF))


0     6.0
1     8.0
2    10.0
3    12.0
4     8.0
5    10.0
6     NaN
7     NaN
dtype: float64


In [41]:
# performing Subtraction of two pandas series
print(seriesE.sub(seriesF))


0    6.0
1    6.0
2    6.0
3    6.0
4    0.0
5    0.0
6    NaN
7    NaN
dtype: float64


In [42]:
# performing Multiplication of two pandas series
print(seriesE.mul(seriesF))


0     0.0
1     7.0
2    16.0
3    27.0
4    16.0
5    25.0
6     NaN
7     NaN
dtype: float64


In [43]:
# performing Division of two pandas series
print(seriesE.div(seriesF))


0    inf
1    7.0
2    4.0
3    3.0
4    1.0
5    1.0
6    NaN
7    NaN
dtype: float64


In [44]:
# performing Division of two pandas series
print(seriesF.median())


3.5


In [47]:
# performing Division of two pandas series
print("Median of series F is:", seriesF.median())
print("Max of series F is:", seriesF.max())
print("Min of series F is:", seriesF.min())


Median of series F is: 3.5
Max of series F is: 7
Min of series F is: 0


# Creating DataFrames in Pandas

In [51]:
# dates
dates= pd.date_range("today's dates", periods= 6) # define time sequence as index
print(dates)


DateParseError: Unknown datetime string format, unable to parse: today's dates

In [52]:
# The error "DateParseError" occurs because the pd.date_range() function is unable to parse the string "today's dates" into a valid date format.

# To solve this issue, you need to provide a valid date string or a datetime object as the start parameter for pd.date_range(). Here's an example of how you can fix the code:

import pandas as pd
from datetime import date

# Get today's date
today = date.today()

# Create a date range starting from today for the next 6 days
dates = pd.date_range(start=today, periods=6)
print(dates)

# In this example, we're using the `date.today()` function from the `datetime` module to get today's date. Then, we pass this date object as the `start` parameter to `pd.date_range()` along with `periods=6` to create a date range for the next 6 days starting from today.


DatetimeIndex(['2024-05-31', '2024-06-01', '2024-06-02', '2024-06-03',
               '2024-06-04', '2024-06-05'],
              dtype='datetime64[ns]', freq='D')


In [53]:
# Alternatively, you can provide an explicit date string in a format that pandas can recognize, like this:

import pandas as pd

# Create a date range starting from a specific date for the next 6 days
dates = pd.date_range(start="2023-05-31", periods=6)
print(dates)

# In this case, the date string "2023-05-31" is in the YYYY-MM-DD format, which pandas can parse correctly.

#By providing a valid date or datetime object as the start parameter, you should be able to resolve the DateParseError and create the desired date range using pd.date_range().


DatetimeIndex(['2023-05-31', '2023-06-01', '2023-06-02', '2023-06-03',
               '2023-06-04', '2023-06-05'],
              dtype='datetime64[ns]', freq='D')


In [55]:
# Create a date range starting from today for the next 6 days
dates = pd.date_range("today", periods=6) # this will print date pplus time
print(dates)


DatetimeIndex(['2024-05-31 16:39:59.764336', '2024-06-01 16:39:59.764336',
               '2024-06-02 16:39:59.764336', '2024-06-03 16:39:59.764336',
               '2024-06-04 16:39:59.764336', '2024-06-05 16:39:59.764336'],
              dtype='datetime64[ns]', freq='D')


In [57]:
# making a vector from dates
dates = pd.date_range("today", periods=6) # this will print date pplus time
num_Array= np.random.randn(6,4) # Imports numpy random array,, with 6rows and 4columns
print(num_Array) 


[[ 9.00450352e-01  7.43906484e-01 -5.75060616e-01 -1.30173847e+00]
 [ 1.04682044e-01 -1.06894241e+00 -1.49954523e+00  1.10308542e-01]
 [ 8.51640951e-01  1.04119725e-03  9.47938659e-02  2.33554344e-01]
 [-8.11583085e-01  3.52524498e-01  1.07944630e+00 -4.54606199e-01]
 [-9.36220042e-01  1.01842560e-01 -2.18058438e-01  5.49072394e-01]
 [-3.77325063e-01  9.18773124e-01  8.50236569e-01 -1.03434920e+00]]


In [60]:
# making a table from dates
dates = pd.date_range("today", periods=6) # this will print date pplus time
num_Array= np.random.randn(6,4) # Imports numpy random array,, with 6rows and 4columns
ListedColumns= ["A", "B", "C", "D"] # use the list as the column name for the table being created

df1=pd.DataFrame(num_Array, index=dates, columns=ListedColumns)
print(df1) 


                                   A         B         C         D
2024-05-31 16:54:09.125595 -0.781370 -1.010170  0.870905 -1.250629
2024-06-01 16:54:09.125595 -1.000946 -0.755037  0.846427  0.417010
2024-06-02 16:54:09.125595  0.118349 -0.666201 -1.922600 -0.245586
2024-06-03 16:54:09.125595  0.588592  1.391292 -0.551370 -0.239988
2024-06-04 16:54:09.125595 -0.462982 -0.059681 -0.109990  0.543915
2024-06-05 16:54:09.125595  0.569426 -1.616121 -0.158076  1.604202


In [61]:
#create dataframe with dictionary array

data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']
        } # np.nan means no value

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] # will be used as index

DataFrame2 = pd.DataFrame(data, index=labels)
print(DataFrame2)

# This code creates a pandas DataFrame `df2` from a dictionary `data` containing lists for each column. The keys of the dictionary ('animal', 'age', 'visits', 'priority') become the column names of the DataFrame, and the corresponding lists are used as the data for each column.

# The `index=labels` parameter assigns the list `labels` as the row indices (row labels) for the DataFrame.


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  2.0       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [62]:
# checking the datatypes of an array
print (DataFrame2.dtypes)


animal       object
age         float64
visits        int64
priority     object
dtype: object


# Printing the head of the data

In [68]:
# printing the main first items/rows
print (DataFrame2.head()) # prints the first 5


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no


In [65]:
print (DataFrame2.head(2)) # prints the first 2


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes


In [67]:
# creating a dataframe from a portion of the other
DataFrame3= DataFrame2.head(6)
print (DataFrame3)


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  2.0       3       no


# Printing the tail of the data

In [69]:
# printing the main last items/rows
print (DataFrame2.tail()) # prints the last 5


  animal  age  visits priority
f    cat  2.0       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [71]:
print (DataFrame2.tail(3)) # prints the last 5


  animal  age  visits priority
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [76]:
print(DataFrame2.index) # prints indexing of the dataframe
print(DataFrame2.columns) # prints column names of the dataframe; we dont put empty braces (call function) at the end because they are objects


Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
Index(['animal', 'age', 'visits', 'priority'], dtype='object')


In [77]:

print(DataFrame2.values) # prints all value names of the dataframe


[['cat' 2.5 1 'yes']
 ['cat' 3.0 3 'yes']
 ['snake' 0.5 2 'no']
 ['dog' nan 3 'yes']
 ['dog' 5.0 2 'no']
 ['cat' 2.0 3 'no']
 ['snake' 4.5 1 'no']
 ['cat' nan 1 'yes']
 ['dog' 7.0 2 'no']
 ['dog' 3.0 1 'no']]


In [78]:
print(DataFrame2.describe()) # Helps us see the Statistical Data of the dataframe


            age     visits
count  8.000000  10.000000
mean   3.437500   1.900000
std    2.007797   0.875595
min    0.500000   1.000000
25%    2.375000   1.000000
50%    3.000000   2.000000
75%    4.625000   2.750000
max    7.000000   3.000000


# Manipulating the DataFrame

In [79]:
# Transporsing
print(DataFrame2.T) # it flips the columns and the indexes, from [6,4], we make it [4,6]


            a    b      c    d    e    f      g    h    i    j
animal    cat  cat  snake  dog  dog  cat  snake  cat  dog  dog
age       2.5  3.0    0.5  NaN  5.0  2.0    4.5  NaN  7.0  3.0
visits      1    3      2    3    2    3      1    1    2    1
priority  yes  yes     no  yes   no   no     no  yes   no   no


In [80]:
# Sorting the Data Frame
print(DataFrame2.sort_values(by= "age"))


  animal  age  visits priority
c  snake  0.5       2       no
f    cat  2.0       3       no
a    cat  2.5       1      yes
b    cat  3.0       3      yes
j    dog  3.0       1       no
g  snake  4.5       1       no
e    dog  5.0       2       no
i    dog  7.0       2       no
d    dog  NaN       3      yes
h    cat  NaN       1      yes


In [81]:
# Slicing the Data Frame
print(DataFrame2[1:3]) # the last value in the index here will always be "n-1" in python


  animal  age  visits priority
b    cat  3.0       3      yes
c  snake  0.5       2       no


In [82]:
# sorting the sliced dataFrame by age
print(DataFrame2.sort_values(by= "age") [1:3])


  animal  age  visits priority
f    cat  2.0       3       no
a    cat  2.5       1      yes


# Querying the dataFrame

In [83]:
# Querying the dataFrame by tag
print(DataFrame2[["age", "visits"]])


   age  visits
a  2.5       1
b  3.0       3
c  0.5       2
d  NaN       3
e  5.0       2
f  2.0       3
g  4.5       1
h  NaN       1
i  7.0       2
j  3.0       1


In [85]:
# Using the integer location
# Querying the dataFrame by i location#### Almost same as slicing
print(DataFrame2.iloc[1:3]) # Query rows 2,3


  animal  age  visits priority
b    cat  3.0       3      yes
c  snake  0.5       2       no


In [89]:
# Copying dataframe A to dataframe B
DataFrame3= DataFrame2.copy()
print(DataFrame3)


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  2.0       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [91]:
# Looking for the null values in the DataFrame
print(DataFrame3.isnull()) # where no value;(NaN), it'll say True , where we have data it'll show false 


   animal    age  visits  priority
a   False  False   False     False
b   False  False   False     False
c   False  False   False     False
d   False   True   False     False
e   False  False   False     False
f   False  False   False     False
g   False  False   False     False
h   False   True   False     False
i   False  False   False     False
j   False  False   False     False


In [95]:
# changing specific cell in the dataFrame using loc for LOCATION
DataFrame3.loc["f","age"]=1.5 # changes the cell (row-f, column-age)
print(DataFrame3)


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  1.5       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [104]:
# Finding mean of all table columns
print(DataFrame3.mean())


TypeError: Could not convert ['catcatsnakedogdogcatsnakecatdogdog' 'yesyesnoyesnononoyesnono'] to numeric

In [None]:
# The typeError above is because some columns have type-'strings, not integers


In [97]:
# Finding mean of a specific column
# Here we'll consider the age column
DataFrame3[["age"]].mean()


age    3.375
dtype: float64

In [105]:
# Get the sum of a column
DataFrame3[["visits"]].sum()


visits    19
dtype: int64

In [107]:
# Get the sum of all columns
print (DataFrame3.sum())


animal      catcatsnakedogdogcatsnakecatdogdog
age                                       27.0
visits                                      19
priority              yesyesnoyesnononoyesnono
dtype: object


In [109]:
stringA= pd.Series(["A", "B", "C", "ABC", np.nan, "CBA", "cow", "owl"])
print(stringA)


0      A
1      B
2      C
3    ABC
4    NaN
5    CBA
6    cow
7    owl
dtype: object


In [110]:
# making a series lowerCase
stringA= pd.Series(["A", "B", "C", "ABC", np.nan, "CBA", "cow", "owl"])
print(stringA.str.lower())


0      a
1      b
2      c
3    abc
4    NaN
5    cba
6    cow
7    owl
dtype: object


In [112]:
# making a series upperCase
stringA= pd.Series(["A", "B", "C", "ABC", np.nan, "CBA", "cow", "owl"])
print(stringA.str.capitalize())


0      A
1      B
2      C
3    Abc
4    NaN
5    Cba
6    Cow
7    Owl
dtype: object


# Operations for DataFrame missing values

In [113]:
# We will first copy df3 into df4
DataFrame4= DataFrame3.copy()
print(DataFrame4)


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  1.5       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [114]:
# Fill the empty cells this way
DataFrame4= DataFrame3.copy()
print(DataFrame4.fillna(4)) # fills all np.nan with the value we put, like here it's 4


  animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  4.0       3      yes
e    dog  5.0       2       no
f    cat  1.5       3       no
g  snake  4.5       1       no
h    cat  4.0       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no


In [116]:
# Get the mean of some cell
DataFrame4= DataFrame3.copy()
meanAge= DataFrame4["age"].mean()
print(DataFrame4["age"].fillna(meanAge))


a    2.500
b    3.000
c    0.500
d    3.375
e    5.000
f    1.500
g    4.500
h    3.375
i    7.000
j    3.000
Name: age, dtype: float64


In [118]:
DataFrame5=DataFrame3.copy()
DataFrame5


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [119]:
# We drop missing data here
DataFrame5=DataFrame3.copy()
DataFrame5.dropna(how="any") # I expect to lose rows/index "d" and "h" due to NaN


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
i,dog,7.0,2,no
j,dog,3.0,1,no


# DataFrame file operations

In [120]:
# We save the DataFrame3 table to our storage as CSV file
DataFrame3.to_csv("animal.csv") # you can put in your desired path


In [121]:
df_animal= pd.read_csv("animal.csv")
df_animal.head(3)


Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority
0,a,cat,2.5,1,yes
1,b,cat,3.0,3,yes
2,c,snake,0.5,2,no


In [123]:
# Saving the dataFrame to excel
DataFrame3.to_excel("animal.xlsx", sheet_name="Sheet1")

df_animal2= pd.read_excel("animal.xlsx", "Sheet1", index_col=None, na_values=["NA"]) # most commonly used in excel
print(df_animal2)


  Unnamed: 0 animal  age  visits priority
0          a    cat  2.5       1      yes
1          b    cat  3.0       3      yes
2          c  snake  0.5       2       no
3          d    dog  NaN       3      yes
4          e    dog  5.0       2       no
5          f    cat  1.5       3       no
6          g  snake  4.5       1       no
7          h    cat  NaN       1      yes
8          i    dog  7.0       2       no
9          j    dog  3.0       1       no
