# 10 minutes to pandas
Written by: M.Danish Azeem\
Date: 05.12.2023\
Email:m danishazeem365@gmail.com

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
df = sns.load_dataset("iris")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


# Object creation

In [4]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df1

Unnamed: 0,A,B,C,D
2013-01-01,-0.242641,-0.356031,-0.274859,-1.2594
2013-01-02,-0.125133,-0.738373,0.835656,-0.421399
2013-01-03,1.691598,-0.383428,1.504235,0.565867
2013-01-04,-0.88317,-1.266696,2.326153,2.45535
2013-01-05,-0.119623,0.946998,2.021825,1.850227
2013-01-06,1.209312,-1.545526,-0.264337,-0.158599


In [7]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)


df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [8]:
df = sns.load_dataset("iris")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [9]:
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

# Viewing data

In [10]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [11]:
df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [12]:
df.index

RangeIndex(start=0, stop=150, step=1)

In [13]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [14]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [15]:
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [16]:
df.to_numpy()

array([[5.1, 3.5, 1.4, 0.2, 'setosa'],
       [4.9, 3.0, 1.4, 0.2, 'setosa'],
       [4.7, 3.2, 1.3, 0.2, 'setosa'],
       [4.6, 3.1, 1.5, 0.2, 'setosa'],
       [5.0, 3.6, 1.4, 0.2, 'setosa'],
       [5.4, 3.9, 1.7, 0.4, 'setosa'],
       [4.6, 3.4, 1.4, 0.3, 'setosa'],
       [5.0, 3.4, 1.5, 0.2, 'setosa'],
       [4.4, 2.9, 1.4, 0.2, 'setosa'],
       [4.9, 3.1, 1.5, 0.1, 'setosa'],
       [5.4, 3.7, 1.5, 0.2, 'setosa'],
       [4.8, 3.4, 1.6, 0.2, 'setosa'],
       [4.8, 3.0, 1.4, 0.1, 'setosa'],
       [4.3, 3.0, 1.1, 0.1, 'setosa'],
       [5.8, 4.0, 1.2, 0.2, 'setosa'],
       [5.7, 4.4, 1.5, 0.4, 'setosa'],
       [5.4, 3.9, 1.3, 0.4, 'setosa'],
       [5.1, 3.5, 1.4, 0.3, 'setosa'],
       [5.7, 3.8, 1.7, 0.3, 'setosa'],
       [5.1, 3.8, 1.5, 0.3, 'setosa'],
       [5.4, 3.4, 1.7, 0.2, 'setosa'],
       [5.1, 3.7, 1.5, 0.4, 'setosa'],
       [4.6, 3.6, 1.0, 0.2, 'setosa'],
       [5.1, 3.3, 1.7, 0.5, 'setosa'],
       [4.8, 3.4, 1.9, 0.2, 'setosa'],
       [5.0, 3.0, 1.6, 0.

In [17]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [18]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
sepal_length,5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9,...,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9
sepal_width,3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,...,3.1,3.1,2.7,3.2,3.3,3.0,2.5,3.0,3.4,3.0
petal_length,1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,...,5.6,5.1,5.1,5.9,5.7,5.2,5.0,5.2,5.4,5.1
petal_width,0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,...,2.4,2.3,1.9,2.3,2.5,2.3,1.9,2.0,2.3,1.8
species,setosa,setosa,setosa,setosa,setosa,setosa,setosa,setosa,setosa,setosa,...,virginica,virginica,virginica,virginica,virginica,virginica,virginica,virginica,virginica,virginica


In [19]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,species,sepal_width,sepal_length,petal_width,petal_length
0,setosa,3.5,5.1,0.2,1.4
1,setosa,3.0,4.9,0.2,1.4
2,setosa,3.2,4.7,0.2,1.3
3,setosa,3.1,4.6,0.2,1.5
4,setosa,3.6,5.0,0.2,1.4
...,...,...,...,...,...
145,virginica,3.0,6.7,2.3,5.2
146,virginica,2.5,6.3,1.9,5.0
147,virginica,3.0,6.5,2.0,5.2
148,virginica,3.4,6.2,2.3,5.4


In [20]:
df.sort_values(by="species")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
27,5.2,3.5,1.5,0.2,setosa
28,5.2,3.4,1.4,0.2,setosa
29,4.7,3.2,1.6,0.2,setosa
30,4.8,3.1,1.6,0.2,setosa
...,...,...,...,...,...
119,6.0,2.2,5.0,1.5,virginica
120,6.9,3.2,5.7,2.3,virginica
121,5.6,2.8,4.9,2.0,virginica
111,6.4,2.7,5.3,1.9,virginica


# Getitem ([])

In [21]:
df["petal_length"]

0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: petal_length, Length: 150, dtype: float64

In [22]:
df[0:4]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa


In [23]:
df[4:8]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa


# Selection by label

In [24]:
df.loc[[0]]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [25]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [26]:
df.loc[:, ["sepal_length", "species"]]


Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


In [27]:
df.loc["1":"3", ["sepal_length", "species"]]

Unnamed: 0,sepal_length,species
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
5,5.4,setosa
6,4.6,setosa
7,5.0,setosa
8,4.4,setosa
9,4.9,setosa
10,5.4,setosa


In [28]:
df.loc[[5], "species"]


5    setosa
Name: species, dtype: object

In [29]:
# df.at["sepal_length"[0], "sepal_length"]

df.at[df.index[4], "sepal_length"]



5.0

# Selection by position

In [30]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [31]:
df.iloc[3]

sepal_length       4.6
sepal_width        3.1
petal_length       1.5
petal_width        0.2
species         setosa
Name: 3, dtype: object

df: This is the DataFrame you're working with.

.iloc: This is a pandas DataFrame attribute that is used for integer-location based indexing.

[4:5, 0:3]: This is the selection part. It consists of two slices separated by a comma. The first slice (4:5) refers to rows, and the second slice (0:3) refers to columns.

4:5: This indicates that you want to select rows starting from index 4 up to (but not including) index 5. In Python, indexing starts from 0, so this is selecting the fifth row of the DataFrame.

0:3: This indicates that you want to select columns starting from index 0 up to (but not including) index 3. It selects the columns at positions 0, 1, and 2.

Putting it all together, the code is selecting a specific subset of your DataFrame, specifically the fifth row and the columns at positions 0, 1, and 2.

In [32]:
df.iloc[4:5, 0:3]


Unnamed: 0,sepal_length,sepal_width,petal_length
4,5.0,3.6,1.4


In [33]:
df.iloc[1:5, 0:3]

Unnamed: 0,sepal_length,sepal_width,petal_length
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4


In [34]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,sepal_length,petal_length
1,4.9,1.4
2,4.7,1.3
4,5.0,1.4


[1:3, :]: This is the selection part. It consists of two slices separated by a comma. The first slice (1:3) refers to rows, and the second slice (:) refers to all columns.

1:3: This indicates that you want to select rows starting from index 1 up to (but not including) index 3. In Python, indexing starts from 0, so this is selecting the second and third rows of the DataFrame.

: This indicates that you want to select all columns.

In [35]:
df.iloc[1:3, :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [36]:
df.iloc[:, 1:3]


Unnamed: 0,sepal_width,petal_length
0,3.5,1.4
1,3.0,1.4
2,3.2,1.3
3,3.1,1.5
4,3.6,1.4
...,...,...
145,3.0,5.2
146,2.5,5.0
147,3.0,5.2
148,3.4,5.4


df: This is the DataFrame you're working with.

.iloc: This is a pandas DataFrame attribute that is used for integer-location based indexing.

[1, 1]: This is the selection part. It consists of two indices separated by a comma. The first index (1) refers to the row, and the second index (1) refers to the column.

Putting it all together, the code is selecting the element at the second row and second column of your DataFrame.

In [37]:
df.iloc[1, 1]


3.0

In [38]:
df.iat[1, 1]

3.0

# Boolean indexing

In [39]:
df[df["sepal_width"] > 3.5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
10,5.4,3.7,1.5,0.2,setosa
14,5.8,4.0,1.2,0.2,setosa
15,5.7,4.4,1.5,0.4,setosa
16,5.4,3.9,1.3,0.4,setosa
18,5.7,3.8,1.7,0.3,setosa
19,5.1,3.8,1.5,0.3,setosa
21,5.1,3.7,1.5,0.4,setosa
22,4.6,3.6,1.0,0.2,setosa


In [40]:
print(df.dtypes)


sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object


In [41]:
df['sepal_length'] = pd.to_numeric(df['sepal_width'], errors='coerce')
df['sepal_length']

0      3.5
1      3.0
2      3.2
3      3.1
4      3.6
      ... 
145    3.0
146    2.5
147    3.0
148    3.4
149    3.0
Name: sepal_length, Length: 150, dtype: float64

In [42]:
# df[df > 0]
selected_columns = ['sepal_length', 'sepal_width']
filtered_df = df["sepal_length" ][df["sepal_width"] > 4]
filtered_df


15    4.4
32    4.1
33    4.2
Name: sepal_length, dtype: float64

The code df2 = df.copy() creates a copy of the DataFrame df and assigns it to the variable df2. This is a common practice when you want to work with a copy of a DataFrame, leaving the original DataFrame unchanged.

Here's what happens in this line:

df: This is the original DataFrame.

.copy(): This is a method in Pandas that creates a deep copy of the DataFrame. A deep copy means that a new copy of the data and the index is created, and changes made to the copy do not affect the original DataFrame, and vice versa.

df2 = ...: The result of the .copy() operation is assigned to the variable df2, so now df2 is an independent copy of df.

# ?

In [43]:
df2 = df.copy()
df2


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,3.5,3.5,1.4,0.2,setosa
1,3.0,3.0,1.4,0.2,setosa
2,3.2,3.2,1.3,0.2,setosa
3,3.1,3.1,1.5,0.2,setosa
4,3.6,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,3.0,3.0,5.2,2.3,virginica
146,2.5,2.5,5.0,1.9,virginica
147,3.0,3.0,5.2,2.0,virginica
148,3.4,3.4,5.4,2.3,virginica


The code df2[df2["petal_width"].isin(["two", "four"])] is using boolean indexing to filter rows in the DataFrame df2 based on whether the values in the "petal_width" column are either "two" or "four".

Let's break it down:

df2["petal_width"]: Selects the "petal_width" column from the DataFrame df2.

.isin(["two", "four"]): Checks whether each value in the "petal_width" column is either "two" or "four". This creates a boolean Series where each element is True if the condition is met and False otherwise.

df2[...]: Uses boolean indexing to filter rows from the DataFrame df2. Only the rows where the condition is True will be included in the result.

This boolean Series is then used for boolean indexing, filtering only the rows in the DataFrame where the condition is True. In the context of your original code (df2[df2["petal_width"].isin(["two", "four"])]), only the rows with "petal_width" values of "two" or "four" will be included in the result.

In [44]:
df2[df2["petal_width"].isin(["two", "four"])]
# filtered_df = df2[df2["petal_width"].isin(["two", "four"])]
# filtered_df
df2

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,3.5,3.5,1.4,0.2,setosa
1,3.0,3.0,1.4,0.2,setosa
2,3.2,3.2,1.3,0.2,setosa
3,3.1,3.1,1.5,0.2,setosa
4,3.6,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,3.0,3.0,5.2,2.3,virginica
146,2.5,2.5,5.0,1.9,virginica
147,3.0,3.0,5.2,2.0,virginica
148,3.4,3.4,5.4,2.3,virginica


In [45]:
df.info

<bound method DataFrame.info of      sepal_length  sepal_width  petal_length  petal_width    species
0             3.5          3.5           1.4          0.2     setosa
1             3.0          3.0           1.4          0.2     setosa
2             3.2          3.2           1.3          0.2     setosa
3             3.1          3.1           1.5          0.2     setosa
4             3.6          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           3.0          3.0           5.2          2.3  virginica
146           2.5          2.5           5.0          1.9  virginica
147           3.0          3.0           5.2          2.0  virginica
148           3.4          3.4           5.4          2.3  virginica
149           3.0          3.0           5.1          1.8  virginica

[150 rows x 5 columns]>

# Setting

In [46]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))

s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [47]:
df["sepal_length"] = s1
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [48]:
df.at["sepal_width"[0], "sepal_length"] = 0
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,,3.5,1.4,0.2,setosa
1,,3.0,1.4,0.2,setosa
2,,3.2,1.3,0.2,setosa
3,,3.1,1.5,0.2,setosa
4,,3.6,1.4,0.2,setosa
...,...,...,...,...,...
146,,2.5,5.0,1.9,virginica
147,,3.0,5.2,2.0,virginica
148,,3.4,5.4,2.3,virginica
149,,3.0,5.1,1.8,virginica


In [49]:
df.iat[0, 1] = 0
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,,0.0,1.4,0.2,setosa
1,,3.0,1.4,0.2,setosa
2,,3.2,1.3,0.2,setosa
3,,3.1,1.5,0.2,setosa
4,,3.6,1.4,0.2,setosa
...,...,...,...,...,...
146,,2.5,5.0,1.9,virginica
147,,3.0,5.2,2.0,virginica
148,,3.4,5.4,2.3,virginica
149,,3.0,5.1,1.8,virginica


In [50]:
df.loc[:, "petal_width"] = np.array([5] * len(df))
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,,0.0,1.4,5.0,setosa
1,,3.0,1.4,5.0,setosa
2,,3.2,1.3,5.0,setosa
3,,3.1,1.5,5.0,setosa
4,,3.6,1.4,5.0,setosa
...,...,...,...,...,...
146,,2.5,5.0,5.0,virginica
147,,3.0,5.2,5.0,virginica
148,,3.4,5.4,5.0,virginica
149,,3.0,5.1,5.0,virginica


In [None]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2


In [63]:
# # Assuming "column_name" is a column with numerical values
# df2["sepal_width"] = pd.to_numeric(df2["sepal_width"], errors="coerce")
# df2[df2 > 0] = -df2

# Assuming "sepal_width" is the column with mixed data types
# df2["sepal_width"] = pd.to_numeric(df2["sepal_width"], errors="coerce")

# # Now perform the negation operation
# df2[df2 > 0] = -df2


# Assuming "sepal_width" is the column with mixed data types
df2["sepal_width"] = pd.to_numeric(df2["sepal_width"], errors="coerce")

# Filter rows where "sepal_width" is greater than 0 and perform negation
mask = df2["sepal_width"] > 0
df2.loc[mask, "sepal_width"] = -df2.loc[mask, "sepal_width"]
df2




Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,data1,species1,column_name
0,,0.0,1.4,5.0,setosa,,0.0,0.0
1,,-3.0,1.4,5.0,setosa,,3.0,3.0
2,,-3.2,1.3,5.0,setosa,,3.2,3.2
3,,-3.1,1.5,5.0,setosa,,3.1,3.1
4,,-3.6,1.4,5.0,setosa,,3.6,3.6
...,...,...,...,...,...,...,...,...
146,,-2.5,5.0,5.0,virginica,,2.5,2.5
147,,-3.0,5.2,5.0,virginica,,3.0,3.0
148,,-3.4,5.4,5.0,virginica,,3.4,3.4
149,,-3.0,5.1,5.0,virginica,,3.0,3.0


# Missing data

In [67]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])

df1.loc[dates[0] : dates[1], "E"] = 1

df1

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,E
2013-01-01,,,,,,1.0
2013-01-02,,,,,,1.0
2013-01-03,,,,,,
2013-01-04,,,,,,
