In [9]:
import pandas as pd
import numpy as np

In [3]:
# Build a DataFrame from a list of rows: each inner list is one record and
# the column labels are supplied separately.
data = [
    [1, 2, "A"],
    [3, 4, "B"],
]

df = pd.DataFrame(data=data, columns=["col1", "col2", "col3"])
print(df)

   col1  col2 col3
0     1     2    A
1     3     4    B


In [4]:
# Build a DataFrame from a mapping of column name -> column values.
data = dict(
    col1=[1, 2],
    col2=[3, 4],
    col3=["A", "B"],
)

df = pd.DataFrame(data)
print(df)

   col1  col2 col3
0     1     3    A
1     2     4    B


In [5]:
# Show the frame, then its dimensions as a (rows, columns) tuple.
print(df)

print("Shape:", df.shape)

   col1  col2 col3
0     1     3    A
1     2     4    B
Shape: (2, 3)


In [6]:
# Preview at most the first five rows (this frame only has two).
print(df.head(5))

   col1  col2 col3
0     1     3    A
1     2     4    B


In [7]:
# Inspect the dtype of every column.
df.dtypes

col1     int64
col2     int64
col3    object
dtype: object

In [10]:
# Downcast col1 to an 8-bit integer, then list the per-column dtypes to
# confirm the conversion.
df["col1"] = df["col1"].astype("int8")

print(df.dtypes)

col1      int8
col2     int64
col3    object
dtype: object


In [11]:
# Print a structural summary: index range, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    2 non-null      int8  
 1   col2    2 non-null      int64 
 2   col3    2 non-null      object
dtypes: int64(1), int8(1), object(1)
memory usage: 162.0+ bytes


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max); the output covers
# the numeric columns only.
print(df.describe())

           col1      col2
count  2.000000  2.000000
mean   1.500000  3.500000
std    0.707107  0.707107
min    1.000000  3.000000
25%    1.250000  3.250000
50%    1.500000  3.500000
75%    1.750000  3.750000
max    2.000000  4.000000


In [13]:
# A frame with one missing value in col1; the NaN makes that column float.
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [np.nan, 4, "B"],
    ],
    columns=["col1", "col2", "col3"],
)
print(df)

   col1  col2 col3
0   1.0     2    A
1   NaN     4    B


In [14]:
# Replace every missing value with 0.
# fillna returns a new frame, so rebind the name rather than mutating the
# existing object with inplace=True: the cell reads as a plain assignment and
# the operation can be chained with other DataFrame methods.
df = df.fillna(0)
print(df)

   col1  col2 col3
0   1.0     2    A
1   0.0     4    B


In [15]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "B"],
    ],
    columns=["col1", "col2", "col3"],
)

# Print the rows ordered by ascending col1; df itself is left unsorted.
print(df.sort_values(by="col1"))

   col1  col2 col3
0     1     2    A
2     3    10    B
1     5     8    B


In [16]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "B"],
    ],
    columns=["col1", "col2", "col3"],
)

# Aggregate per col3 group: the total of col1 and the largest col2.
# The reducers are given by their string names instead of the Python builtins
# sum/max: recent pandas releases emit a FutureWarning when the builtins are
# passed to agg and will change how they are applied, whereas the string
# names always select the pandas groupby reductions.
col3_summary = df.groupby("col3").agg({"col1": "sum", "col2": "max"})
col3_summary

Unnamed: 0_level_0,col1,col2
col3,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1,2
B,8,10


In [17]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "B"],
    ],
    columns=["col_A", "col2", "col3"],
)

# Display a copy with col_A relabelled as col1. The result is not assigned,
# so df keeps its original column names.
df.rename(columns={"col_A": "col1"})

Unnamed: 0,col1,col2,col3
0,1,2,A
1,5,8,B
2,3,10,B


In [18]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "B"],
    ],
    columns=["col1", "col2", "col3"],
)

# Print the frame without col1; df itself still holds all three columns.
print(df.drop(labels=["col1"], axis="columns"))

   col2 col3
0     2    A
1     8    B
2    10    B


In [19]:
df = pd.DataFrame(
    data=[
        [1, 2],
        [3, 4],
    ],
    columns=["col1", "col2"],
)

# Add col3 in place as the element-wise sum of the two existing columns.
df["col3"] = df["col1"] + df["col2"]
print(df)

   col1  col2  col3
0     1     2     3
1     3     4     7


In [20]:
df = pd.DataFrame(
    data=[
        [1, 2],
        [3, 4],
    ],
    columns=["col1", "col2"],
)

# Same derived column as a plain item assignment, but built with assign(),
# which returns a new frame that is then bound back to df.
df = df.assign(col3=lambda frame: frame["col1"] + frame["col2"])

print(df)

   col1  col2  col3
0     1     2     3
1     3     4     7


In [21]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "B"],
    ],
    columns=["col1", "col2", "col3"],
)

# Keep only the rows whose col2 value exceeds 5 (boolean-mask selection).
print(df.loc[df["col2"] > 5])

   col1  col2 col3
1     5     8    B
2     3    10    B


In [22]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "C"],
    ],
    columns=["col1", "col2", "col3"],
)

# Keep only the rows whose col3 value is one of the listed labels.
filter_list = ["A", "C"]
print(df.loc[df["col3"].isin(filter_list)])

   col1  col2 col3
0     1     2    A
2     3    10    C


In [23]:
# Select a single column by label; the result is displayed as a Series.
df["col1"]

0    1
1    5
2    3
Name: col1, dtype: int64

In [24]:
# One row per student (used as the index labels), one column per subject.
df = pd.DataFrame(
    data=[
        [6, 5, 10],
        [5, 8, 6],
        [3, 10, 4],
    ],
    index=["John", "Mark", "Peter"],
    columns=["Maths", "Science", "English"],
)

print(df)

       Maths  Science  English
John       6        5       10
Mark       5        8        6
Peter      3       10        4


In [25]:
# Label-based lookup: fetch the row whose index label is "John".
df.loc["John"]

Maths       6
Science     5
English    10
Name: John, dtype: int64

In [26]:
# Label-based lookup of one row, restricted to the listed columns.
df.loc["Mark", ["Maths", "English"]]

Maths      5
English    6
Name: Mark, dtype: int64

In [27]:
# Position-based lookup: fetch the first row regardless of its index label.
df.iloc[0]

Maths       6
Science     5
English    10
Name: John, dtype: int64

In [28]:
df = pd.DataFrame(
    data=[
        [1, 2, "A"],
        [5, 8, "B"],
        [3, 10, "A"],
    ],
    columns=["col1", "col2", "col3"],
)

# List the distinct values of col3 in order of first appearance.
df["col3"].unique()

array(['A', 'B'], dtype=object)

In [29]:
# Count the distinct values in col3.
df["col3"].nunique()

2

In [30]:
def add_cols(row):
    """Return the sum of the ``col1`` and ``col2`` attributes of ``row``."""
    total = row.col1 + row.col2
    return total


df = pd.DataFrame(
    data=[
        [1, 2],
        [5, 8],
        [3, 9],
    ],
    columns=["col1", "col2"],
)

# axis=1 hands each row to add_cols, so col3 is computed row by row.
df["col3"] = df.apply(add_cols, axis=1)
print(df)

   col1  col2  col3
0     1     2     3
1     5     8    13
2     3     9    12


In [31]:
def square_col(num):
    """Return ``num`` raised to the power of two."""
    return pow(num, 2)


df = pd.DataFrame(
    data=[
        [1, 2],
        [5, 8],
        [3, 9],
    ],
    columns=["col1", "col2"],
)

# Series.apply calls square_col once per element of col1.
df["col3"] = df["col1"].apply(square_col)
print(df)

   col1  col2  col3
0     1     2     1
1     5     8    25
2     3     9     9


In [32]:
df = pd.DataFrame(
    data=[
        [1, "A"],
        [2, "B"],
        [1, "A"],
    ],
    columns=["col1", "col2"],
)

# Flag every row that has an identical twin; keep=False marks all copies,
# including the first occurrence.
df.duplicated(keep=False)

0     True
1    False
2     True
dtype: bool

In [33]:
df = pd.DataFrame(
    data=[
        [1, "A"],
        [2, "B"],
        [1, "A"],
    ],
    columns=["col1", "col2"],
)

# Print the frame with repeated rows removed; df itself keeps all three rows.
print(df.drop_duplicates())

   col1 col2
0     1    A
1     2    B


In [34]:
df = pd.DataFrame(
    data=[
        [1, "A"],
        [2, "B"],
        [1, "A"],
    ],
    columns=["col1", "col2"],
)

# Count how many rows carry each distinct col2 value.
print(df.value_counts(subset="col2"))

col2
A    2
B    1
dtype: int64


In [35]:
# A frame whose index labels are deliberately out of order.
df = pd.DataFrame(
    data=[
        [6, 5, 10],
        [5, 8, 6],
        [3, 10, 4],
    ],
    index=[2, 3, 1],
    columns=["col1", "col2", "col3"],
)

# Print a copy with a fresh 0..n-1 index; the old labels move into an
# "index" column. df itself keeps its original index.
print(df.reset_index())

   index  col1  col2  col3
0      2     6     5    10
1      3     5     8     6
2      1     3    10     4


In [36]:
# Display a copy with a fresh 0..n-1 index, discarding the old labels instead
# of keeping them as a column. The result is not assigned, so df is unchanged.
df.reset_index(drop=True)

Unnamed: 0,col1,col2,col3
0,6,5,10
1,5,8,6
2,3,10,4


In [37]:
df = pd.DataFrame(
    data=[
        ["A", "X"],
        ["B", "Y"],
        ["C", "X"],
        ["A", "X"],
    ],
    columns=["col1", "col2"],
)

# Frequency table: rows are col1 values, columns are col2 values, and each
# cell counts how often that pair occurs.
print(pd.crosstab(index=df["col1"], columns=df["col2"]))

col2  X  Y
col1      
A     2  0
B     0  1
C     1  0
