# Pandas Day 1

## Importing Libraries


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## How to get Pandas Version

In [3]:
# Show the installed pandas version, followed by a divider line
print(pd.__version__, "-------------------------", sep="\n")
# Report the versions of pandas' dependencies together with platform details
pd.show_versions()

2.2.3
-------------------------

INSTALLED VERSIONS
------------------
commit                : 0691c5cf90477d3503834d983f69350f250a6ff7
python                : 3.13.0
python-bits           : 64
OS                    : Windows
OS-release            : 10
Version               : 10.0.19045
machine               : AMD64
processor             : Intel64 Family 6 Model 69 Stepping 1, GenuineIntel
byteorder             : little
LC_ALL                : None
LANG                  : None
LOCALE                : English_United States.1252

pandas                : 2.2.3
numpy                 : 2.1.3
pytz                  : 2024.2
dateutil              : 2.9.0.post0
pip                   : 25.1.1
Cython                : None
sphinx                : None
IPython               : 8.29.0
adbc-driver-postgresql: None
adbc-driver-sqlite    : None
bs4                   : None
blosc                 : None
bottleneck            : None
dataframe-api-compat  : None
fastparquet           : None
fsspec          

# How to Make a DataFrame

In [4]:
# Each key becomes a column name; each list holds that column's values
data = dict(
    Asad=["Range Rover Sportage", "Lamborghini Urus", "Rolls Royce Cullinan"],
    Adnan=["Honda Civic", "Honda Accord", "Honda Civic"],
    Aly=["Toyota Corolla", "Toyota Camry", "Toyota Corolla"],
)
# Build the DataFrame from the column dictionary
df = pd.DataFrame(data=data)
# Preview the first rows of the DataFrame
df.head(n=5)

Unnamed: 0,Asad,Adnan,Aly
0,Range Rover Sportage,Honda Civic,Toyota Corolla
1,Lamborghini Urus,Honda Accord,Toyota Camry
2,Rolls Royce Cullinan,Honda Civic,Toyota Corolla


In [5]:
# Build a DataFrame from a dictionary of NumPy arrays (one array per column)
data2 = {
    "Name": np.array(["Asad", "Adnan", "Aly"]),
    "Car": np.array(["Range Rover Sportage", "Honda Civic", "Toyota Corolla"]),
    "Price": np.array([100000, 20000, 30000]),
}
df2 = pd.DataFrame(data2)
print(df2)
print("-------------------------")
# Build a 5x5 DataFrame of random floats in [0, 1).
# A seeded generator is used so the same values are produced on every run
# (Restart Kernel -> Run All stays reproducible).
df3 = pd.DataFrame(np.random.default_rng(seed=42).random((5, 5)), columns=list("ABCDE"))
print(df3)


    Name                   Car   Price
0   Asad  Range Rover Sportage  100000
1  Adnan           Honda Civic   20000
2    Aly        Toyota Corolla   30000
-------------------------
          A         B         C         D         E
0  0.861060  0.135619  0.975452  0.126621  0.970448
1  0.155828  0.476064  0.775254  0.459984  0.936769
2  0.174714  0.742768  0.687723  0.474426  0.050284
3  0.053857  0.947160  0.301080  0.869092  0.510453
4  0.211571  0.033204  0.670664  0.533778  0.676907


## How to Rename Columns


In [6]:
# Preview the DataFrame again before renaming its columns
df.head(n=5)

Unnamed: 0,Asad,Adnan,Aly
0,Range Rover Sportage,Honda Civic,Toyota Corolla
1,Lamborghini Urus,Honda Accord,Toyota Camry
2,Rolls Royce Cullinan,Honda Civic,Toyota Corolla


In [7]:
# Column names before renaming
print(df.columns)
print("----------------")
# Map each old column name to its new name. rename() returns a new DataFrame,
# which is assigned back to df rather than mutating the object with inplace=True.
df = df.rename(
    columns={
        "Asad": "Asad_Dream_Cars",
        "Adnan": "Adnan_Dream_Cars",
        "Aly": "Aly_Dream_Cars",
    }
)
print(df.head())
# Column names after renaming
print(df.columns)


Index(['Asad', 'Adnan', 'Aly'], dtype='object')
----------------
        Asad_Dream_Cars Adnan_Dream_Cars  Aly_Dream_Cars
0  Range Rover Sportage      Honda Civic  Toyota Corolla
1      Lamborghini Urus     Honda Accord    Toyota Camry
2  Rolls Royce Cullinan      Honda Civic  Toyota Corolla
Index(['Asad_Dream_Cars', 'Adnan_Dream_Cars', 'Aly_Dream_Cars'], dtype='object')


# How to Use Built-in Sample Datasets

In [8]:
# Remove the row limit so pandas prints every row of a DataFrame or Series
pd.options.display.max_rows = None

In [9]:
# Load the Titanic dataset through seaborn and keep a CSV copy (without the row index)
df = sns.load_dataset('titanic')
df.to_csv('titanic.csv', index=False)
# Top 5 rows of the Titanic dataset
print("Titanic Dataset")
print(df.head(5))
print("---------------------------")
# Load the Iris dataset through seaborn; from here on df refers to Iris, not Titanic
print("Iris Dataset")
df = sns.load_dataset("iris")
df.to_csv('iris.csv', index=False)
# Last 5 rows of the Iris dataset
print(df.tail(5))
print("-------------------------")
# Statistical summary of the Iris dataset
print(df.describe())
print("-------------------------")
# Column names of the Iris dataset
print(df.columns)
print("-------------------------")
# Save the Iris dataset to a CSV file. index=False keeps the row index out of the
# file; otherwise the read_csv() call below would return it as an extra
# "Unnamed: 0" column.
# NOTE: on a case-insensitive filesystem "Iris.csv" and 'iris.csv' are the same file.
df.to_csv("Iris.csv", index=False)
# Save the Iris dataset to an Excel file (the row index is written as the first column)
df.to_excel("Iris.xlsx")
# Read the Iris dataset back from the CSV file
df = pd.read_csv("Iris.csv")



Titanic Dataset
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
---------------------------
Iris Dataset
     sepal_length  sepal_width  petal_length  petal_width    species
145           6.7          3.0           5.2          2.3  virginica
146           6.3       

# Reverse Row Order

In [10]:
# Reload the Titanic dataset and show its first rows
df = sns.load_dataset(name="titanic")
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [11]:
# Reverse the row order with a negative-step slice; the original index labels are kept
df = df.iloc[::-1]
print(df.head(n=5))

     survived  pclass     sex   age  sibsp  parch   fare embarked   class  \
890         0       3    male  32.0      0      0   7.75        Q   Third   
889         1       1    male  26.0      0      0  30.00        C   First   
888         0       3  female   NaN      1      2  23.45        S   Third   
887         1       1  female  19.0      0      0  30.00        S   First   
886         0       2    male  27.0      0      0  13.00        S  Second   

       who  adult_male deck  embark_town alive  alone  
890    man        True  NaN   Queenstown    no   True  
889    man        True    C    Cherbourg   yes   True  
888  woman       False  NaN  Southampton    no  False  
887  woman       False    B  Southampton   yes   True  
886    man        True  NaN  Southampton    no   True  


In [12]:
# Reverse the rows once more (undoing the earlier reversal) and replace the
# index with a fresh 0..n-1 range, discarding the old labels
df = df.iloc[::-1].reset_index(drop=True)
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [13]:
# Reload the dataset so the column reversal starts from the original layout
df = sns.load_dataset(name="titanic")
# A negative-step slice on the column axis reverses the column order
df = df.iloc[:, ::-1]
print(df.head(n=5))

   alone alive  embark_town deck  adult_male    who  class embarked     fare  \
0  False    no  Southampton  NaN        True    man  Third        S   7.2500   
1  False   yes    Cherbourg    C       False  woman  First        C  71.2833   
2   True   yes  Southampton  NaN       False  woman  Third        S   7.9250   
3  False   yes  Southampton    C       False  woman  First        S  53.1000   
4   True    no  Southampton  NaN        True    man  Third        S   8.0500   

   parch  sibsp   age     sex  pclass  survived  
0      0      1  22.0    male       3         0  
1      0      1  38.0  female       1         1  
2      0      0  26.0  female       3         1  
3      0      1  35.0  female       1         1  
4      0      0  35.0    male       3         0  


# Select Columns from a Dataset Based on Data Type

In [14]:
# Reload the Titanic dataset and preview it
df = sns.load_dataset(name="titanic")
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [15]:
# Keep only the numeric columns (integers and floats); df is narrowed to that subset
df = df.select_dtypes(include="number")
df.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [16]:
# Reload the full dataset, since df may have been narrowed to a column subset earlier
df = sns.load_dataset(name="titanic")
# Keep only the columns whose dtype is object
df = df.select_dtypes(include="object")
df.columns

Index(['sex', 'embarked', 'who', 'embark_town', 'alive'], dtype='object')

In [17]:
# Reload the full dataset, since df may have been narrowed to a column subset earlier
df = sns.load_dataset(name="titanic")
# Drop every numeric and every object column, keeping whatever dtypes remain
df = df.select_dtypes(exclude=["object", "number"])
df.columns

Index(['class', 'adult_male', 'deck', 'alone'], dtype='object')

In [18]:
# Reload the full dataset, then keep only the boolean columns
df = sns.load_dataset(name="titanic")
df = df.select_dtypes(include="bool")
df.columns

Index(['adult_male', 'alone'], dtype='object')

# Convert Strings into Numbers

In [19]:
# Col_A holds real integers; Col_B holds the numbers 6..10 stored as text
df = pd.DataFrame(
    {
        "Col_A": list(range(1, 6)),
        "Col_B": [str(value) for value in range(6, 11)],
    }
)
df.head(n=5)

Unnamed: 0,Col_A,Col_B
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [20]:
# Show the dtype of every column in df
df.dtypes

Col_A     int64
Col_B    object
dtype: object

In [21]:
# Parse the text values in Col_B into numbers
df["Col_B"] = pd.to_numeric(arg=df["Col_B"])
# Re-store Col_A with the generic object dtype
# (the values themselves are not turned into strings)
df["Col_A"] = df["Col_A"].astype(object)
# Confirm the dtypes after conversion
print(df.dtypes)

Col_A    object
Col_B     int64
dtype: object


# How to check Dataset Shape and Dimensions

In [22]:
df = sns.load_dataset(name="titanic")
# shape is a (rows, columns) tuple
print("Shape of the DataFrame:", df.shape)
# info() prints the index range, per-column non-null counts and dtypes, and memory usage
print("Information about the DataFrame:")
df.info()
# Row and column counts reported separately
print("Number of rows:", len(df.index))
print("Number of columns:", len(df.columns))

Shape of the DataFrame: (891, 15)
Information about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
Number of ro

# How to Copy Data to the Clipboard in Python

In [23]:


# Small example frame to send through the clipboard
df = pd.DataFrame(
    data={
        "Name": ["Alice", "Bob", "Charlie"],
        "Age": [25, 30, 35],
    }
)

# Place the frame on the system clipboard as text, leaving out the row index
df.to_clipboard(index=False)

# The clipboard contents can now be pasted into Excel or Notepad.
# Parse the clipboard text back into a new DataFrame:
df_from_clipboard = pd.read_clipboard()
print(df_from_clipboard)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


# Split a Dataset into Two Subsets

In [24]:
df = sns.load_dataset(name="titanic")
# Preview the first rows
print(df.head(n=5))
# (rows, columns) of the full dataset, for comparison with the two subsets
print("Shape of the DataFrame:", df.shape)

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
Shape of the DataFrame: (891, 15)


In [25]:
# Split the DataFrame into two non-overlapping random halves.
# (The unused mid-notebook `from random import random` was removed; nothing in
# this cell needs it.)
# Randomly sample 50% of the rows; random_state fixes the seed so the split is repeatable
df1 = df.sample(frac=0.5, random_state=1)  # First half
# Everything that was not sampled: drop the rows whose index labels are in df1
df2 = df.drop(df1.index)  # Second half
# Show the size and the first rows of each half
print("First Half of DataFrame:")
print(df1.shape)
print(df1.head())

print("Second Half of DataFrame:")
print(df2.shape)
print(df2.head())



First Half of DataFrame:
(446, 15)
     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
862         1       1  female  48.0      0      0  25.9292        S   First   
223         0       3    male   NaN      0      0   7.8958        S   Third   
84          1       2  female  17.0      0      0  10.5000        S  Second   
680         0       3  female   NaN      0      0   8.1375        Q   Third   
535         1       2  female   7.0      0      2  26.2500        S  Second   

       who  adult_male deck  embark_town alive  alone  
862  woman       False    D  Southampton   yes   True  
223    man        True  NaN  Southampton    no   True  
84   woman       False  NaN  Southampton   yes   True  
680  woman       False  NaN   Queenstown    no   True  
535  child       False  NaN  Southampton   yes  False  
Second Half of DataFrame:
(445, 15)
    survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
1          1       1  female  38.0      

# Joining Two Datasets


In [26]:
# Stack the two halves back into one DataFrame; ignore_index=True renumbers the rows 0..n-1
dff = pd.concat([df1, df2], ignore_index=True)
# Shape of the concatenated DataFrame
print("Concatenated DataFrame Shape:", dff.shape)
# Print the label on its own line so the frame's header row lines up with its columns
print("Concatenated DataFrame:")
print(dff.head())

Concatenated DataFrame Shape: (891, 15)
Concatenated DataFrame:    survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0         1       1  female  48.0      0      0  25.9292        S   First   
1         0       3    male   NaN      0      0   7.8958        S   Third   
2         1       2  female  17.0      0      0  10.5000        S  Second   
3         0       3  female   NaN      0      0   8.1375        Q   Third   
4         1       2  female   7.0      0      2  26.2500        S  Second   

     who  adult_male deck  embark_town alive  alone  
0  woman       False    D  Southampton   yes   True  
1    man        True  NaN  Southampton    no   True  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False  NaN   Queenstown    no   True  
4  child       False  NaN  Southampton   yes  False  


# Filtering in a Dataset

In [27]:
# Reload the Titanic dataset for the filtering examples
df = sns.load_dataset(name="titanic")
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [28]:
# List the distinct values found in the sex column
df.sex.unique()

array(['male', 'female'], dtype=object)

In [29]:
# A boolean mask keeps only the rows whose sex is "male"
df_males = df.loc[df["sex"].eq("male")]
print(df_males.head(n=5))

   survived  pclass   sex   age  sibsp  parch     fare embarked  class    who  \
0         0       3  male  22.0      1      0   7.2500        S  Third    man   
4         0       3  male  35.0      0      0   8.0500        S  Third    man   
5         0       3  male   NaN      0      0   8.4583        Q  Third    man   
6         0       1  male  54.0      0      0  51.8625        S  First    man   
7         0       3  male   2.0      3      1  21.0750        S  Third  child   

   adult_male deck  embark_town alive  alone  
0        True  NaN  Southampton    no  False  
4        True  NaN  Southampton    no   True  
5        True  NaN   Queenstown    no   True  
6        True    E  Southampton    no   True  
7       False  NaN  Southampton    no  False  


In [30]:
# Passengers whose embark_town is Southampton (first rows only)
df.loc[df["embark_town"].eq("Southampton")].head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True


In [31]:
# Female passengers aged exactly 35 who embarked at Southampton or Queenstown.
# The three conditions are combined element-wise with &.
df.loc[
    df["sex"].eq("female")
    & df["age"].eq(35)
    & df["embark_town"].isin(["Southampton", "Queenstown"])
]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
211,1,2,female,35.0,0,0,21.0,S,Second,woman,False,,Southampton,yes,True
230,1,1,female,35.0,1,0,83.475,S,First,woman,False,C,Southampton,yes,False
269,1,1,female,35.0,0,0,135.6333,S,First,woman,False,C,Southampton,yes,True
279,1,3,female,35.0,1,1,20.25,S,Third,woman,False,,Southampton,yes,False
383,1,1,female,35.0,1,0,52.0,S,First,woman,False,,Southampton,yes,False
486,1,1,female,35.0,1,0,90.0,S,First,woman,False,C,Southampton,yes,False


In [32]:
# Number of passengers per embarkation town
df["embark_town"].value_counts()

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64

In [33]:
# Number of passengers per sex
df["sex"].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

# Splitting a string into different columns

In [34]:
# Two text columns whose values each consist of two space-separated words
df = pd.DataFrame(
    {
        "name": [
            "Asad Ali",
            "Ali Zain",
            "Faisal Iqbal",
            "Iqbal Akram",
            "Shoaib Idrees",
        ],
        "location": [
            "Okara Pakistan",
            "Nankana Pakistan",
            "Pattoki Pakistan",
            "Lahore Pakistan",
            "Bseerpur Pakistan",
        ],
    }
)
df

Unnamed: 0,name,location
0,Asad Ali,Okara Pakistan
1,Ali Zain,Nankana Pakistan
2,Faisal Iqbal,Pattoki Pakistan
3,Iqbal Akram,Lahore Pakistan
4,Shoaib Idrees,Bseerpur Pakistan


In [35]:
# Split each full name at the first space into first-name and last-name columns
df[["First_name", "Last_name"]] = df["name"].str.split(pat=" ", n=1, expand=True)
# Split each location at the first space into city and country columns
df[["City", "Country"]] = df["location"].str.split(pat=" ", n=1, expand=True)
df


Unnamed: 0,name,location,First_name,Last_name,City,Country
0,Asad Ali,Okara Pakistan,Asad,Ali,Okara,Pakistan
1,Ali Zain,Nankana Pakistan,Ali,Zain,Nankana,Pakistan
2,Faisal Iqbal,Pattoki Pakistan,Faisal,Iqbal,Pattoki,Pakistan
3,Iqbal Akram,Lahore Pakistan,Iqbal,Akram,Lahore,Pakistan
4,Shoaib Idrees,Bseerpur Pakistan,Shoaib,Idrees,Bseerpur,Pakistan


In [36]:
# Keep only the four derived columns, leaving out the original name and location columns
df = df.loc[:, ["First_name", "Last_name", "City", "Country"]]
print(df.head(n=5))

  First_name Last_name      City   Country
0       Asad       Ali     Okara  Pakistan
1        Ali      Zain   Nankana  Pakistan
2     Faisal     Iqbal   Pattoki  Pakistan
3      Iqbal     Akram    Lahore  Pakistan
4     Shoaib    Idrees  Bseerpur  Pakistan


# Aggregate by multiple groups/functions

In [37]:
# Reload the Titanic dataset for the grouping examples
df = sns.load_dataset(name="titanic")
# Preview the first rows
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [38]:
# For each value of "who", count the non-null entries in every other column
df.groupby(by="who").count()

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
child,83,83,83,83,83,83,83,83,83,83,13,83,83,83
man,537,537,537,413,537,537,537,537,537,537,99,537,537,537
woman,271,271,271,218,271,271,271,269,271,271,91,269,271,271


In [39]:
# For each (sex, embark_town) combination, count the non-null entries
# in every other column
print(df.groupby(by=["sex", "embark_town"]).count())

                    survived  pclass  age  sibsp  parch  fare  embarked  \
sex    embark_town                                                        
female Cherbourg          73      73   61     73     73    73        73   
       Queenstown         36      36   12     36     36    36        36   
       Southampton       203     203  186    203    203   203       203   
male   Cherbourg          95      95   69     95     95    95        95   
       Queenstown         41      41   16     41     41    41        41   
       Southampton       441     441  368    441    441   441       441   

                    class  who  adult_male  deck  alive  alone  
sex    embark_town                                              
female Cherbourg       73   73          73    37     73     73  
       Queenstown      36   36          36     2     36     36  
       Southampton    203  203         203    56    203    203  
male   Cherbourg       95   95          95    32     95     95  
       Qu

# How to select specific rows and columns

In [40]:
# Preview the dataset before selecting rows and columns from it
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [41]:
# Select the sex and age columns by name and preview the first rows
df.loc[:, ["sex", "age"]].head(n=5)

Unnamed: 0,sex,age
0,male,22.0
1,female,38.0
2,female,26.0
3,female,35.0
4,male,35.0


In [42]:
# From the summary table of the numeric columns, keep only the "min" and "max" rows
df.describe().loc[["min", "max"], :]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
min,0.0,1.0,0.42,0.0,0.0,0.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [43]:
# Print the statistical summary produced by describe() for df
print(df.describe())

         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [44]:
# Slice the summary table by row label: every row from "min" through "max" inclusive
print(df.describe().loc["min":"max", :])

     survived  pclass     age  sibsp  parch      fare
min       0.0     1.0   0.420    0.0    0.0    0.0000
25%       0.0     2.0  20.125    0.0    0.0    7.9104
50%       0.0     3.0  28.000    0.0    0.0   14.4542
75%       1.0     3.0  38.000    1.0    0.0   31.0000
max       1.0     3.0  80.000    8.0    6.0  512.3292


# Reshape Multi Index Series

In [45]:
# Reload the Titanic dataset through seaborn and show its first rows
df = sns.load_dataset(name="titanic")
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [47]:
# Mean of the 'survived' column over all passengers
print("Mean of survived column:", df["survived"].mean())

# Mean of 'survived' for male passengers only, selected with a single .loc lookup
# (row mask + column label) instead of chained indexing
print("Mean of survived column of males:", df.loc[df["sex"] == "male", "survived"].mean())

# Mean of 'survived' for every (sex, pclass) pair; the result is a Series with a
# two-level index. The label is printed on its own line so the index header stays
# aligned with the values below it.
print("Mean survival rate by sex and class:")
print(df.groupby(["sex", "pclass"])["survived"].mean())

Mean of survived column: 0.3838383838383838
Mean of survived column of males: 0.18890814558058924
Mean survival rate by sex and class: sex     pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: survived, dtype: float64


# Numeric to Categoric Feature Conversion

In [49]:
# First values of the numeric age column, before it is binned into categories
print(df["age"].head(n=5))

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64


In [57]:
# Bucket the ages using the edges 0, 18, 25, 50 and 99, label the four resulting
# ranges, and store the result in a new 'new_age' column
df["new_age"] = pd.cut(
    x=df["age"],
    bins=[0, 18, 25, 50, 99],
    labels=["Child", "Young_Adult", "Adult", "Old"],
)
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone      new_age  
0    man        True  NaN  Southampton    no  False  Young_Adult  
1  woman       False    C    Cherbourg   yes  False        Adult  
2  woman       False  NaN  Southampton   yes   True        Adult  
3  woman       False    C  Southampton   yes  False        Adult  
4    man        True  NaN  Southampton    no   True        Adult  


In [58]:
# Count how many passengers fall into each new_age category.
# The printed label was corrected: value_counts() reports counts, not sums.
print("Unique values and their counts in new_age column:")
print(df["new_age"].value_counts())

Unique values and thier sum in new_age column:
new_age
Adult          349
Young_Adult    162
Child          139
Old             64
Name: count, dtype: int64


# How to Convert One Set of Values into Another

In [60]:
# Look at the first few values of the sex column in the DataFrame
print(df.sex.head(n=5))

0      male
1    female
2    female
3    female
4      male
Name: sex, dtype: object


In [62]:
# Encode sex as an integer in a new 'sex_num' column: male -> 0, female -> 1
df["sex_num"] = df["sex"].map(dict(male=0, female=1))
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone      new_age  sex_num  
0    man        True  NaN  Southampton    no  False  Young_Adult        0  
1  woman       False    C    Cherbourg   yes  False        Adult        1  
2  woman       False  NaN  Southampton   yes   True        Adult        1  
3  woman       False    C  Southampton   yes  False        Adult        1  
4    man        True  NaN  Southampton    no   True        Adult        0  


In [63]:
# List the distinct values found in the embark_town column
print(df.embark_town.unique())

['Southampton' 'Cherbourg' 'Queenstown' nan]


In [65]:
# factorize() gives each distinct embark_town value an integer code; element [0]
# of its result is the array of codes, stored here in a new 'embark_town_value'
# column. Missing values receive the code -1.
df["embark_town_value"] = df["embark_town"].factorize()[0]
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone      new_age  sex_num  \
0    man        True  NaN  Southampton    no  False  Young_Adult        0   
1  woman       False    C    Cherbourg   yes  False        Adult        1   
2  woman       False  NaN  Southampton   yes   True        Adult        1   
3  woman       False    C  Southampton   yes  False        Adult        1   
4    man        True  NaN  Southampton    no   True        Adult        0   

   embark_town_value  
0                  0  
1                  1  
2          

In [67]:
# Compare how often each town name occurs with how often its numeric code occurs
print(df["embark_town"].value_counts(), df["embark_town_value"].value_counts(), sep="\n")

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64
embark_town_value
 0    644
 1    168
 2     77
-1      2
Name: count, dtype: int64
