## Basic Python Pandas

In [2]:
import pandas as pd

### Series

In [3]:
# A series is a one dimensional labeled array capable of holding any datatype
series_data = pd.Series([1,2,3,4,5])
series_data

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Data Frame

In [4]:
# Data Frame
# Creating a dataframe
data = {
    'Name': ["Aditya", "Jay", "Niharika"],
    'Age': [19,19,21]
}

# converting the data into dataframe
data_frame = pd.DataFrame(data)
data_frame

Unnamed: 0,Name,Age
0,Aditya,19
1,Jay,19
2,Niharika,21


### Basic Operations

In [7]:
#Reading csv file

df = pd.read_csv('Data/titanic_test.csv')

# Viewing first 5 records of data
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Viewing last 5 records of the data
df.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [9]:
# Information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [10]:
# Indexing and selecting a specific column from dataframe
df['Name']

0                                  Kelly, Mr. James
1                  Wilkes, Mrs. James (Ellen Needs)
2                         Myles, Mr. Thomas Francis
3                                  Wirz, Mr. Albert
4      Hirvonen, Mrs. Alexander (Helga E Lindqvist)
                           ...                     
413                              Spector, Mr. Woolf
414                    Oliva y Ocana, Dona. Fermina
415                    Saether, Mr. Simon Sivertsen
416                             Ware, Mr. Frederick
417                        Peter, Master. Michael J
Name: Name, Length: 418, dtype: object

In [12]:
# Selecting multiple columns
df[['Name','Age']]

Unnamed: 0,Name,Age
0,"Kelly, Mr. James",34.5
1,"Wilkes, Mrs. James (Ellen Needs)",47.0
2,"Myles, Mr. Thomas Francis",62.0
3,"Wirz, Mr. Albert",27.0
4,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0
...,...,...
413,"Spector, Mr. Woolf",
414,"Oliva y Ocana, Dona. Fermina",39.0
415,"Saether, Mr. Simon Sivertsen",38.5
416,"Ware, Mr. Frederick",


In [14]:
# Selecting rows by index
df.iloc[0] # This will display the first row 
df.iloc[0:2] # This will display the first two rows that is 0th and 1st

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


### Data Cleaning

Handling Missing values

In [15]:
# Checking for missing data in the dataset
df.isnull().sum() # this will show the null values with respected to each column

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [26]:
# removing missing values

# dropping based on axis i.e. columns and rows 
df_column_drop = df.dropna(axis='columns') # this will drop the column if there is atleast 1 missing value in the column
df_column_drop

# dropping based on the threshold value
df_thresh_drop = df.dropna(thresh = 2, axis = 'rows') # this will drop the rows where at least 2 missing values are there
df_thresh_drop

# dropping all the na values 
df_drop_all = df.dropna()
df_drop_all


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,B45,S
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.1750,E31,S
24,916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48.0,1,3,PC 17608,262.3750,B57 B59 B63 B66,C
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C
28,920,1,"Brady, Mr. John Bertram",male,41.0,0,0,113054,30.5000,A21,S
...,...,...,...,...,...,...,...,...,...,...,...
404,1296,1,"Frauenthal, Mr. Isaac Gerald",male,43.0,1,0,17765,27.7208,D40,C
405,1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20.0,0,0,SC/PARIS 2166,13.8625,D38,C
407,1299,1,"Widener, Mr. George Dunton",male,50.0,1,1,113503,211.5000,C80,C
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q


In [28]:
# Filling missing values 
# replacing all the missing values with the value 0
df.fillna(0, inplace=True)

Checking the missing values

In [29]:
df.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Checking for duplicate data

In [32]:
# checking for duplicate values in the dataset
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
413    False
414    False
415    False
416    False
417    False
Length: 418, dtype: bool

In [None]:
# Dropping all the duplicates
df.drop_duplicates()

### Data Tranformation

In [None]:
# Adding new columns
df['Remark'] = ['Injury on left leg', 'Injury on right arm','dead'] # we can add the values as many as want or we can import it from other dataframe column

In [None]:
# Modifying Column
df['Age'] = df['Age'] + 1 # this will add 1 to each record persons age


### Aggregating and Grouping

Aggregate function

In [33]:
df['Age'].mean()
df['Age'].sum()
df['Age'].max()
df['Age'].min()

0.0

Grouping 

In [37]:
df.groupby('Fare').sum()

  df.groupby('Fare').sum()


Unnamed: 0_level_0,PassengerId,Pclass,Age,SibSp,Parch
Fare,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0000,3466,5,109.5,0,0
3.1708,913,3,9.0,0,1
6.4375,2033,6,0.0,1,0
6.4958,1124,3,21.0,1,0
6.9500,1183,3,30.0,0,0
...,...,...,...,...,...
227.5250,1094,1,47.0,1,0
247.5208,1076,1,27.0,1,1
262.3750,5124,5,203.0,4,8
263.0000,1906,2,88.0,4,6
