# Data Wrangling

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [22]:
kashti = sns.load_dataset('titanic')
k1 = kashti
k2 = kashti
k3 = kashti
k1.shape
# k4 = kashti
# k4


(891, 15)

## Dealing with Missing Values
 
 * In a data set values are missing either, N/A, NaN, 0 or empty cell.

In [3]:

# simple operation on column (Math Operatore)
k1 = kashti
(k1['age'] + 1).head()



0    23.0
1    39.0
2    27.0
3    36.0
4    36.0
Name: age, dtype: float64

In [4]:
# give the shape of the date set
k1.shape
# it find the null value in all data set
k1.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
# isnull is same like isna
k1.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
# Use dropna to drop all null values
# k3 = k1.dropna()
# k3.shape
# k3

# 'dropna' will drop all the null values from null 'deck' column 
# and also reduce other null values in the data set
k1.dropna(subset=['deck'], axis=0, inplace=True)
k1.shape
k1.isnull().sum()


survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [7]:
k1.describe()
k1.shape

(203, 15)

In [8]:
k1.dropna(subset=['age'], axis=0, inplace=True)

In [9]:
k1.isnull().sum()
k1

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


## Replace Missing values with average of that column

In [10]:
# finding average (mean) of column
mean = k3['age'].mean()  # 29.69
mean

35.77945652173913

In [26]:
# now replace all the Nan values in a column with this mean
k3['age'] = k3['age'].replace(np.nan, mean)

# After replacing all null values with mean of colum
k3.isnull().sum()



survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [35]:
k3.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## **Assignment**
### Remove deck and embark_town

In [65]:
k5 = sns.load_dataset('titanic')

k5.head()
# Remove deck colums
k5 = k5.drop('deck', axis = 1)
k5.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [68]:
k5.drop('embark_town', axis=1)
k5.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


## Data Formatting

* Bring data in one standard form.

In [69]:
k3.dtypes

survived         int64
pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
class         category
who             object
adult_male        bool
alive           object
alone             bool
dtype: object

In [74]:
# Change type of survived from int into float
k3['survived'] = k3['survived'].astype('float64')

In [75]:
k3.dtypes

survived       float64
pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
class         category
who             object
adult_male        bool
alive           object
alone             bool
dtype: object

In [82]:
# Conver age from year to day
k3['age' ]  = k3['age'] * 365
k3.rename(columns={'age' : 'age in days'}, inplace=True)

In [83]:
k3.head(3)

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0.0,3,male,2930950.0,1,0,7.25,Third,man,True,no,False
1,1.0,1,female,5062550.0,1,0,71.2833,First,woman,False,yes,False
2,1.0,3,female,3463850.0,0,0,7.925,Third,woman,False,yes,True


In [93]:
# age and fare values

k3[['age in days','fare']]

Unnamed: 0,age in days,fare
0,2.930950e+06,7.2500
1,5.062550e+06,71.2833
2,3.463850e+06,7.9250
3,4.662875e+06,53.1000
4,4.662875e+06,8.0500
...,...,...
886,3.597075e+06,13.0000
887,2.531275e+06,30.0000
888,4.766718e+06,23.4500
889,3.463850e+06,30.0000


## Data Normalization
 
* We normalize the data, and bring in it into the range of 0 - 1

In [98]:
# To bring the fare value b/w 0-1
k3['fare'] = k3['fare'] / k3['fare'].max()
k3['age in days'] = k3['age in days'] / k3['age in days'].max()
k3[['age in days', 'fare']]

Unnamed: 0,age in days,fare
0,0.275000,0.014151
1,0.475000,0.139136
2,0.325000,0.015469
3,0.437500,0.103644
4,0.437500,0.015713
...,...,...
886,0.337500,0.025374
887,0.237500,0.058556
888,0.447243,0.045771
889,0.325000,0.058556


## Binning

* Grouping of values into smaller number of values (bin).
- Converting numeric values into categories (Jawan, Bachay, Boorhay) e.g 1-16, 17- 30 etc
- To have better understanding of groups
    - low vs mid vs high price

In [103]:
bins = np.linspace(min(k5['age']), max(k5['age']), 15000)
age_groups = ['Bacha', 'Jawan', 'Boorhey']
k5['age'] = pd.cut(k5['age'], bins, labels=age_groups, include_lowest=True)

ValueError: Bin labels must be one fewer than the number of bin edges