# Data wrangling

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
)

In [None]:
# Load seaborn's "titanic" dataset and rename its "sex" column to "gender"
# in one chained expression, so `df` is bound exactly once.
df = sns.load_dataset("titanic").rename(columns={"sex": "gender"})

In [None]:
# Remove the "deck" column from the working frame.
# errors="ignore" keeps this cell re-runnable: executing it again after
# "deck" is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["deck"], errors="ignore")

In [None]:
# Preview the first five rows of the working frame.
df.head(n=5)

In [None]:
# Label-based selection with .loc: both slice endpoints are inclusive, so
# this returns index labels 0 through 4 and every column from "age" up to
# and including "fare".
row_labels = slice(0, 4)
column_labels = slice("age", "fare")
df.loc[row_labels, column_labels]

In [None]:
# Position-based selection with .iloc: the stop position is exclusive, so
# this returns the first four rows and the first four columns.
df.iloc[:4, :4]

In [None]:
# Select two separate runs of row labels (0-3 and 8-11) together with the
# "age" and "fare" columns using .loc. np.r_ concatenates the two ranges
# into one integer array; its slice stops (4 and 12) are exclusive.
row_labels = np.r_[0:4, 8:12]
df.loc[row_labels, ["age", "fare"]]

In [None]:
# Select rows at positions 0-3 and 8-11 together with the first four
# columns using .iloc. np.r_ builds the combined array of row positions.
row_positions = np.r_[0:4, 8:12]
df.iloc[row_positions, :4]


In [None]:
# Box plot of "age" split by "gender".
# `data` is passed by keyword, matching the other seaborn calls in this
# notebook and avoiding any dependence on the positional-argument order of
# the installed seaborn version.
sns.boxplot(data=df, x="gender", y="age")

In [None]:
# Interquartile-range (IQR) method applied to "age".
# Both quartiles come from a single quantile() call; the fences sit
# 1.5 * IQR below the first quartile and above the third quartile.
age_quartiles = df["age"].quantile([0.25, 0.75])
Q1, Q3 = age_quartiles.to_numpy()
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

In [None]:
# Keep only rows whose age lies strictly inside the IQR fences.
# Rows with a missing age are removed as well, because comparisons
# against NaN evaluate to False.
age = df["age"]
inside_fences = age.gt(lower_limit) & age.lt(upper_limit)
df = df[inside_fences]

In [None]:
# Same IQR outlier filter, this time for "fare": compute the quartiles,
# derive the 1.5 * IQR fences, then keep rows strictly inside them.
fare_quartiles = df["fare"].quantile([0.25, 0.75])
Q1, Q3 = fare_quartiles.to_numpy()
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

fare = df["fare"]
df = df[fare.gt(lower_limit) & fare.lt(upper_limit)]

In [None]:
# (rows, columns) remaining after the age and fare IQR filters.
df.shape

In [None]:
from scipy import stats

In [None]:
# duplicated() flags every row that repeats an earlier row (the first
# occurrence is not flagged). Collect those rows and show how many there are.
duplicate_mask = df.duplicated()
df_duplicates = df[duplicate_mask]
df_duplicates.shape

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# (rows, columns) after removing duplicate rows.
df.shape

In [None]:
# Min-max scale "age" and "fare" so each column spans [0, 1].
# MinMaxScaler comes from the notebook's top import cell.
# NOTE: the scaled values overwrite the original columns, so every later
# cell that reads "age" or "fare" works on scaled values, not raw ones.
scaler = MinMaxScaler()
df[["age", "fare"]] = scaler.fit_transform(df[["age", "fare"]])

# Show a preview rather than dumping the whole frame.
df.head()

# Organizing the data

In [None]:
# Add the "sibsp" and "parch" columns element-wise into a new "family" column.
df["family"] = df["sibsp"].add(df["parch"])

In [None]:
# Swarm plot of the "family" column split by "gender".
# seaborn is already imported as `sns` in the notebook's import cell, so it
# is not imported again here.
sns.swarmplot(data=df, x="gender", y="family")

In [None]:
# Swarm plot of "age" split by "gender", with points coloured by "family".
sns.swarmplot(
    data=df,
    x="gender",
    y="age",
    hue="family",
)

In [None]:
# Summed "fare" for each "pclass" (rows) and "survived" value (columns).
# The aggregation is given as the string "sum": passing the NumPy callable
# np.sum is deprecated in pandas 2.1+ and emits a FutureWarning.
# NOTE(review): "fare" was min-max scaled earlier in this notebook, so these
# are sums of scaled fares, not raw amounts - confirm that is intended.
table = pd.pivot_table(
    df,
    values="fare",
    index="pclass",
    columns="survived",
    aggfunc="sum",
)
table

# Feature Scaling

## 1. Min-Max Scaling

In [None]:
# Small toy frame used by the scaling examples that follow.
# pandas (`pd`) and the scikit-learn scalers come from the notebook's top
# import cell, so nothing is imported here.
# Note: this rebinds `df`, replacing the Titanic frame used above.
data = {"numbers": [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
df.head()

In [None]:
# scale the data using min max scalar
scaler = MinMaxScaler()
df['numbers_scaled'] = scaler.fit_transform(df[['numbers']])
df.head()

## 2. Standard Scaler (Z-score Normalization)

In [None]:
# scale the data using standard scalar
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df['numbers_scaled'] = scaler.fit_transform(df[['numbers']])
df.head()

## 3. Robust Scaler

In [None]:
# Scale "numbers" with RobustScaler.
# RobustScaler comes from the notebook's top import cell. No new sample data
# is created here: the cell reuses the existing `df` and overwrites its
# "numbers_scaled" column.
scaler = RobustScaler()
df["numbers_scaled"] = scaler.fit_transform(df[["numbers"]])

# Leave the frame as the last expression so the notebook renders it as a
# table instead of printing plain text.
df

## 4. Logarithmic Scaling / Normalization

In [None]:
# Log transforms of the "numbers" column of the existing `df`:
# natural log, base-2 log and base-10 log, each stored in its own column.
# numpy (`np`) and pandas are already imported in the notebook's import cell.
df["numbers_log"] = np.log(df["numbers"])
df["numbers_log2"] = np.log2(df["numbers"])
df["numbers_log10"] = np.log10(df["numbers"])
df.head()

# Feature Encoding


## 1. One-Hot Encoding

In [None]:
# Sample data: a single categorical "Color" column.
# pandas is already imported as `pd` in the notebook's import cell.
data = {"Color": ["Red", "Green", "Blue", "Red"]}
df = pd.DataFrame(data)
print(df)

# One-hot encoding: get_dummies replaces "Color" with one indicator column
# per distinct value, named "Color_<value>".
encoded_data = pd.get_dummies(df, columns=["Color"])
print(encoded_data)

## 2. Label Encoding

In [None]:
# Sample data: a single categorical "Animal" column.
# LabelEncoder comes from the notebook's top import cell.
data = {"Animal": ["Dog", "Cat", "Bird", "Dog", "lion"]}
df = pd.DataFrame(data)
print(df)

# Label encoding: each distinct value is mapped to an integer code, stored
# in a new "Animal_encoded" column. Codes follow the sorted order of the
# distinct values, not any meaningful ranking.
label_encoder = LabelEncoder()
df["Animal_encoded"] = label_encoder.fit_transform(df["Animal"])
print(df)

## 3. Ordinal Encoding

In [None]:
# Sample data: a single "Size" column whose values have a natural order.
# OrdinalEncoder comes from the notebook's top import cell.
data = {"Size": ["Small", "Medium", "Large", "Medium"]}
df = pd.DataFrame(data)
print(df)

# Ordinal encoding: the explicit `categories` list fixes the order
# Small < Medium < Large, so the codes stored in "Size_encoded" follow it.
ordinal_encoder = OrdinalEncoder(categories=[["Small", "Medium", "Large"]])
df["Size_encoded"] = ordinal_encoder.fit_transform(df[["Size"]])
print(df)