In [None]:
import pandas as pd

# Load the white-wine quality data; the file is semicolon-delimited, hence sep=';'.
df = pd.read_csv('winequality-white.csv', sep=';')
# Preview the first rows to sanity-check the parse.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column names as a NumPy array.
df.columns.values

In [None]:
# Print a structural summary of the frame.
df.info()
# What to read off the output:
# - Number of rows = 4898
# - Number of columns = 12
# - Column name, number of non-null values, and data type of each column
# - There are no missing values in the data
# - Number of columns for each data type
# - Memory usage

In [None]:
# Summary statistics for each column.
df.describe()
# Median is less than the mean for all columns
# Notably large difference between the 75th percentile and the max value of the columns
# 'residual sugar', 'free sulfur dioxide', and 'total sulfur dioxide'
# This suggests that there are outliers in our data

In [None]:
# Distinct quality scores present in the data.
df['quality'].unique()

In [None]:
# Number of rows per quality score.
df['quality'].value_counts()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Rows whose 'quality' value is missing.
df[df['quality'].isnull()]

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the correlation matrix as a heatmap on an explicitly created 6x4-inch figure.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(df.corr(), cmap='Blues', ax=ax)
plt.show()

In [None]:
# One histogram with a KDE overlay per column, each shown as its own figure.
# df.items() walks the columns in positional order, yielding (name, Series) pairs.
for _name, column in df.items():
    sns.displot(column, kde=True)
    plt.show()

# Missing Data

In [None]:
import seaborn as sns

# Load seaborn's 'titanic' dataset; `df` is rebound to it from here on.
df = sns.load_dataset('titanic')
df.head()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Strategy 1: discard every column that contains at least one missing value.

newdf = df.dropna(axis='columns')
# Print the shape before and after the drop.
for frame in (df, newdf):
    print(frame.shape)

In [None]:
# Strategy 2: discard every row that contains at least one missing value.

newdf = df.dropna(axis='index')
# Print the shape before and after the drop, one per line.
print(df.shape, newdf.shape, sep='\n')

In [None]:
# Strategy 3: impute missing ages with the column mean.

agemean = df['age'].mean()
# assign() returns a new frame, so the original `df` keeps its missing values.
newdf = df.assign(age=df['age'].fillna(agemean))

In [None]:
# Null counts in the original frame (`df` itself was not modified by the fill).
df.isnull().sum()

In [None]:
# Null counts in the filled copy, for comparison.
newdf.isnull().sum()

In [None]:
# The value that was used to fill missing ages.
agemean

In [None]:
# Rows of the filled frame where the original 'age' was missing.
newdf[df['age'].isnull()].head()

In [None]:
# Strategy 4: impute a categorical column with its most frequent value.

# Count occurrences of each town, then take the label with the largest count.
town_counts = df['embark_town'].value_counts()
hf = town_counts.idxmax()
newdf['embark_town'] = newdf['embark_town'].fillna(hf)

In [None]:
# Null counts in the original frame.
df.isnull().sum()

In [None]:
# Null counts in the filled copy, for comparison.
newdf.isnull().sum()

In [None]:
# The most frequent embark_town value that was used as the fill.
hf

In [None]:
# Rows of the filled frame where the original 'embark_town' was missing.
newdf[df['embark_town'].isnull()]

# Handling Outliers

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset object.
cal = fetch_california_housing()
# NOTE(review): this displays the whole returned object; consider inspecting
# individual attributes (e.g. cal.feature_names) to keep the output short.
cal

In [None]:
import pandas as pd

# Build a feature DataFrame from the raw data, labelling columns with the dataset's feature names.
df = pd.DataFrame(cal.data, columns=cal.feature_names)
df.head()

In [None]:
import seaborn as sns

# Box plot of MedInc to eyeball outliers.
# The Series is passed as `x=` explicitly: seaborn versions differ in how they
# interpret a lone positional argument (as `x` vs as `data`), which changes the
# orientation of the plot. The keyword form draws a horizontal box everywhere.
sns.boxplot(x=df['MedInc'])

In [None]:
import numpy as np

# Row positions (not index labels) where MedInc exceeds 10.
np.where(df['MedInc']>10)[0]

In [None]:
# Remove rows whose MedInc exceeds the cut-off of 10.
# A boolean mask is used instead of feeding np.where() positions to df.drop():
# drop() expects index *labels*, so positions are only correct while the frame
# has a default 0..n-1 index. The mask is correct for any index.
high_income = df['MedInc'] > 10
newdf = df.loc[~high_income]
print(df.shape)
print(newdf.shape)

In [None]:
import matplotlib.pyplot as plt

# Scatter of HouseAge against Population, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df['HouseAge'], df['Population'])
plt.show()

In [None]:
# Z-score

from scipy import stats

# Absolute z-score of every MedInc value (relies on `np` imported in an earlier cell).
z = np.abs(stats.zscore(df['MedInc']))
z

In [None]:
# Row positions whose absolute z-score exceeds 3.
np.where(z>3)[0]

In [None]:
# Remove rows whose absolute z-score exceeds 3.
# np.asarray() makes the comparison positional whether `z` is an ndarray or a
# Series; the boolean mask then selects rows by position. The previous form passed
# np.where() positions to df.drop(), which treats them as index labels and is
# therefore only correct on a default 0..n-1 index.
is_outlier = np.asarray(z) > 3
newdf = df.loc[~is_outlier]
print(df.shape)
print(newdf.shape)

In [None]:
# Interquartile-range fences for MedInc.
# Series.quantile() is used instead of np.percentile(..., interpolation=...):
# NumPy deprecated the `interpolation` keyword in favour of `method`, whereas the
# pandas keyword is stable. 'midpoint' averages the two nearest observations when
# the quantile falls between them, as before.
q1 = df['MedInc'].quantile(0.25, interpolation='midpoint')
q3 = df['MedInc'].quantile(0.75, interpolation='midpoint')
iqr = q3 - q1
# Values beyond 1.5 * IQR from the quartiles are treated as outliers.
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

In [None]:
# Row positions (not index labels) of MedInc values outside the [lower, upper] fences.
outliers = np.where((df['MedInc']<lower)|(df['MedInc']>upper))[0]

In [None]:
# `outliers` holds row positions, but df.drop() expects index labels.
# Translating through df.index keeps this correct even when the frame does not
# have a default 0..n-1 index.
newdf = df.drop(df.index[outliers], axis=0)
print(df.shape)
print(newdf.shape)

# Handling Skewness

In [None]:
import pandas as pd

# Load the data used for the skewness transformations; `df` is rebound again here.
df = pd.read_csv('Data_to_Transform.csv')
df.head()

In [None]:
# Histogram of every column, 30 bins each, without grid lines.
df.hist(grid=False, bins=30, figsize=(10,6))

In [None]:
# Skewness and kurtosis per column, transposed so each column becomes a row.
df.agg(['skew', 'kurtosis']).transpose()

In [None]:
# Working frame for the square-root transforms: A is the first source column and
# B is the third (selected by position).
df_sqrt = pd.DataFrame({
    'A': df.iloc[:, 0],
    'B': df.iloc[:, 2],
})
df_sqrt.head()

In [None]:
import numpy as np
# Square-root transform of A, stored next to the original for comparison.
df_sqrt['A_sqrt'] = np.sqrt(df_sqrt['A'])
# Histograms of every column currently in df_sqrt.
df_sqrt.hist(grid=False, bins=30)

In [None]:
# Reflect B about (max + 1) before taking the square root, so the largest value
# maps to 1 and every reflected value is positive.
# Series.max() replaces the builtin max(): it is vectorised and skips NaN, whereas
# builtin max() iterates element by element and gives order-dependent results when
# NaN is present.
reflected_b = df_sqrt['B'].max() + 1 - df_sqrt['B']
df_sqrt['B_sqrt'] = np.sqrt(reflected_b)
df_sqrt.hist(grid=False, bins=30)

In [None]:
# Working frame for the log transforms, seeded with the untransformed A and B columns.
df_log = df_sqrt[['A', 'B']].copy()
# Natural log of A, stored next to the original for comparison.
df_log['A_log'] = np.log(df_log['A'])
df_log.hist(grid=False, bins=30)

In [None]:
# Reflect B about (max + 1) before taking the log, so the largest value maps to 1
# and every reflected value is positive.
# Series.max() replaces the builtin max(): it is vectorised and skips NaN, whereas
# builtin max() iterates element by element and gives order-dependent results when
# NaN is present.
reflected_b = df_log['B'].max() + 1 - df_log['B']
df_log['B_log'] = np.log(reflected_b)
df_log.hist(grid=False, bins=30)

In [None]:
# Working frame for the Box-Cox transform: A is the second source column (by position).
df_boxcox = pd.DataFrame({'A': df.iloc[:, 1]})
df_boxcox.head()

In [None]:
from scipy.stats import boxcox

# With no lambda supplied, boxcox() returns (transformed values, fitted lambda);
# only the transformed values are stored in the frame.
a_transformed, fitted_lambda = boxcox(df_boxcox['A'])
df_boxcox['A_boxcox'] = a_transformed
df_boxcox.hist(grid=False, bins=30)

# Label Encoding

In [None]:
import seaborn as sns

# Load seaborn's 'iris' dataset; `df` is rebound again here.
df = sns.load_dataset('iris')
df.head()

In [None]:
# Distinct species labels before encoding.
df['species'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label-to-integer mapping first, then overwrite the column with the codes.
le = LabelEncoder()
le.fit(df['species'])
df['species'] = le.transform(df['species'])
# Distinct values after encoding.
df['species'].unique()

# Scaling

In [None]:
import pandas as pd

# Load the data used for the scaling examples; `df` is rebound again here.
df = pd.read_csv('data.csv')
df.head()

In [None]:
# Select the two feature columns that will be scaled.
x = df[['Weight', 'Volume']]
x.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: fit the scaler on x, then apply it to the same data.
s = StandardScaler()
s.fit(x)
scaled_x = s.transform(x)
scaled_x

In [None]:
# Mean of the standardised result (no axis is given).
# NOTE(review): scaled_x.mean(axis=0) would check each feature separately.
scaled_x.mean()

In [None]:
# Standard deviation of the standardised result (no axis is given).
scaled_x.std()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features: fit the scaler on x, then apply it to the same data.
m = MinMaxScaler()
m.fit(x)
x_scaled = m.transform(x)
x_scaled

In [None]:
# Minimum and maximum of the min-max-scaled result (no axis is given).
print(x_scaled.min())
print(x_scaled.max())

# Feature Engineering

In [None]:
import pandas as pd

# Load the supermarket sales data; `df` is rebound again here.
df = pd.read_csv('supermarket_sales.csv')
df.head()

In [None]:
# Group rows by branch; the grouped object is reused for the per-branch means.
grouped_df = df.groupby('Branch')

In [None]:
# Per-branch means of tax and unit price. transform() returns one value per
# original row, so the results line up with df and can be stored as new columns.
branch_means = grouped_df[['Tax 5%', 'Unit price']].transform('mean')
df['tax_mean'] = branch_means['Tax 5%']
df['unit_price_mean'] = branch_means['Unit price']

# Show the source columns next to the derived ones.
preview_columns = ['Tax 5%', 'Unit price', 'Branch', 'tax_mean', 'unit_price_mean']
df[preview_columns].head()

In [None]:
import numpy as np
# Binary indicator: 1 when the unit price is above 50, otherwise 0.
is_above_50 = df['Unit price'] > 50
df['unit_price_50'] = is_above_50.astype(int)
df[['Unit price', 'unit_price_50']].head()

In [None]:
# Interaction feature: row-wise product of unit price and quantity.
df['Total cost']=df['Unit price']*df['Quantity']
df[['Unit price', 'Quantity', 'Total cost']].head()

In [None]:
# One-hot encode the payment column (preview only; the result is not stored).
pd.get_dummies(df['Payment']).head()

In [None]:
# Convert 'Date' to datetime so its components can be extracted with the .dt accessor.
# NOTE(review): no explicit format is given, so the parse is inferred; confirm the
# day/month order against the file.
df['Date']=pd.to_datetime(df['Date'])
# Check the resulting column dtypes.
df.dtypes

In [None]:
# Split the parsed date into calendar components via the .dt accessor.
date_parts = df['Date'].dt
df['Year'], df['Month'], df['Day'] = date_parts.year, date_parts.month, date_parts.day
df[['Date', 'Year', 'Month', 'Day']].head()