### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from scipy.stats.mstats import winsorize
import scipy.stats as stats
from scipy.stats import shapiro
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

### Importing Dataset

In [2]:
# Load the train and test CSV files; the paths are relative to the current working directory
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

Create a working copy of the training DataFrame so the original stays unchanged

In [3]:
# Work on a copy so that later in-place edits do not modify train_df itself
copy_train_df = train_df.copy()

In [4]:
# get basics of dataframe
def get_dataframe_shape(df):
    """Return the ``shape`` attribute of ``df`` (for a DataFrame: ``(rows, columns)``)."""
    dimensions = df.shape
    return dimensions

def get_dataframe_info(df):
    """Print the ``info()`` summary of ``df``.

    The value returned is whatever ``df.info()`` returns, which is ``None``
    for a pandas DataFrame; the summary itself is written to stdout.
    """
    info_result = df.info()
    return info_result

def get_describe(df):
    """Return the descriptive-statistics table produced by ``df.describe()``."""
    statistics = df.describe()
    return statistics

def get_dataframe_head(df, limit = None):
    """Return the leading rows of ``df``.

    ``limit`` is forwarded unchanged to ``DataFrame.head``. The default is
    ``None`` (not pandas' own default of 5), so the no-argument call passes
    ``None`` through to pandas.
    """
    leading_rows = df.head(limit)
    return leading_rows

def get_dataframe_tail(df, limit = None):
    """Return the trailing rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to take rows from.
    limit : int or None, default None
        Number of rows to return from the end. ``None`` means "no limit":
        every row is returned.

    Returns
    -------
    pandas.DataFrame
        The last ``limit`` rows, or all rows when ``limit`` is ``None``.
    """
    if limit is None:
        # DataFrame.tail(None) negates its argument and raises TypeError,
        # so translate "no limit" into an explicit row count.
        return df.tail(len(df))
    return df.tail(limit)

def get_dataframe_columns(df):
    """Return the column labels of ``df`` (its ``columns`` attribute)."""
    column_labels = df.columns
    return column_labels

def get_features_value_counts(df, column):
    """Return ``value_counts()`` for ``df[column]``: how often each distinct value occurs."""
    feature = df[column]
    return feature.value_counts()

def get_dataframe_particular_columns(df, columns, limit = None):
    """Select ``df[columns]`` and return its leading rows.

    ``limit`` is passed unchanged to ``head`` on the selection.
    """
    selection = df[columns]
    return selection.head(limit)

def check_skew_value(df, column):
    """Return the skewness of ``df[column]`` as computed by ``Series.skew()``."""
    values = df[column]
    return values.skew()

def drop_columns(df, columns):
    """Remove one or more columns from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated directly (``inplace=True``).
    columns : label or list-like of labels
        A single column label, or a list / set / Index of labels.
        A tuple is treated as ONE label, as it was before.

    Returns
    -------
    None
        ``DataFrame.drop(..., inplace=True)`` returns ``None``; the result
        is the modified ``df`` itself.

    Raises
    ------
    KeyError
        If a label is not a column of ``df`` (pandas' default ``errors='raise'``).
    """
    # Previously the argument was always wrapped as [columns], so passing a
    # list of labels produced a nested list and failed. Normalise both forms.
    if pd.api.types.is_list_like(columns) and not isinstance(columns, tuple):
        labels = list(columns)
    else:
        labels = [columns]
    return df.drop(columns = labels, inplace = True)

def numerical_columns(df):
    """Return the labels of the columns whose dtype is ``int64`` or ``float64``.

    Other numeric widths (e.g. ``int32``, ``float32``) are not selected.
    """
    numeric_view = df.select_dtypes(include = ['int64', 'float64'])
    return numeric_view.columns

def categorical_columns(df):
    """Return the labels of the columns whose dtype is ``object``."""
    object_view = df.select_dtypes(include = ['object'])
    return object_view.columns

In [5]:
def get_boxplot(df, column):
    """Draw a seaborn box plot of ``df[column]``, title it with the column name, and show it."""
    # sns.boxplot returns the Axes it drew on, so the title can be set on it directly
    axes = sns.boxplot(data = df, x = df[column])
    axes.set_title(column)
    plt.show()

def get_histogram(df, column, bins = None):
    """Plot a histogram of ``df[column]`` with black bar edges, titled with the column name.

    ``bins`` is forwarded to ``plt.hist``; ``None`` leaves the bin choice to matplotlib.
    """
    values = df[column]
    plt.hist(values, bins = bins, edgecolor = 'black')
    # plt.hist draws on the current Axes; set the title on that same Axes
    plt.gca().set_title(column)
    plt.show()

def get_kde_plot(df, column):
    """Draw a filled kernel-density plot of ``df[column]``, titled with the column name."""
    # sns.kdeplot returns the Axes it drew on, so the title can be set on it directly
    axes = sns.kdeplot(df[column], fill = True)
    axes.set_title(column)
    plt.show()
    

In [6]:
# Preview the first 3 rows of the working copy
get_dataframe_head(copy_train_df, 3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Re-create the working copy of the training DataFrame

In [7]:
# NOTE(review): this repeats the earlier `copy_train_df = train_df.copy()` cell; no cell in
# between modifies copy_train_df, so the reassignment looks redundant — confirm before removing
copy_train_df = train_df.copy()

In [None]:
# Get the dataframe shape as (rows, columns)
get_dataframe_shape(copy_train_df)

(891, 12)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
get_describe(copy_train_df)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Remove unwanted columns

In [11]:
# Drops 'PassengerId' from copy_train_df in place (drop_columns uses inplace=True).
# Re-running this cell without first re-creating the copy raises KeyError,
# because the column is already gone.
drop_columns(copy_train_df, 'PassengerId')

Print all columns

In [12]:
# List the column labels remaining in the working copy
get_dataframe_columns(copy_train_df)

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Print numerical and categorical columns

In [13]:
# numerical_columns selects int64/float64 dtypes; categorical_columns selects object dtype
print(f"Numerical columns: {numerical_columns(copy_train_df)}")
print(f"Categorical columns: {categorical_columns(copy_train_df)}")

Numerical columns: Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Categorical columns: Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')


Check for duplicate rows

In [14]:
# duplicated() returns a boolean mask that is True for every row identical to an earlier row
duplicates = copy_train_df.duplicated()
# Show the flagged rows; an empty result means there are no duplicate rows
copy_train_df[duplicates]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
