In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from IPython.display import display

#Extra setting
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth',None)
warnings.filterwarnings("ignore")

In [79]:
#Loading train and test data
try:
    train_df=pd.read_csv('/home/anuragverma/Desktop/Kaggle/Titanic/Titanic_classification_pred-1/train.csv')
    test_df=pd.read_csv('/home/anuragverma/Desktop/Kaggle/Titanic/Titanic_classification_pred-1/test.csv')
    print("Train df shape: " ,train_df.shape)
    print("Test df shape: ", test_df.shape)

except FileNotFoundError:
    print('File not loaded')

Train df shape:  (891, 12)
Test df shape:  (418, 11)


In [80]:
#Null check function
def null_zero_check(df):
    null_values = df.isnull().sum()
    zero_values = null_values[null_values == 0]
    Total_record_count=df.shape[0]
    dataype = df.dtypes
    summary_df = pd.DataFrame({'Null count': null_values,
                               'Null_count_percent': null_values/Total_record_count*100,
                                'Zero count': zero_values,
                                'Zero_count_percent': zero_values/Total_record_count*100,
                                'Datatype': dataype})
    return summary_df

#Crosscheck Feature dTypes
def check_feature_dtypes(train_df, test_df):
    common_cols=train_df.columns.intersection(test_df.columns)
    mismatched_dtypes = []
    for col in common_cols:
        if train_df[col].dtype != test_df[col].dtype:
            mismatched_dtypes.append(
                (col, {"train": train_df[col].dtype, "test": test_df[col].dtype})
            )

    # Return True if no mismatches, False otherwise
    if not mismatched_dtypes:
        return True, mismatched_dtypes
    else:
        return False, mismatched_dtypes
    
#Function to Combine train and test data.Remove target from the train data before combining

def combine_df(df1,df2,tgt_col):
    trin_df_drop=df1.drop([tgt_col],axis=1)
    combined_df=pd.concat([trin_df_drop,df2],axis=0)
    return combined_df

#Function to check for duplicates 
def check_duplicates(df, columns=None):
    if columns is None:
        columns = df.columns

    duplicates = df[df.duplicated(subset=columns, keep=False)]
    return duplicates

#Final report generation
def perform_data_analysis(train_df, test_df):
    print('Null check in train_df:')
    display(null_zero_check(train_df))
    print('Null check in test_df:')
    display(null_zero_check(test_df))
    print('Null check in combined_df:')
    combined_df=combine_df(train_df,test_df,'Survived')
    print('Combined_df shape: ',combined_df.shape)
    print('Is rows in train_df + test_df = combined_df?',train_df.shape[0]+test_df.shape[0]==combined_df.shape[0])
    display(null_zero_check(combined_df))

    print("Check describe for train_df")
    display(train_df.describe(include='all').T)
    print("Check describe for test_df")
    display(test_df.describe(include='all').T)
    print("Check describe for combined_df")
    display(combined_df.describe(include='all').T)

    print('Check for dataype mismatch between Train and Test.If True then no mismatch:',check_feature_dtypes(train_df, test_df))

    print('Check for duplicates in train_df')
    display(check_duplicates(train_df))

    print('Check for duplicates in test_df')
    display(check_duplicates(test_df))  

    print('Check for duplicates in combined_df')
    display(check_duplicates(combined_df))

In [82]:
perform_data_analysis(train_df,test_df)

Null check in train_df:


Unnamed: 0,Null count,Null_count_percent,Zero count,Zero_count_percent,Datatype
Age,177,19.86532,,,float64
Cabin,687,77.104377,,,object
Embarked,2,0.224467,,,object
Fare,0,0.0,0.0,0.0,float64
Name,0,0.0,0.0,0.0,object
Parch,0,0.0,0.0,0.0,int64
PassengerId,0,0.0,0.0,0.0,int64
Pclass,0,0.0,0.0,0.0,int64
Sex,0,0.0,0.0,0.0,object
SibSp,0,0.0,0.0,0.0,int64


Null check in test_df:


Unnamed: 0,Null count,Null_count_percent,Zero count,Zero_count_percent,Datatype
Age,86,20.574163,,,float64
Cabin,327,78.229665,,,object
Embarked,0,0.0,0.0,0.0,object
Fare,1,0.239234,,,float64
Name,0,0.0,0.0,0.0,object
Parch,0,0.0,0.0,0.0,int64
PassengerId,0,0.0,0.0,0.0,int64
Pclass,0,0.0,0.0,0.0,int64
Sex,0,0.0,0.0,0.0,object
SibSp,0,0.0,0.0,0.0,int64


Null check in combined_df:
Combined_df shape:  (1309, 11)
Is rows in train_df + test_df = combined_df? True


Unnamed: 0,Null count,Null_count_percent,Zero count,Zero_count_percent,Datatype
Age,263,20.091673,,,float64
Cabin,1014,77.463713,,,object
Embarked,2,0.152788,,,object
Fare,1,0.076394,,,float64
Name,0,0.0,0.0,0.0,object
Parch,0,0.0,0.0,0.0,int64
PassengerId,0,0.0,0.0,0.0,int64
Pclass,0,0.0,0.0,0.0,int64
Sex,0,0.0,0.0,0.0,object
SibSp,0,0.0,0.0,0.0,int64


Check describe for train_df


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891.0,,,,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,,,,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891.0,891.0,"Braund, Mr. Owen Harris",1.0,,,,,,,
Sex,891.0,2.0,male,577.0,,,,,,,
Age,714.0,,,,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,,,,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891.0,681.0,347082,7.0,,,,,,,
Fare,891.0,,,,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


Check describe for test_df


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,418.0,,,,1100.5,120.810458,892.0,996.25,1100.5,1204.75,1309.0
Pclass,418.0,,,,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0
Name,418.0,418.0,"Kelly, Mr. James",1.0,,,,,,,
Sex,418.0,2.0,male,266.0,,,,,,,
Age,332.0,,,,30.27259,14.181209,0.17,21.0,27.0,39.0,76.0
SibSp,418.0,,,,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0
Parch,418.0,,,,0.392344,0.981429,0.0,0.0,0.0,0.0,9.0
Ticket,418.0,363.0,PC 17608,5.0,,,,,,,
Fare,417.0,,,,35.627188,55.907576,0.0,7.8958,14.4542,31.5,512.3292
Cabin,91.0,76.0,B57 B59 B63 B66,3.0,,,,,,,


Check describe for combined_df


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,1309.0,,,,655.0,378.020061,1.0,328.0,655.0,982.0,1309.0
Pclass,1309.0,,,,2.294882,0.837836,1.0,2.0,3.0,3.0,3.0
Name,1309.0,1307.0,"Connolly, Miss. Kate",2.0,,,,,,,
Sex,1309.0,2.0,male,843.0,,,,,,,
Age,1046.0,,,,29.881138,14.413493,0.17,21.0,28.0,39.0,80.0
SibSp,1309.0,,,,0.498854,1.041658,0.0,0.0,0.0,1.0,8.0
Parch,1309.0,,,,0.385027,0.86556,0.0,0.0,0.0,0.0,9.0
Ticket,1309.0,929.0,CA. 2343,11.0,,,,,,,
Fare,1308.0,,,,33.295479,51.758668,0.0,7.8958,14.4542,31.275,512.3292
Cabin,295.0,186.0,C23 C25 C27,6.0,,,,,,,


Check for dataype mismatch between Train and Test.If True then no mismatch: (True, [])
Check for duplicates in train_df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


Check for duplicates in test_df


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


Check for duplicates in combined_df


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
