In [50]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [44]:
def read_file(file_path):
    """Load a tabular data file into a pandas DataFrame.

    The reader is chosen from the file extension. The extension is compared
    case-insensitively, so names such as ``DATA.CSV`` or ``Report.XLSX`` are
    accepted as well as their lower-case forms.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv`` or ``.xlsx`` file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or ``None`` (after printing a hint) when the
        extension is neither ``.csv`` nor ``.xlsx``.

    Notes
    -----
    Errors raised by ``pd.read_csv`` / ``pd.read_excel`` (for example a
    missing file) are not caught here and propagate to the caller.
    """
    # Only the comparison is lower-cased; the path that gets opened is untouched.
    normalized = file_path.lower()
    if normalized.endswith('.csv'):
        return pd.read_csv(file_path)
    if normalized.endswith('.xlsx'):
        return pd.read_excel(file_path)
    print('please enter csv or excel file.')
    return None
   

In [51]:
def _print_section(title, body):
    """Print one report section: title, blank gap, body, blank gap, 60-dash rule."""
    print(title)
    print('\n')
    print(body)
    print('\n')
    print('-' * 60)


def data_info(df):
    """Print a plain-text overview of a DataFrame.

    The report contains, in order: shape, first rows, dtypes, column labels,
    ``DataFrame.info()`` output, per-column missing-value counts, the
    ``describe()`` summary rounded to 3 decimals, the ``describe()`` summary
    of object columns, and the number of unique values in each object column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("-" * 60)
    _print_section("The shape of data:", df.shape)
    _print_section("Data Overview:", df.head())
    _print_section("Data Types:", df.dtypes)
    _print_section("Columns Name:", df.columns)

    print("Data Info:")
    print('\n')
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() or a stray "None" line follows the report.
    df.info()
    print('\n')
    print('-' * 60)

    _print_section("Columns with Missing Values:", df.isna().sum())

    print("-" * 30, "The Description of data:", "-" * 30)
    print('\n')
    _print_section("The Description of numerical data:", df.describe().round(3))

    cat_columns = df.select_dtypes(include=['object']).columns
    if len(cat_columns) > 0:
        cat_summary = df.describe(include=['object'])
    else:
        # describe(include=['object']) raises ValueError when nothing matches.
        cat_summary = "No categorical (object) columns found."
    _print_section("The Description of catogerical data:", cat_summary)

    for col in cat_columns:
        print(f"'{col}':")
        print("Number of Unique Categories:", df[col].nunique())
    print('\n')
    print("-" * 60)
    print('\n')

In [52]:
def data_preprocessing(df, missing_threshold=0.2):
    """Drop or fill missing values column by column, modifying ``df`` in place.

    Rules applied to each column:

    * ``int64`` / ``float64`` columns whose share of missing values is
      greater than ``missing_threshold`` are dropped.
    * Other ``int64`` / ``float64`` columns with missing values are filled
      with the column mean.
    * ``object`` columns are filled with their most frequent value. A column
      with no non-missing value has no mode and is left unchanged.
    * Columns of any other dtype are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is mutated in place, so callers that ignore the
        return value still see the cleaned data.
    missing_threshold : float, default 0.2
        Largest share of missing values (0-1) a numeric column may have
        before it is dropped instead of filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    num_features = df.select_dtypes(include=['int64', 'float64']).columns
    cat_features = df.select_dtypes(include=['object']).columns

    # Iterate over a snapshot of the labels because columns may be dropped
    # from the frame while the loop is running.
    for col in list(df.columns):
        if col in num_features:
            missing_share = df[col].isna().mean()
            if missing_share > missing_threshold:
                df.drop(columns=col, inplace=True)
            elif df[col].isna().any():
                # Assign back to the frame: calling fillna(inplace=True) on
                # the df[col] selection does not reliably update df.
                df[col] = df[col].fillna(df[col].mean())
        elif col in cat_features:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
    return df

In [None]:
def data_visualization(df):
    """Draw overview plots for a DataFrame: histograms, box plots, correlations.

    Three figures are shown one after another:

    1. a histogram grid from ``DataFrame.hist``,
    2. one box plot per column via ``seaborn.boxplot``,
    3. a heat map of the pairwise correlation of the numeric/bool columns.

    If the frame has no numeric column, a message is printed and nothing is
    plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is only read, never modified.

    Returns
    -------
    None
    """
    if df.select_dtypes(include='number').shape[1] == 0:
        print('No numeric columns to visualize.')
        return

    # Frequency distribution of the data for each feature:
    df.hist(figsize=(10, 10), edgecolor='black', linewidth=2)
    # hist() draws a grid of axes; suptitle labels the whole figure, whereas
    # plt.title would only label the last subplot.
    plt.suptitle('Frequency distribution for each feature')
    plt.show()

    # Box plot for each feature:
    plt.figure(figsize=(10, 8))
    sns.boxplot(data=df)
    plt.title('Box plot for each column')
    plt.xticks(rotation=90)
    plt.show()

    # Linear correlation between the columns with each other.
    # Restrict to numeric/bool columns first: corr() on a frame that still
    # holds string columns raises in pandas >= 2.0.
    correlations = df.select_dtypes(include=['number', 'bool']).corr()
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlations, annot=True)
    plt.show()

In [None]:
def visualize_column(df, chosen_column):
    """Plot the distribution of a single column.

    * NumPy integer / float columns of any width get a 50-bin histogram with
      a KDE overlay.
    * Object, string, categorical and boolean columns get a count plot with
      bars ordered by ``value_counts()``.
    * Any other dtype (for example datetime) is reported as unsupported with
      a printed message instead of being skipped silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    chosen_column : hashable
        Label of the column to plot. A label that is not in ``df`` raises
        ``KeyError`` from the column lookup.

    Returns
    -------
    None
    """
    col_data = df[chosen_column]
    dtype = col_data.dtype

    # kind 'i'/'u'/'f' = signed int, unsigned int, float of any bit width.
    is_numeric = isinstance(dtype, np.dtype) and dtype.kind in 'iuf'
    is_categorical = (
        dtype == object
        or isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_bool_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )

    if is_numeric:
        plt.figure(figsize=(7, 5))
        sns.histplot(data=df, x=chosen_column, bins=50, kde=True)
        plt.title(f'Histogram of {chosen_column}')
        plt.xlabel(chosen_column)
        plt.ylabel('Frequency')
        plt.show()
    elif is_categorical:
        plt.figure(figsize=(7, 5))
        sns.countplot(data=df, x=chosen_column,
                      order=col_data.value_counts().index, palette='Set1')
        plt.title(f'Frequency of {chosen_column}')
        plt.xlabel(chosen_column)
        plt.ylabel('Frequency')
        plt.xticks(rotation=90)
        plt.show()
    else:
        print(f"Column '{chosen_column}' has dtype {dtype}, "
              "which is not supported for plotting.")

In [1]:
def main():
    """Run the interactive workflow: load, summarise, clean, then plot.

    The user is asked for a file path. The file is read with ``read_file``,
    described with ``data_info``, cleaned with ``data_preprocessing`` and
    plotted with ``data_visualization``. After that the user is repeatedly
    asked for a column name to plot with ``visualize_column`` until ``E`` is
    entered.

    The function returns instead of calling ``sys.exit()``, so running it in
    a notebook cell does not raise ``SystemExit`` in the kernel.

    Returns
    -------
    None
    """
    file_path = input('please enter your file path: ')
    df = read_file(file_path)
    if df is None:
        # read_file has already told the user the file type is unsupported.
        return

    data_info(df)
    df = data_preprocessing(df)
    data_visualization(df)

    while True:
        col = input("please enter column name to visualise or enter E for exit: ")
        # Keep asking until the name is a real column; 'E' always means exit.
        while col != 'E' and col not in df.columns:
            col = input("Please enter a valid column name: ")
        if col == 'E':
            return
        print(df[col])
        visualize_column(df, col)
    

In [None]:
# Start the interactive workflow defined in main(): it prompts for a file
# path, prints a summary of the data, handles missing values, shows the
# overview plots and then plots individual columns until the user enters E.
main()