# A file for useful functions

In [None]:
# Live-reload of imported modules
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi
from tqdm.notebook import tqdm

import warnings
# Suppress every warning for the rest of the session, including any
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

%matplotlib inline 

# Print all rows and columns: setting both limits to None turns off
# truncation when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
import os

# Switch between the local data directory and the default input directory.
IS_LOCAL = True
PATH = "../input/folder" if IS_LOCAL else "../input"

# Show which files are available in the selected directory.
print(os.listdir(PATH))

# Example: load the training set from the selected directory.
train_df = pd.read_csv(f"{PATH}/train.csv")

Source: https://github.com/gabrielpreda/Kaggle/blob/master/HelpNavigateRobots/robots-need-help.ipynb

## Check missing data in columns

In [None]:
def missing_data(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number
        of null entries), ``Percent`` (null entries as a percentage of the
        number of rows) and ``Types`` (the column dtype rendered as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100

    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

def check_sparsity(df):
    """Print a zero / non-zero cell summary for ``df`` and return its density.

    A cell counts as non-zero when ``df.ne(0)`` is true for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are counted.

    Returns
    -------
    Density: the non-zero cells as a percentage of all cells, rounded to two
    decimals. The total, zero count, sparsity and density are also printed.
    """
    n_rows, n_cols = df.shape
    cell_count = n_cols * n_rows
    filled = df.ne(0).sum(axis=1).sum()
    empty = cell_count - filled

    def as_percent(count):
        # Share of all cells, in percent, rounded to two decimals.
        return round(count / cell_count * 100, 2)

    sparsity = as_percent(empty)
    density = as_percent(filled)

    print(" Total:", cell_count, "\n Zeros:", empty, "\n Sparsity [%]: ", sparsity, "\n Density [%]: ", density)
    return density

## Filling missing data

In [None]:
# Replace NaN and +/-inf with 0, in place, on both feature frames.
for frame in (X_train, X_test):
    frame.fillna(0, inplace=True)
    for non_finite in (-np.inf, np.inf):
        frame.replace(non_finite, 0, inplace=True)

## Checking distribution of labels of each class for classification problems

In [None]:
# Bar chart of how many rows carry each class label.
f, ax = plt.subplots(1, 1, figsize=(16, 4))
# Pass the labels through the explicit `x=` keyword (newer seaborn releases
# treat the first positional argument as `data`) and draw on the axes
# created above instead of relying on the implicit "current axes".
g = sns.countplot(x=train['<y_column>'], ax=ax)
g.set_title("Number of labels for each class")
plt.show()

# Checking percent of "1" labelled data points:
# print("There are {}% target values with 1".format(100 * train["target"].value_counts()[1]/train.shape[0]))

## Check variation in feature distribution of train and test data (Continuous features)

In [None]:
def plot_feature_distribution(df1, df2, label1, label2, features):
    """Overlay the KDE of each feature for two data sets (e.g. train vs. test).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; both must contain every column in ``features``.
    label1, label2 : str
        Legend labels for the curves drawn from ``df1`` and ``df2``.
    features : iterable of str
        Column names to plot, one panel per feature.

    Notes
    -----
    The grid is five panels wide and at least two rows tall (16x8 inches),
    which matches the original fixed layout for up to ten features. Further
    rows are added when more than ten features are passed, and panels that
    receive no feature are hidden.
    """
    features = list(features)
    n_cols = 5
    # Ceil-divide to get the rows needed; never fewer than the original two.
    n_rows = max(2, (len(features) + n_cols - 1) // n_cols)

    sns.set_style('whitegrid')
    # A single plt.subplots call creates the figure; a preceding plt.figure()
    # would only leave an extra, empty figure behind.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows))
    panels = axes.ravel()

    for ax, feature in zip(panels, features):
        # NOTE(review): seaborn >= 0.11 deprecates `bw` in favour of
        # `bw_method`; switch once the minimum seaborn version allows it.
        sns.kdeplot(df1[feature], bw=0.5, label=label1, ax=ax)
        sns.kdeplot(df2[feature], bw=0.5, label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.tick_params(axis='x', which='major', labelsize=8)
        ax.tick_params(axis='y', which='major', labelsize=8)
        # Draw the legend explicitly so the two curves can be told apart.
        ax.legend()

    # Hide the panels of the grid that did not receive a feature.
    for ax in panels[len(features):]:
        ax.set_visible(False)

    plt.show()

In [None]:
# Compare the distribution of per-row mean values between train and test
# (swap .mean for .min, .max, .std, ... to inspect other statistics).

plt.figure(figsize=(16, 6))
features = train_df.columns.values[2:302]
plt.title("Distribution of mean values per row in the train and test set")
for frame, colour, tag in ((train_df, "green", 'train'), (test_df, "blue", 'test')):
    row_means = frame[features].mean(axis=1)  # or axis=0 for per column distribution
    sns.distplot(row_means, color=colour, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

In [None]:
# Compare the distribution of per-row skewness between the two target classes
# (swap .skew for .kurtosis to inspect kurtosis instead).
# Uses `features`, which must already be defined when this cell runs.

t0 = train_df[train_df['target'] == 0]
t1 = train_df[train_df['target'] == 1]

plt.figure(figsize=(16, 6))
plt.title("Distribution of skew values per row in the train set")
for subset, colour, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    row_skew = subset[features].skew(axis=1)  # or axis=0 for per column distribution
    sns.distplot(row_skew, color=colour, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

## Plot feature-wise class distribution

In [None]:
def plot_feature_class_distribution(classes, tt, features, target_col='<y_column>'):
    """Plot, for each feature, one KDE curve per class label.

    Parameters
    ----------
    classes : iterable
        Class labels to draw; each one selects the rows of ``tt`` where
        ``tt[target_col]`` equals that label.
    tt : pandas.DataFrame
        Frame holding the label column and every column in ``features``.
    features : iterable of str
        Column names to plot, one panel per feature.
    target_col : str, optional
        Name of the label column. Defaults to the ``'<y_column>'`` placeholder
        that was previously hard-coded, so existing calls behave as before.

    Notes
    -----
    The grid is two panels wide and at least five rows tall (16x24 inches),
    which matches the original fixed layout for up to ten features. Further
    rows are added when more than ten features are passed, and panels that
    receive no feature are hidden.
    """
    # Materialise both iterables: `classes` is walked once per feature.
    classes = list(classes)
    features = list(features)
    n_cols = 2
    # Ceil-divide to get the rows needed; never fewer than the original five.
    n_rows = max(5, (len(features) + n_cols - 1) // n_cols)

    sns.set_style('whitegrid')
    # A single plt.subplots call creates the figure; a preceding plt.figure()
    # would only leave an extra, empty figure behind.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4.8 * n_rows))
    panels = axes.ravel()

    for ax, feature in zip(panels, features):
        for clas in classes:
            class_rows = tt[tt[target_col] == clas]
            # NOTE(review): seaborn >= 0.11 deprecates `bw` in favour of
            # `bw_method`; switch once the minimum seaborn version allows it.
            sns.kdeplot(class_rows[feature], bw=0.5, label=clas, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.tick_params(axis='x', which='major', labelsize=8)
        ax.tick_params(axis='y', which='major', labelsize=8)
        # Draw the legend explicitly so the class curves can be told apart.
        ax.legend()

    # Hide the panels of the grid that did not receive a feature.
    for ax in panels[len(features):]:
        ax.set_visible(False)

    plt.show()

# Distinct class labels, in the order returned by value_counts().
classes = train['<y_column>'].value_counts().index

In [None]:
# Plot the distribution of the target variable.
sns.distplot(a=train['<target_variable>'])

In [None]:
# Summary statistics including the upper-tail percentiles. The result is
# printed explicitly: a bare expression in the middle of a cell is evaluated
# but never shown, so the original call produced no output.
print(data.describe(percentiles=[.25, .5, .75, .90, .95, .99]))

# Inter-quartile range: difference between the 75th and 25th percentiles.
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
print(IQR)