In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
import zlib
import time

**EDA Functions**

In [8]:
def findMissing(aSeries):
    """
    Summarises how many observations of a pandas series are missing.

    Parameters
    ----------
    aSeries : Pandas Series

    Returns
    -------
    missingList : List of four elements, in this order:
                  1st element is the total number of observations (len of the series).
                  2nd element is the number of non-missing observations.
                  3rd element is the number of missing observations.
                  4th element is the percentage of missing observations,
                  rounded to 2 decimals. It is 0.0 for an empty series.
    """
    totalObs = len(aSeries)
    nonMissingObs = aSeries.count()
    missingObs = aSeries.isnull().sum()
    if totalObs == 0:
        # An empty series has nothing missing; without this guard the
        # division below raises ZeroDivisionError.
        percentageMissing = 0.0
    else:
        percentageMissing = round(float(missingObs) / float(totalObs) * 100, 2)
    return [totalObs, nonMissingObs, missingObs, percentageMissing]

In [1]:
def total_days_till_today(last_date):
    '''
    Number of whole days between last_date and the current moment.

    Parameters
    ----------
    last_date: a date/time value that can be subtracted from a pandas
               Timestamp, giving a result that exposes a `.days` attribute.

    Returns
    -------
    The `.days` component of (pd.to_datetime('today') - last_date).
    '''
    current_moment = pd.to_datetime('today')
    gap = current_moment - last_date
    return gap.days

In [2]:
def wavg(sub1, sub2, weights, id_col, outcome_col):
    '''
    Blends the outcome column of two dataframes with a weighted sum.

    sub2 is left-merged onto sub1 using id_col, so every row of sub1 is kept.
    The two outcome columns get the suffixes '_sub1' and '_sub2', and the
    blended value is weights[0] * <sub1 outcome> + weights[1] * <sub2 outcome>.

    Parameters
    ----------
    sub1: pandas dataframe. Left side of the merge.
    sub2: pandas dataframe. Right side of the merge.
    weights: indexable holding two weights. weights[0] is applied to sub1's
             outcome, weights[1] to sub2's outcome.
    id_col: string. Name of the column to merge on.
    outcome_col: string. Name of the column to blend; also the name of the
                 blended column in the result.

    Returns
    -------
    pandas dataframe with exactly two columns: id_col and outcome_col.
    '''
    first_outcome = outcome_col + '_sub1'
    second_outcome = outcome_col + '_sub2'
    w_first, w_second = weights[0], weights[1]

    blended = sub1.merge(sub2, how='left', on=[id_col], suffixes=('_sub1', '_sub2'))
    blended[outcome_col] = w_first * blended[first_outcome] + w_second * blended[second_outcome]
    return blended[[id_col, outcome_col]]

In [None]:
def convert_dates(date):
    '''
    Converts date of type 01-Jan-17 into datetime object.

    Parameters
    ----------
    date: string. Date written as day-abbreviated month-two digit year,
          i.e. matching the format '%d-%b-%y'.

    Returns
    -------
    datetime.datetime parsed from the string.

    Raises
    ------
    ValueError if the string does not match the format.
    '''
    # Imported here because the notebook's import cell never brings
    # `datetime` into scope; without it every call raised NameError.
    from datetime import datetime
    # NOTE: '%b' is matched against the current locale's abbreviated month names.
    return datetime.strptime(date, '%d-%b-%y')

In [None]:
def read_train_test(dirPath):
    '''
    Loads the three zipped csv files of a Kaggle-style competition folder.

    The files are expected to be named 'train.csv.zip', 'test.csv.zip' and
    'sample_submission.csv.zip' and to live directly inside dirPath.

    Parameters
    ----------
    dirPath: string. Directory which holds the three zipped csv files.

    Returns
    -------
    train: pandas dataframe read from 'train.csv.zip'.
    test: pandas dataframe read from 'test.csv.zip'.
    sample_submission: pandas dataframe read from 'sample_submission.csv.zip'.
    '''
    archive_names = ('train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip')
    # Files are read in the listed order, which is also the return order.
    train, test, sample_submission = [
        pd.read_csv(os.path.join(dirPath, archive_name))
        for archive_name in archive_names
    ]
    return train, test, sample_submission

In [None]:
def create_combined_df(train, test):
    '''
    Creates combined dataframe from train & test dataframes.
    Concatenates train on top of test and adds a 'train_or_test' column
    holding 'train' or 'test' to say where each row came from.

    The input dataframes are not modified: the marker column is added to
    copies, so re-running the cell or reusing train/test afterwards does
    not see an extra column.

    Parameters
    -----------
    train : a pandas DataFrame.
    test : a pandas DataFrame.

    Returns
    --------
    combined_df : concatenated pandas dataframe. Row labels of the inputs
                  are kept as they are (the index is not reset).
    '''
    # .assign returns a new frame, leaving the caller's objects untouched.
    tagged_train = train.assign(train_or_test='train')
    tagged_test = test.assign(train_or_test='test')
    combined_df = pd.concat([tagged_train, tagged_test])
    return combined_df

In [None]:
def time_elapsed(t0):
    '''
    Minutes elapsed since t0.

    Parameters
    ----------
    t0: number. A start time on the same scale as time.time().

    Returns
    -------
    (time.time() - t0) divided by 60.
    '''
    seconds_passed = time.time() - t0
    return seconds_passed / 60

**Visualization Functions**

In [9]:
def countplot(feat,df,figsize,rot):
    '''
    Draws a seaborn countplot of one feature on a newly created figure.

    Parameters
    ----------
    feat: string. Feature name whose countplot is to be generated.
          Passed as `x` to sns.countplot.
    df: Pandas dataframe. Passed as `data` to sns.countplot.
    figsize: a tuple. Figure size handed to plt.figure. Like (15,5)
    rot: Rotation of xticks in plot, forwarded to plt.xticks(rotation=...).
         'horizontal' or 'vertical' are the values this was written for.
         NOTE(review): matplotlib presumably also accepts an angle in
         degrees here - confirm against the matplotlib docs.

    Returns
    -------
    None. The plot is drawn on the current pyplot figure.
    '''
    # A new figure is opened first so the size applies to this plot only.
    plt.figure(figsize=figsize)
    # Rotation is set through the pyplot state machine before the bars are
    # drawn; keep this order unless the result is re-checked visually.
    plt.xticks(rotation=rot)
    sns.countplot(x=feat,data=df)

**Other Functions**

In [10]:
def create_zip(path,fname,zipfname):
    '''
    Converts a file into zip file.

    The archive is written with DEFLATE compression and holds a single
    member stored under the name `fname`. The process working directory
    is left unchanged.

    Parameters
    ----------
    path: string. Path where file is located. This is the path of both file & zipped file.
    fname: string. Name of file.
    zipfname: string. Name that the zipped file should have.

    Returns
    -------
    None
    '''
    # Build explicit paths instead of os.chdir(path): changing the working
    # directory is a process-wide side effect that silently alters how
    # relative paths resolve in every later cell.
    source_path = os.path.join(path, fname)
    archive_path = os.path.join(path, zipfname)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('creating archive')
    # The context manager closes the archive even if writing fails.
    with zipfile.ZipFile(archive_path, mode='w') as zf:
        # arcname keeps the member name equal to fname, exactly as when the
        # file was added relative to `path`.
        zf.write(source_path, arcname=fname, compress_type=zipfile.ZIP_DEFLATED)
    print('Completed creating archive')

In [11]:
def extract_from_zip(inputZipFilePath,outputDir):
    '''
    Extracts every member of a zip file into a directory.

    Parameters
    ----------
    inputZipFilePath: string. Whole path with file name of the zipped file.
    outputDir: string. Path of the directory in which the zip file should be unzipped.

    Returns
    -------
    None
    '''
    # The context manager closes the archive even when extraction raises,
    # so a failed extract does not leave the file handle open.
    with zipfile.ZipFile(inputZipFilePath, 'r') as zip_ref:
        zip_ref.extractall(outputDir)