In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
from scipy.stats import probplot
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import os
import sys

%matplotlib inline

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)

### Standardize column names

In [None]:
#Function to standardize column names in a dataframe
def cols_name(df, max_cols_letters = 15):
    """Standardize and shorten the column names of a dataframe (in place).

    Every column label is converted to a string, lower-cased, has its spaces
    replaced with underscores and is then cut to at most ``max_cols_letters``
    characters, e.g. ``Customer Lifetime Value`` --> ``customer_lifeti``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose ``columns`` are overwritten.
    max_cols_letters : int, default 15
        Maximum number of characters kept for each column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new column names.

    Notes
    -----
    Shortening can map different original names onto the same new name;
    no de-duplication is performed here.
    """
    # str() lets non-string labels (e.g. integer column names) through instead
    # of failing on .lower(); slicing is already a no-op for short names, so no
    # explicit length check is needed.
    df.columns = [
        str(col).lower().replace(" ", "_")[:max_cols_letters]
        for col in df.columns
    ]
    return df

#memory usage deep tolist var and nulls

### Display missing values

In [None]:
#Function to display missing values
def display_missing(df):
    """Print the number of missing values for every column of a dataframe that has any.

    One line is printed per affected column (``<column> column missing values:
    <missing>/<rows>``), followed by a closing message. Nothing is returned.
    """
    total_rows = len(df)
    template = '{} column missing values: {}/{}'
    for name in df.columns.tolist():
        n_missing = df[name].isnull().sum()
        if not n_missing:
            # complete column: nothing to report
            continue
        print(template.format(name, n_missing, total_rows))
    print ('Done checking for missing values')

### Display the data types of a dataframe's columns, listed in a dataframe

In [None]:
#Function to display data type in a dataframe
def cols_dtypes(df):
    """Return the column names of a dataframe grouped by data type.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per dtype group, the values being the names of the columns
        of ``df`` that belong to that group. Groups with fewer members than
        the largest group are padded with missing values. A column of ``df``
        can appear in several groups (e.g. in both the numeric and the float
        group).
    """
    categoricals = df.select_dtypes(include='object').columns
    # 'number' selects only genuinely numeric dtypes; the previous
    # exclude='object' also reported booleans, datetimes and categoricals
    # as numeric.
    numerics = df.select_dtypes(include='number').columns
    booleans = df.select_dtypes(include='bool').columns
    floats = df.select_dtypes(include='float').columns
    integers = df.select_dtypes(include='int').columns
    data_types = pd.DataFrame([categoricals,numerics, booleans, floats, integers])
    data_types = data_types.T
    # The (misspelled) labels are kept unchanged because callers may select
    # the result columns by these exact names.
    data_types.columns=['catergoricals or mixed', 'nuermicals', 'booleans','floats', 'integers']
    return data_types

### Log and Box-Cox transformation of a feature; plots 3 distplots next to each other

In [None]:
from scipy import stats
def feat_log_box(df, feature, bins = 100):
    """Add a log and a Box-Cox transformed copy of a feature and plot all three.

    Two columns are added to ``df`` (in place):

    * ``<feature>_log``: natural log of the feature. Values that are zero or
      not finite are skipped; they (and negative values, whose log is NaN)
      are filled with the mean of the remaining log values.
    * ``<feature>_boxcox``: Box-Cox transform of the feature, after values
      that are not strictly positive (or missing) have been replaced with the
      mean of the strictly positive values.

    The original feature and both transformations are drawn as three
    distplots next to each other for comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the feature; it is modified in place.
    feature : str
        Name of the numeric column to transform.
    bins : int, default 100
        Number of histogram bins used in each plot.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two additional columns.

    Raises
    ------
    ValueError
        If the feature has no strictly positive value (Box-Cox is undefined).
    """
    values = df[feature]

    # Log transform, vectorised. np.nan is used implicitly via .where();
    # the np.NAN alias used before no longer exists in NumPy 2.0.
    loggable = values.where(np.isfinite(values) & (values != 0))
    log_values = np.log(loggable)
    df[feature+'_log'] = log_values.fillna(log_values.mean())

    # Box-Cox needs strictly positive input.
    if not (values > 0).any():
        raise ValueError(
            "Box-Cox needs at least one strictly positive value in '{}'".format(feature))
    positive = values.where(values > 0)
    boxcox_input = positive.fillna(positive.mean())
    xt, _ = stats.boxcox(boxcox_input)
    df[feature+'_boxcox'] = xt

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # switch to sns.histplot(..., kde=True) once the minimum seaborn version
    # of this notebook is known to provide it.
    fig, axes = plt.subplots(1, 3, figsize = (15,4))
    for ax, suffix in zip(axes, ('', '_log', '_boxcox')):
        sns.distplot(df[feature + suffix], bins=bins, ax=ax)
        ax.set_title(feature + suffix)
    return df


### Correlation ratio (eta) between a categorical and a numerical feature

In [None]:
#calculation of eta
#https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

def correlation_ratio(categories, measurements):
    """Correlation ratio (eta) between a categorical and a numerical variable.

    eta is the square root of the share of the total variance of
    ``measurements`` that is explained by the group means of ``categories``;
    it lies between 0 (no association) and 1 (the category fully determines
    the measurement).

    Parameters
    ----------
    categories : list / numpy.ndarray / pandas.Series
        Category label of every observation.
    measurements : list / numpy.ndarray / pandas.Series
        Numerical value of every observation, in the same order as
        ``categories``.

    Returns
    -------
    float
        eta. ``0.0`` is returned when there is nothing to explain (no
        observations, or no variance between or within the groups). NaN
        measurements propagate to a NaN result.
    """
    codes, _ = pd.factorize(pd.Series(categories))
    # Work on plain positional arrays: indexing a Series with positional
    # indices would be interpreted as label lookup and breaks (or silently
    # picks wrong rows) for any non-default index.
    values = np.asarray(measurements, dtype=float)

    # factorize marks missing categories with -1; leave those rows out of
    # both the numerator and the denominator.
    known = codes >= 0
    codes, values = codes[known], values[known]
    if values.size == 0:
        return 0.0

    counts = np.bincount(codes)
    group_means = np.bincount(codes, weights=values) / counts
    grand_mean = np.sum(group_means * counts) / np.sum(counts)
    numerator = np.sum(counts * (group_means - grand_mean) ** 2)
    denominator = np.sum((values - grand_mean) ** 2)
    if numerator == 0 or denominator == 0:
        return 0.0
    # numerator / denominator is eta squared; take the root to get eta and
    # cap rounding noise at 1.
    return float(np.sqrt(min(numerator / denominator, 1.0)))

### Calculate cramers_v, phi2

In [None]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Builds the contingency table of ``x`` and ``y``, takes the chi-squared
    statistic of that table (``scipy.stats.chi2_contingency`` with its
    default settings) and turns it into Cramér's V, correcting phi² and the
    table dimensions for the sample size.

    :param x: categorical values (anything ``pandas.crosstab`` accepts)
    :param y: categorical values of the same length as ``x``
    :return: the association strength as a float
    """
    table = pd.crosstab(x,y)
    n_obs = table.sum().sum()
    n_rows, n_cols = table.shape
    chi2_stat = ss.chi2_contingency(table)[0]
    phi2 = chi2_stat/n_obs
    # bias correction of phi² and of both table dimensions
    bias = ((n_cols-1)*(n_rows-1))/(n_obs-1)
    phi2_corrected = max(0, phi2-bias)
    rows_corrected = n_rows-((n_rows-1)**2)/(n_obs-1)
    cols_corrected = n_cols-((n_cols-1)**2)/(n_obs-1)
    smallest_dim = min((cols_corrected-1),(rows_corrected-1))
    return np.sqrt(phi2_corrected/smallest_dim)

### Model validation (needs a dict with model names as keys and predictions as values)

In [None]:
import sklearn.metrics as metrics

def regression_results(dic_of_models, X_test, y_true):
    '''Build a comparison table of regression metrics for several models.

    Parameters
    ----------
    dic_of_models : dict
        Maps a model name to that model's predictions for ``X_test``.
    X_test : 2-D array-like (needs ``len()`` and ``.shape``)
        Test features; only its number of rows and columns is used, for the
        adjusted r2.
    y_true : array-like
        True target values belonging to ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'explained_variance',
        'r2', 'adjusted_r2', 'MAE', 'MSE', 'RMSE', 'MSLE'. 'MSLE' is NaN when
        it cannot be computed for a model, and 'adjusted_r2' is NaN when
        there are not more rows than features + 1.
    '''
    column_names = ['Model', 'explained_variance', 'r2', 'adjusted_r2', 'MAE', 'MSE', 'RMSE', 'MSLE']
    # Sample size and number of predictors are the same for every model.
    n = len(X_test)
    p = X_test.shape[1]
    dof = n - p - 1

    rows = []
    for key, y_pred in dic_of_models.items():
        explained_variance = metrics.explained_variance_score(y_true, y_pred)
        mae = metrics.mean_absolute_error(y_true, y_pred)
        mse = metrics.mean_squared_error(y_true, y_pred)
        r2 = metrics.r2_score(y_true, y_pred)
        try:
            msle = metrics.mean_squared_log_error(y_true, y_pred)
        except ValueError:
            # sklearn rejects negative targets/predictions for MSLE; one such
            # model should not abort the whole comparison table.
            msle = np.nan
        # The adjusted r2 is undefined without residual degrees of freedom.
        adj_r2 = 1 - (1 - r2) * (n - 1) / dof if dof > 0 else np.nan

        rows.append([key,
                     explained_variance,
                     round(r2,4),
                     round(adj_r2,4),
                     round(mae,4),
                     round(mse,4),
                     round(np.sqrt(mse),4),
                     round(msle,4)])
    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and appending row by row is quadratic.
    return pd.DataFrame(rows, columns=column_names)

#model_dic = {'lm1': predictions, 'lm2':predictions_2, 'knn_model_manhattan':knn_predict_manh}
#validation=regression_results(model_dic, X_test, y_test)


### Converter (to 'array', 'list' or 'dataframe')

In [None]:
def convert(data, to):
    """Convert between list, NumPy array, pandas Series and pandas DataFrame.

    Parameters
    ----------
    data : list / numpy.ndarray / pandas.Series / pandas.DataFrame
        The object to convert. If it already has the requested type it is
        returned as is (no copy).
    to : str
        Target type: 'array', 'list' or 'dataframe'.

    Returns
    -------
    numpy.ndarray / list / pandas.DataFrame
        ``data`` in the requested representation.

    Raises
    ------
    ValueError
        If ``to`` is not one of the supported target names.
    TypeError
        If ``data`` is of a type that cannot be converted to ``to``.
    """
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, (pd.Series, pd.DataFrame)):
            # .values replaces DataFrame.as_matrix(), which was removed in
            # pandas 1.0.
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, (pd.Series, pd.DataFrame)):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.to_frame()
        elif isinstance(data, (np.ndarray, list)):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data),to))
    return converted

### Correlation ratio

In [None]:
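#This function was taken from Kaggle / Stack Overflow (link below); associations() needs it for the categorical-numerical correlations. Note: it redefines the correlation_ratio from the eta cell above.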
#https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792

def correlation_ratio(categories, measurements):
    """Correlation ratio (eta) between a categorical and a numerical variable.

    eta is the square root of the share of the total variance of
    ``measurements`` that is explained by the group means of ``categories``;
    it lies between 0 (no association) and 1 (the category fully determines
    the measurement).

    Parameters
    ----------
    categories : list / numpy.ndarray / pandas.Series
        Category label of every observation.
    measurements : list / numpy.ndarray / pandas.Series
        Numerical value of every observation, in the same order as
        ``categories``.

    Returns
    -------
    float
        eta. ``0.0`` is returned when there is nothing to explain (no
        observations, or no variance between or within the groups). NaN
        measurements propagate to a NaN result.
    """
    codes, _ = pd.factorize(pd.Series(categories))
    # Work on plain positional arrays: indexing a Series with positional
    # indices would be interpreted as label lookup and breaks (or silently
    # picks wrong rows) for any non-default index.
    values = np.asarray(measurements, dtype=float)

    # factorize marks missing categories with -1; leave those rows out of
    # both the numerator and the denominator.
    known = codes >= 0
    codes, values = codes[known], values[known]
    if values.size == 0:
        return 0.0

    counts = np.bincount(codes)
    group_means = np.bincount(codes, weights=values) / counts
    grand_mean = np.sum(group_means * counts) / np.sum(counts)
    numerator = np.sum(counts * (group_means - grand_mean) ** 2)
    denominator = np.sum((values - grand_mean) ** 2)
    if numerator == 0 or denominator == 0:
        return 0.0
    # numerator / denominator is eta squared; take the root to get eta and
    # cap rounding noise at 1.
    return float(np.sqrt(min(numerator / denominator, 1.0)))

### Associations: correlation matrix for categoricals, and for categoricals with numericals

In [None]:
#This function was taken from Kaggle / Stack Overflow (link below); it builds the association matrix and uses Cramér's V for the categorical-categorical correlations.
#https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792

def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                          return_results = False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in data-set with both categorical (nominal) and
    continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
        columns are categorical, a single column name, or None (default) to state none are categorical
    :param mark_columns: Boolean (default: False)
        if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Optional plot settings: 'figsize' (default (10, 10)), 'annot' (default True) and 'fmt' (default '.2f')
    :return: Pandas DataFrame (only if return_results is True, otherwise None)
        A DataFrame of the correlation/strength-of-association between all features
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = []
    elif isinstance(nominal_columns, str):
        # Compare against 'all' only for real strings (an ndarray compared
        # with a string is ambiguous in a boolean context), and wrap a single
        # column name so that `in` below does not do substring matching.
        nominal_columns = list(columns) if nominal_columns == 'all' else [nominal_columns]
    else:
        nominal_columns = list(nominal_columns)

    # A float matrix (initialised with NaN) is what the heat-map needs.
    corr = pd.DataFrame(index=columns, columns=columns, dtype=float)
    # corr.loc[row, column] is used for all writes: chained assignment
    # (corr[column][row] = value) is silently ignored under pandas Copy-on-Write.
    for i, col_i in enumerate(columns):
        corr.loc[col_i, col_i] = 1.0
        i_is_nominal = col_i in nominal_columns
        for col_j in columns[i + 1:]:
            j_is_nominal = col_j in nominal_columns
            if i_is_nominal and j_is_nominal:
                if theil_u:
                    # Theil's U is asymmetric, so both directions are computed.
                    # NOTE(review): theils_u is not defined in the visible part
                    # of this notebook; confirm it is defined before use.
                    corr.loc[col_i, col_j] = theils_u(dataset[col_i], dataset[col_j])
                    corr.loc[col_j, col_i] = theils_u(dataset[col_j], dataset[col_i])
                    continue
                cell = cramers_v(dataset[col_i], dataset[col_j])
            elif i_is_nominal:
                cell = correlation_ratio(dataset[col_i], dataset[col_j])
            elif j_is_nominal:
                cell = correlation_ratio(dataset[col_j], dataset[col_i])
            else:
                cell, _ = ss.pearsonr(dataset[col_i], dataset[col_j])
            # all remaining measures are symmetric
            corr.loc[col_i, col_j] = cell
            corr.loc[col_j, col_i] = cell

    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col) for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', (10, 10)))
        sns.heatmap(corr, annot=kwargs.get('annot',True), vmin=-1, vmax=1, fmt=kwargs.get('fmt','.2f'), cmap='seismic')
        plt.show()
    if return_results:
        return corr

### detecting outliers

In [None]:
#from flo, not really tested
def outliers(column, threshold = 3): #define a function like this, so there is a default value
    '''Return the outliers of a column according to their z-score.

    A value is an outlier when it lies more than ``threshold`` (sample)
    standard deviations away from the column mean. The outliers are only
    returned (with their original index); removing them is left to the caller.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to check.
    threshold : number, default 3
        Number of standard deviations beyond which a value is an outlier.

    Returns
    -------
    pandas.Series
        The outlying values of ``column``; empty if there are none or if the
        column has no variance.
    '''
    # Mean and standard deviation are computed once for the whole column
    # instead of once per element inside apply().
    z_scores = (column - column.mean()) / column.std()
    return column[z_scores.abs() > threshold]

#CLV_outliers = outliers(nums['customer_lifetime_va'], 3)
#MPA_outliers = outliers(nums['monthly_prem...'], 3)
#to_drop = CLV_outliers.index | MPA_outliers.index
#nums = nums.drop(to_drop)

#own function, tested; learned to return the column of the dataframe and then assign it: df['c'] = function()
def remove_outliers(df, feature, factor=1.5):
    """Blank out the outliers of one column using the interquartile range (IQR).

    Values that are not strictly between ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` are replaced with NaN. The column is updated in
    ``df`` (in place) and also returned; all other columns and the number of
    rows stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the column; it is modified in place.
    feature : str
        Name of the numeric column to clean.
    factor : number, default 1.5
        Multiple of the IQR that defines the lower and upper limit.

    Returns
    -------
    pandas.Series
        The cleaned column ``df[feature]``, index-aligned with ``df``.
    """
    # Series.quantile ignores missing values (np.percentile would return NaN
    # limits as soon as the column contains a NaN).
    q1, q3 = df[feature].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_limit = q1 - factor*iqr
    upper_limit = q3 + factor*iqr
    inside = (df[feature]>lower_limit) & (df[feature]<upper_limit)
    # Mask the single column; assigning the filtered *dataframe* to one
    # column fails as soon as df has more than one column.
    df[feature] = df[feature].where(inside)
    return df[feature]

#nums['customer_lifetime_va'] = remove_outliers(nums, 'customer_lifetime_va', 2.5)
