In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import missingno as msgno
import sklearn as skl
import tensorflow as tf
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from math import isnan
import re
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import SVC
import operator
#get_truth(1.0, operator.gt, 0.0)
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import plot_tree
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from tensorflow import keras
from tensorflow.keras import layers


In [None]:
def plot_countplots(data, cat_cols, dpi, figsize, low, hi, plot=True, tplot = sns.countplot, ord=[], hue=None):
    """Draw one figure per column whose number of distinct values is in [low, hi).

    Parameters
    ----------
    data : DataFrame holding the columns to plot.
    cat_cols : iterable of column names to consider.
    dpi, figsize : forwarded to ``plt.figure`` for every figure.
    low, hi : a column is plotted only when
        ``low <= data[col].unique().size < hi`` (``unique()`` counts NaN as a
        value).
    plot : when True, call ``tplot(data=data, x=col, hue=hue)``; when False,
        draw a horizontal box plot of ``"ConvertedCompYearly"`` per category.
    tplot : plotting callable used when ``plot`` is True.
    ord : category order for the box plots; when it equals an empty list the
        categories are ordered by their median ``"ConvertedCompYearly"``.
        The default list is never mutated.
    hue : forwarded to ``tplot`` only (the box-plot branch does not use it).

    Prints the number of figures drawn; returns None.
    """
    count = 0
    for cat in cat_cols:
        n_unique = data[cat].unique().size
        if not (low <= n_unique < hi):
            continue

        # Both modes honour the caller's dpi (the box-plot mode used to
        # hard-code 150 and silently ignore the argument).
        plt.figure(dpi=dpi, figsize=figsize)
        if plot:
            tplot(data=data, x=cat, hue=hue)
        else:
            if np.array_equal(ord, []):
                # Select the salary column *before* aggregating: taking the
                # median of every column fails on non-numeric columns in
                # pandas >= 2.0 and does needless work otherwise.
                order = (
                    data.groupby(cat)["ConvertedCompYearly"]
                    .median()
                    .sort_values()
                    .index
                )
            else:
                order = ord
            sns.boxplot(data=data, x="ConvertedCompYearly", y=cat, order=order)

        plt.xticks(rotation=-90)
        plt.show()
        count += 1

    print(count)

In [None]:
def cut_by_quantile(data, qle, col, fun=lambda x: True):
    """Return the rows of ``data`` that survive a salary cut-off and a filter.

    A row is kept when its ``"ConvertedCompYearly"`` value is at or below the
    ``qle`` quantile of that column, or is NaN (detected by self-inequality),
    and ``fun`` applied to the row's ``col`` value is truthy.
    """
    comp = data["ConvertedCompYearly"]
    cutoff = comp.quantile(qle)

    within_cutoff = comp.le(cutoff)
    is_nan = comp.ne(comp)  # NaN is the only value not equal to itself
    passes_filter = data[col].apply(fun)

    keep = (within_cutoff | is_nan) & passes_filter
    return data[keep]

In [None]:
def too_little(data, col, little, rel=operator.gt):
    """Return the values of ``data[col]`` whose frequency satisfies ``rel``.

    ``rel(counts, little)`` is evaluated on the ``value_counts()`` Series and
    used as a boolean mask; the index (the distinct values) of the selected
    entries is returned. With the default ``operator.gt`` these are the values
    occurring more than ``little`` times.
    """
    counts = data[col].value_counts()
    selected = counts[rel(counts, little)]
    return selected.index

In [None]:
def in_too_little(y, rel=operator.truth):
    """Build a one-argument predicate testing membership in ``y``.

    The returned callable computes ``rel(x in y)``; with the default
    ``operator.truth`` it is a plain membership test, and passing e.g.
    ``operator.not_`` inverts it.
    """
    def predicate(x):
        is_member = x in y
        return rel(is_member)

    return predicate

In [None]:
def convert_col(data, col):
    """Replace ``data[col]`` in place with its integer category codes.

    The column is cast to the pandas ``category`` dtype and overwritten with
    ``.cat.codes``. Returns None.
    """
    as_category = data[col].astype('category')
    data[col] = as_category.cat.codes

In [None]:
def convert_cols(data, cols):
    """Apply ``convert_col`` to every column name in ``cols``, in order.

    ``data`` is modified in place; returns None.
    """
    for col in cols:
        convert_col(data, col)

In [None]:
def set_labels(plot, data, label, figsize, rotation, labelsize):
    """Call ``plot`` on ``data`` and restyle the tick labels before showing.

    ``plot`` is invoked as ``plot(data, labels=label, figsize=figsize)``.
    Afterwards the x tick labels are rotated by ``rotation`` and centred, the
    tick label size is set to ``labelsize``, and the figure is shown.
    """
    plot(data, labels=label, figsize=figsize)

    xtick_text_props = {"rotation": rotation, "horizontalalignment": "center"}
    plt.xticks(**xtick_text_props)
    plt.tick_params(labelsize=labelsize)

    plt.show()

In [None]:
def plot_distribution(data, colmn, dpi):
    """Show a KDE plot of ``data[colmn]`` at the requested figure dpi.

    Parameters
    ----------
    data : DataFrame passed to ``sns.displot``.
    colmn : name of the column plotted on the x axis.
    dpi : resolution applied to the figure created by ``sns.displot``.

    Returns None.
    """
    grid = sns.displot(data=data, x=colmn, kind="kde")
    # set_dpi() returns None, so its result must not be kept or forwarded.
    grid.fig.set_dpi(dpi)
    # plt.show() takes no positional figure argument; on standard backends
    # `block` is keyword-only, so passing a positional value raises TypeError.
    plt.show()

In [None]:
def salary_to_cat(salary_array, salary):
    """Return the position of the first bound in ``salary_array`` >= ``salary``.

    Positions ``0 .. len(salary_array) - 1`` are tried in order via
    ``salary_array[i]``. Returns None when no bound satisfies
    ``salary <= bound`` (which includes a NaN ``salary``).
    """
    matching_positions = (
        idx for idx in range(len(salary_array)) if salary <= salary_array[idx]
    )
    return next(matching_positions, None)

In [None]:
def strTonum(x):
    """Extract the first run of digits in ``x`` as an int.

    A value that is not equal to itself (NaN) is returned unchanged.
    Raises IndexError when ``x`` contains no digits.
    """
    is_nan = x != x
    if is_nan:
        return x

    digit_runs = re.findall(r'\d+', x)
    first_run = digit_runs[0]
    return int(first_run)

In [None]:
# Scratch check: NaN never compares equal to itself, so this evaluates to False.
operator.eq(np.nan, np.nan)

In [None]:
def _median_replacer(column, relation, threshold):
    """Return a function mapping x to ``column``'s median when ``relation(x, threshold)`` holds, else to x."""
    cached_median = []

    def replace(x):
        if not relation(x, threshold):
            return x
        # Compute the median lazily and only once per column: doing it for
        # every replaced element made the whole pass quadratic.
        if not cached_median:
            cached_median.append(column.median())
        return cached_median[0]

    return replace


def set_median(data, cols, num, rel):
    """Replace flagged values in the given columns with the column median.

    Parameters
    ----------
    data : DataFrame modified in place.
    cols : column names to process.
    num : per-column thresholds; ``num[i]`` belongs to ``cols[i]``.
    rel : per-column binary predicates; a value ``x`` of ``cols[i]`` is
        replaced when ``rel[i](x, num[i])`` is truthy.

    The median is taken over the column as it was before replacement.
    Returns None.
    """
    for i, col in enumerate(cols):
        column = data[col]
        data[col] = column.apply(_median_replacer(column, rel[i], num[i]))

In [None]:
def composition(f, g):
    """Compose a unary ``f`` after a binary ``g``.

    Returns a two-argument callable computing ``f(g(x, y))``.
    """
    def composed(x, y):
        inner = g(x, y)
        return f(inner)

    return composed

In [None]:
def drop_cols(trainX, trainY, testX, testY, cols):
    """Drop, in place, the rows with missing values from both splits.

    For the train pair and then the test pair, every row whose value in any
    column of ``cols`` is NaN is removed from the feature frame and from the
    matching target. Afterwards the rows whose target is NaN are removed from
    both objects of each pair. Returns None.
    """
    splits = ((trainX, trainY), (testX, testY))

    # Pass 1: rows with a missing feature value, one column at a time.
    for features, target in splits:
        for col in cols:
            column = features[col]
            missing = column.index[column.isna()]
            features.drop(missing, axis=0, inplace=True)
            target.drop(missing, axis=0, inplace=True)

    # Pass 2: rows with a missing target. Collect the labels for both splits
    # before dropping anything.
    missing_targets = [target.index[target.isna()] for _, target in splits]
    for (features, target), missing in zip(splits, missing_targets):
        features.drop(missing, axis=0, inplace=True)
        target.drop(missing, axis=0, inplace=True)


In [None]:
def dropminus_na(to_drop_df, to_drop_df_test, X_train, y_train, X_test, y_test):
    """Drop, in place, the rows flagged True in the two boolean masks.

    Parameters
    ----------
    to_drop_df : Series whose True entries mark index labels to remove from
        ``X_train`` and ``y_train``.
    to_drop_df_test : same, for ``X_test`` and ``y_test``.
    X_train, y_train, X_test, y_test : pandas objects modified in place.

    Raises KeyError (from ``drop``) when a flagged label is absent from the
    object it is dropped from. Returns None.
    """
    train_labels = to_drop_df.index[to_drop_df == True]
    test_labels = to_drop_df_test.index[to_drop_df_test == True]

    # drop() returns a new object unless inplace=True; without it the results
    # were discarded and the function had no effect at all.
    X_train.drop(train_labels, axis=0, inplace=True)
    y_train.drop(train_labels, axis=0, inplace=True)
    X_test.drop(test_labels, axis=0, inplace=True)
    y_test.drop(test_labels, axis=0, inplace=True)
    


In [None]:
def combine_data(num, x):
    """Return element ``num`` of ``x``, or NaN when it is not available.

    * ``num == 0``: returns ``x[0]``, except that the string ``'nan'`` is
      turned into ``np.nan``.
    * ``num >= 1``: returns ``np.nan`` when ``len(x)`` is between 1 and
      ``num`` inclusive (``x`` is too short), otherwise ``x[num]``.
    """
    if num == 0:
        first = x[0]
        return np.nan if first == 'nan' else first

    too_short = 1 <= len(x) <= num
    if too_short:
        return np.nan

    return x[num]

In [None]:
def construct_columns(data):
    """Return the set of all items found in ``data[0] .. data[len(data) - 1]``.

    Each ``data[i]`` is treated as an iterable and its items are merged into a
    single set.
    """
    entries = (data[pos] for pos in range(len(data)))
    return set().union(*entries)

In [None]:
def put_cols(data, cols, array):
    """Add one membership column to ``data`` per entry of ``cols``.

    For every ``col`` a column of that name is written to ``data`` holding,
    for each element of ``array``, whether ``col`` is contained in it. A null
    ``col`` is stored under the name ``"nan"`` and tests for the string
    ``"nan"`` instead. ``data`` is modified in place; returns None.
    """
    for col in cols:
        key = "nan" if pd.isnull(col) else col
        data[key] = array.apply(lambda x, key=key: key in x)


In [2]:
def organize_training(X_train, y_train, X_test, y_test, cols, nums, rels, to_drop_df, to_drop_df_test):
    """Run the cleaning helpers over the train and test splits.

    In order: ``set_median`` on ``X_train`` and then ``X_test`` with
    ``cols``/``nums``/``rels``; ``drop_cols`` for the ``"EdLevel"`` column;
    and ``dropminus_na`` with the two drop masks. Returns None.
    """
    for features in (X_train, X_test):
        set_median(features, cols, nums, rels)

    drop_cols(X_train, y_train, X_test, y_test, ["EdLevel"])

    dropminus_na(
        to_drop_df, to_drop_df_test,
        X_train, y_train,
        X_test, y_test,
    )


In [None]:
# Integer code for each education-level answer string.
# Note: the Master's and Professional degree entries share code 5.
eddict = {
    'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 4,
    'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 5,
    'Primary/elementary school': 0,
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 1,
    'Some college/university study without earning a degree': 3,
    'Associate degree (A.A., A.S., etc.)': 2,
    'Professional degree (JD, MD, etc.)': 5,
    'Other doctoral degree (Ph.D., Ed.D., etc.)': 6,
    'Something else': 7,
}