In [1]:
def remove_outliers(data, max_area=3000):
    """Return the rows of ``data`` whose floor areas are below ``max_area``.

    A row is kept only when both its '1st Flr SF' and its 'Gr Liv Area'
    values are strictly less than ``max_area``. Rows where either value is
    missing compare as False and are therefore dropped as well.

    Args:
        data: DataFrame containing '1st Flr SF' and 'Gr Liv Area' columns.
        max_area: Exclusive upper bound applied to both columns. Defaults
            to 3000, the cut-off that used to be hard-coded.

    Returns:
        A filtered DataFrame; ``data`` itself is not modified.
    """
    first_floor_ok = data['1st Flr SF'] < max_area
    living_area_ok = data['Gr Liv Area'] < max_area
    return data[first_floor_ok & living_area_ok]

In [15]:
# Thanks Will Badr for this! https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779
def imp_data(data):
    """Fill missing values in ``data`` in place, one column at a time.

    Every column that contains at least one null is imputed with the column
    mean. When the mean strategy rejects the column (scikit-learn is expected
    to raise ValueError for data it cannot treat as numeric), the column is
    imputed with its most frequent value instead.

    Args:
        data: DataFrame to modify in place.

    Returns:
        None. The affected columns of ``data`` are overwritten.
    """
    imp_mean = SimpleImputer(strategy='mean')
    imp_mode = SimpleImputer(strategy='most_frequent')
    # .any() rather than ``.mean() != 0``: on a zero-row frame the mean is NaN,
    # and ``NaN != 0`` is True, which would flag every column as having nulls.
    null_columns = data.columns[data.isnull().any()]
    for column in null_columns:
        # Double brackets keep a 2-D single-column frame for the imputer.
        train = data[[column]]
        try:
            imputed = imp_mean.fit_transform(train)
        except (ValueError, TypeError):
            # Mean is not usable for this column; fall back to the mode.
            imputed = imp_mode.fit_transform(train)
        data[column] = imputed

In [21]:
def category_to_bool_cols(dataframe, list_of_columns):
    """Append dummy (indicator) columns for each listed column, in place.

    For every name in ``list_of_columns`` the column is expanded with
    ``pd.get_dummies`` using the column name as prefix and ``drop_first=True``,
    and each resulting indicator column is assigned onto ``dataframe`` under
    its ``{column}_{value}`` name. The source columns are left in place.

    Args:
        dataframe: DataFrame to extend in place.
        list_of_columns: Iterable of column names to expand.

    Returns:
        None.
    """
    for source_name in list_of_columns:
        indicators = pd.get_dummies(
            dataframe[source_name],
            prefix=source_name,
            drop_first=True,
        )
        # Copy each generated indicator column onto the original frame.
        for indicator_name in indicators.columns:
            dataframe[indicator_name] = indicators[indicator_name]

In [195]:
def log_col(data, columns):
    """Add a natural-log version of each listed column to ``data`` in place.

    Each value is first passed through ``change_0_to_1`` (defined elsewhere;
    presumably it swaps 0 for 1 so the log is defined -- confirm against its
    definition). The result of ``np.log`` is stored in a new column named
    ``log_<column>``, where the original name is lower-cased and its spaces
    are replaced by underscores.

    Args:
        data: DataFrame to extend in place.
        columns: Iterable of column names to transform.

    Returns:
        None.
    """
    for name in columns:
        adjusted = data[name].apply(change_0_to_1)
        target_name = "log_" + name.replace(' ', '_').lower()
        data[target_name] = np.log(adjusted)

In [196]:
def log_hist(data, column):
    """Draw a histogram of ``data[column]`` after applying ``change_0_to_1``.

    Args:
        data: DataFrame holding the column to plot.
        column: Name of the column to plot.

    Returns:
        None. The histogram is drawn on the current matplotlib axes.
    """
    adjusted = data[column].apply(change_0_to_1)
    # NOTE(review): despite the function name, no logarithm is taken here;
    # the adjusted values are plotted as-is. Confirm whether np.log was meant.
    plt.hist(adjusted)

In [203]:
def random_feature_thresh_test(data, target, features, threshold_start):
    """Scan 100 correlation thresholds and report the one with the lowest RMSE.

    For ``i`` in 0..99 the threshold is ``threshold_start + i / 100``. Columns
    whose absolute correlation with ``target`` exceeds
    ``mean_corr * threshold`` (``mean_corr`` being the mean correlation of all
    columns with ``target``) are used to fit a ``LinearRegression`` on a fixed
    75/25 train/test split. The root mean squared error on the test split is
    the score; lower is better.

    Args:
        data: DataFrame containing ``target`` and the candidate features.
        target: Name of the column to predict.
        features: Ignored. Kept for backward compatibility; the feature list
            is always derived from the correlations.
        threshold_start: First threshold multiplier to try.

    Returns:
        A sentence reporting the best score and the threshold that gave it.
    """
    # The correlations do not depend on the loop variable, so compute them
    # once instead of three times per iteration.
    target_corr = data.corr()[target]
    mean_corr = target_corr.mean()
    abs_target_corr = target_corr.abs()

    # Never let the target (or the identifier columns) leak into the features.
    excluded = {target, 'SalePrice', 'PID', 'Id'}
    y = data[target]

    best_threshold = 0
    best_score = float('inf')
    for i in range(100):
        feature_threshold = threshold_start + (i / 100)
        is_strong = abs_target_corr > mean_corr * feature_threshold
        selected = [
            name for name in abs_target_corr[is_strong].index
            if name not in excluded
        ]
        if not selected:
            # Nothing passes this threshold; a model cannot be fit on zero columns.
            continue

        X = data[selected]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=342)

        lr = LinearRegression()
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)

        # Square root of the MSE instead of ``squared=False``, which newer
        # scikit-learn releases no longer accept.
        score = metrics.mean_squared_error(y_test, y_pred) ** 0.5
        if score < best_score:
            print(score)
            print("^^NEW HIGH SCORE^^")
            best_score = score
            best_threshold = feature_threshold
    return f'The best score was {best_score}, the best threshold was {best_threshold}.'
    

In [210]:
def get_features(data, threshold, mean_corr):
    """Return the columns strongly correlated with 'SalePrice'.

    A column qualifies when the absolute value of its correlation with
    'SalePrice' is greater than ``mean_corr * threshold``. 'SalePrice' itself
    and the identifier columns 'PID' and 'Id' are never returned.

    Args:
        data: DataFrame containing a 'SalePrice' column.
        threshold: Multiplier applied to ``mean_corr``.
        mean_corr: Baseline correlation value that ``threshold`` scales.

    Returns:
        List of qualifying column names, in correlation-matrix column order.
    """
    # One correlation pass is enough; the original computed it twice.
    price_corr = data.corr()['SalePrice']
    strong = price_corr[price_corr.abs() > mean_corr * threshold]

    excluded = ('SalePrice', 'PID', 'Id')
    # The former try/except around this filter was dead code: a membership
    # test on a list of strings cannot raise.
    return [name for name in strong.index if name not in excluded]

In [366]:
def get_cval_score_mse(X, y):
    """Fit a linear regression on a 75/25 split and print its scores.

    Prints three lines: the ``cross_val_score`` results on the training split,
    the model's ``score`` on the test split, and the root mean squared error
    on the test split.

    Args:
        X: Feature matrix.
        y: Target values.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` from the split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=342)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print(f'The Cross Validation Score is: {cross_val_score(lr, X_train, y_train)}')
    print(f'The R2 score on testing data is: {lr.score(X_test, y_test)}')
    # Square root of the MSE instead of ``squared=False``, which newer
    # scikit-learn releases no longer accept. The label below still reads
    # "MSE" although the value is an RMSE; it is left unchanged so the
    # printed output stays the same.
    rmse = metrics.mean_squared_error(y_test, y_pred) ** 0.5
    print(f'The MSE is {rmse}')
    return X_train, X_test, y_train, y_test
    
    