In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from pyearth import Earth
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score

In [2]:
# Load the Online News Popularity data; the raw header names carry stray
# whitespace, so strip it from every column label before selecting columns.
data = pd.read_csv('OnlineNewsPopularity/OnlineNewsPopularity.csv')
data = data.rename(columns = str.strip)

# Target is the share count; 'url' and 'timedelta' are excluded from the predictors.
y = data['shares']
X = data.drop(columns = ['url', 'shares', 'timedelta'])

# 80/20 train/test split with a fixed seed so every later RMSE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 45
)

# Running count of the predictors removed so far
total_dropped = 0

In [3]:
# Baseline: an ordinary linear regression on every predictor. Its test RMSE is
# the yardstick that all later feature-removal steps are compared against.
base_model = LinearRegression()
base_model.fit(X_train, y_train)
pred = base_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, pred))

7427.725425387819

In [4]:
# Look for predictors whose training-set variance is exactly zero
# (constant columns carry no information for the model).
feature_variances = X_train.var(numeric_only = True)
dropped1 = feature_variances.index[feature_variances == 0].tolist()
# None of the predictors turn out to be constant
len(dropped1)

0

In [5]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# Multicollinearity screen. Variables are only removed while the difference from
# the base RMSE stays below 2 (as seen in a later coding session, the RMSE does
# not change that quickly, so the tight limit prevents removal of important predictors).
to_drop = ['weekday_is_monday', 'weekday_is_saturday', 'LDA_00', 'n_unique_tokens', 'n_non_stop_words', 'self_reference_avg_sharess',
          'rate_positive_words']
# The variables right now result in the lowest rmse we can get, all other variable removals increase the rmse

# VIF design matrix: the retained training predictors plus an intercept column.
# It gets its own name so the full feature matrix `X` built at load time is not
# silently replaced by a train-only frame.
X_vif = add_constant(X_train.drop(columns = to_drop))
vif_matrix = X_vif.values  # extracted once rather than on every loop step
vif_data = pd.DataFrame({
    "feature": X_vif.columns,
    "VIF": [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])],
})

# NOTE: this increment is not idempotent - re-running the cell counts the drop again.
total_dropped += len(to_drop)
# Total variables dropped due to multicollinearity is 7 (the length of to_drop)
base_model = LinearRegression().fit(X_train.drop(columns = to_drop), y_train)
pred = base_model.predict(X_test.drop(columns = to_drop))
# Compute the reduced model's RMSE once and reuse it for both report lines.
# 7427.7 is the rounded RMSE of the all-predictor baseline model.
new_rmse = np.sqrt(mean_squared_error(y_test, pred))
print('base rmse = {}'.format(7427.7))
print('new_model rmse = {}'.format(new_rmse))
print('diff rmse = {}'.format(new_rmse - 7427.7))

base rmse = 7427.7
new_model rmse = 7422.336120761044
diff rmse = -5.363879238955633


In [6]:
# The RMSE barely moves without these predictors, so remove them for good
# from both the training and the test frame (in place).
for frame in (X_train, X_test):
    frame.drop(columns = to_drop, inplace = True)

# Start the running record of every predictor removed from the data set
all_dropped_predictors = list(to_drop)

In [7]:
# Now for the key part: removing variables that are highly correlated with one another.
# Coarse grid over the correlation threshold (0.0, 0.1, ..., 1.0).
#
# For each threshold the predictors are walked in column order: the first remaining
# predictor is kept as the anchor, every other remaining predictor whose absolute
# correlation with it exceeds the threshold is marked for removal, and the walk
# continues with the next survivor until no predictors remain.
correlation_range = [(x / 10) for x in range(0, 11, 1)]

# A pairwise correlation does not depend on which other columns are present, so the
# matrix is computed once here instead of once per removal step.
correlations = X_train.corr(numeric_only = True)

results = []
for correlation in correlation_range:
    actual_columns_drop = []
    remaining = correlations.columns.tolist()
    # Explicit termination instead of a bare `except:` that would hide real errors
    while remaining:
        first_column, candidates = remaining[0], remaining[1:]
        # The anchor is excluded by position, not by testing for a correlation of
        # exactly 1, so a perfectly correlated duplicate is still pruned and
        # floating-point error on the diagonal cannot make the anchor drop itself.
        strength = correlations.loc[candidates, first_column].abs()
        correlated = strength[strength > correlation].index.tolist()
        actual_columns_drop.extend(correlated)
        remaining = [name for name in candidates if name not in correlated]

    # Refit the linear model without the pruned columns and score it on the test set
    model = LinearRegression().fit(X_train.drop(columns = actual_columns_drop), y_train)
    pred = model.predict(X_test.drop(columns = actual_columns_drop))
    model_rmse = np.sqrt(mean_squared_error(y_test, pred))

    results.append({
        'RMSE': model_rmse,
        'correlation': correlation,
        # 7427.7 is the rounded RMSE of the all-predictor baseline model
        'diff_RMSE': model_rmse - 7427.7,
        'model': model,
        'total_dropped': len(actual_columns_drop),
    })

# One row per threshold, built in a single step rather than grown row by row
rmses = pd.DataFrame(results)

In [8]:
# Based on the RMSEs in the dataframe below, the RMSE generally doesn't rise all that much
# as the correlation threshold decreases. The differences in the RMSE show, however, that
# the correlation we should investigate is somewhere within the range of 0.5 - 1.0
rmses

Unnamed: 0,RMSE,correlation,diff_RMSE,model,total_dropped
0,7515.238747,0.0,87.538747,LinearRegression(),50.0
1,7533.848015,0.1,106.148015,LinearRegression(),40.0
2,7528.661475,0.2,100.961475,LinearRegression(),34.0
3,7457.486195,0.3,29.786195,LinearRegression(),24.0
4,7452.685766,0.4,24.985766,LinearRegression(),21.0
5,7424.219176,0.5,-3.480824,LinearRegression(),15.0
6,7438.556972,0.6,10.856972,LinearRegression(),10.0
7,7438.340254,0.7,10.640254,LinearRegression(),10.0
8,7438.773056,0.8,11.073056,LinearRegression(),4.0
9,7421.268474,0.9,-6.431526,LinearRegression(),1.0


In [9]:
# Fine correlation search: same pruning procedure as the coarse search, now on a
# finer grid of thresholds from 0.50 to 1.00 in steps of 0.02.
#
# For each threshold the predictors are walked in column order: the first remaining
# predictor is kept as the anchor, every other remaining predictor whose absolute
# correlation with it exceeds the threshold is marked for removal, and the walk
# continues with the next survivor until no predictors remain.
correlation_range = [(x / 100) for x in range(50, 102, 2)]

# A pairwise correlation does not depend on which other columns are present, so the
# matrix is computed once here instead of once per removal step.
correlations = X_train.corr(numeric_only = True)

# columns[i] is the list of predictors dropped for correlation_range[i].
# (The loop below must not reuse the name `columns`, or this accumulator is lost.)
columns = []
results = []
for correlation in correlation_range:
    actual_columns_drop = []
    remaining = correlations.columns.tolist()
    # Explicit termination instead of a bare `except:` that would hide real errors
    while remaining:
        first_column, candidates = remaining[0], remaining[1:]
        # The anchor is excluded by position, not by testing for a correlation of
        # exactly 1, so a perfectly correlated duplicate is still pruned and
        # floating-point error on the diagonal cannot make the anchor drop itself.
        strength = correlations.loc[candidates, first_column].abs()
        correlated = strength[strength > correlation].index.tolist()
        actual_columns_drop.extend(correlated)
        remaining = [name for name in candidates if name not in correlated]

    # Refit the linear model without the pruned columns and score it on the test set
    model = LinearRegression().fit(X_train.drop(columns = actual_columns_drop), y_train)
    pred = model.predict(X_test.drop(columns = actual_columns_drop))
    model_rmse = np.sqrt(mean_squared_error(y_test, pred))

    results.append({
        'RMSE': model_rmse,
        'correlation': correlation,
        # 7427.7 is the rounded RMSE of the all-predictor baseline model
        'diff_RMSE': model_rmse - 7427.7,
        'model': model,
        'total_dropped': len(actual_columns_drop),
    })
    columns.append(actual_columns_drop)

# One row per threshold, built in a single step rather than grown row by row
rmses = pd.DataFrame(results)

In [10]:
# Looking at the difference in RMSE, the correlation to aim for when removing predictors is either
# 0.5 - 0.58 or 0.84 - 1.0. However, the difference in RMSE spikes and then falls back down again once
# we reach 15 total dropped, which suggests that a variable that is a significant detriment to our model
# quality is removed when we go to a correlation of 0.5. Given that the "spike" in RMSE is only a
# difference of 10, if that variable turns out not to be as significant in reducing RMSE as we thought,
# it still wouldn't significantly impact our model, as we would only be 10 RMSE more inaccurate.
rmses

# Therefore we will do one more final fine search in the correlation from 0.4 - 0.6 to find the best correlation to choose

Unnamed: 0,RMSE,correlation,diff_RMSE,model,total_dropped
0,7424.219176,0.5,-3.480824,LinearRegression(),15.0
1,7424.466542,0.52,-3.233458,LinearRegression(),14.0
2,7424.490899,0.54,-3.209101,LinearRegression(),14.0
3,7424.490899,0.56,-3.209101,LinearRegression(),14.0
4,7425.961757,0.58,-1.738243,LinearRegression(),12.0
5,7438.556972,0.6,10.856972,LinearRegression(),10.0
6,7438.556972,0.62,10.856972,LinearRegression(),10.0
7,7438.340254,0.64,10.640254,LinearRegression(),10.0
8,7438.340254,0.66,10.640254,LinearRegression(),10.0
9,7438.340254,0.68,10.640254,LinearRegression(),10.0


In [11]:
# Final fine correlation search: same pruning procedure as before, on a grid of
# thresholds from 0.40 to 0.59 in steps of 0.01.
#
# For each threshold the predictors are walked in column order: the first remaining
# predictor is kept as the anchor, every other remaining predictor whose absolute
# correlation with it exceeds the threshold is marked for removal, and the walk
# continues with the next survivor until no predictors remain.
correlation_range = [(x / 100) for x in range(40, 60, 1)]

# A pairwise correlation does not depend on which other columns are present, so the
# matrix is computed once here instead of once per removal step.
correlations = X_train.corr(numeric_only = True)

# new_columns[i] is the list of predictors dropped for correlation_range[i]
new_columns = []
results = []
for correlation in correlation_range:
    actual_columns_drop = []
    remaining = correlations.columns.tolist()
    # Explicit termination instead of a bare `except:` that would hide real errors
    while remaining:
        first_column, candidates = remaining[0], remaining[1:]
        # The anchor is excluded by position, not by testing for a correlation of
        # exactly 1, so a perfectly correlated duplicate is still pruned and
        # floating-point error on the diagonal cannot make the anchor drop itself.
        strength = correlations.loc[candidates, first_column].abs()
        correlated = strength[strength > correlation].index.tolist()
        actual_columns_drop.extend(correlated)
        remaining = [name for name in candidates if name not in correlated]

    # Refit the linear model without the pruned columns and score it on the test set
    model = LinearRegression().fit(X_train.drop(columns = actual_columns_drop), y_train)
    pred = model.predict(X_test.drop(columns = actual_columns_drop))
    model_rmse = np.sqrt(mean_squared_error(y_test, pred))

    results.append({
        'RMSE': model_rmse,
        'correlation': correlation,
        # 7427.7 is the rounded RMSE of the all-predictor baseline model
        'diff_RMSE': model_rmse - 7427.7,
        'model': model,
        'total_dropped': len(actual_columns_drop),
    })
    new_columns.append(actual_columns_drop)

# One row per threshold, built in a single step rather than grown row by row
rmses = pd.DataFrame(results)

In [12]:
# From this table, the best correlation threshold to use for removing variables is 0.5
rmses

Unnamed: 0,RMSE,correlation,diff_RMSE,model,total_dropped
0,7452.685766,0.4,24.985766,LinearRegression(),21.0
1,7452.685766,0.41,24.985766,LinearRegression(),21.0
2,7452.685766,0.42,24.985766,LinearRegression(),21.0
3,7453.72146,0.43,26.02146,LinearRegression(),20.0
4,7453.72146,0.44,26.02146,LinearRegression(),20.0
5,7453.72146,0.45,26.02146,LinearRegression(),20.0
6,7427.344612,0.46,-0.355388,LinearRegression(),19.0
7,7427.345294,0.47,-0.354706,LinearRegression(),18.0
8,7427.424247,0.48,-0.275753,LinearRegression(),17.0
9,7425.260852,0.49,-2.439148,LinearRegression(),16.0


In [13]:
# Final dataset: remove the predictors selected at a correlation threshold of 0.50.
# new_columns holds one drop list per threshold of the 0.40-0.59 grid, so
# position 10 is the 0.50 entry.
final_drop = new_columns[10]
for frame in (X_train, X_test):
    frame.drop(columns = final_drop, inplace = True)

In [14]:
# Number of predictors left - close to half of what we started with
X_train.shape[1]

36

In [15]:
# Add the correlation-pruned predictors (the 0.50-threshold entry of new_columns)
# to the running record of everything removed. Membership is checked first so
# that re-running this cell does not append the same names a second time.
for x in new_columns[10]:
    if x not in all_dropped_predictors:
        all_dropped_predictors.append(x)
all_dropped_predictors

['weekday_is_monday',
 'weekday_is_saturday',
 'LDA_00',
 'n_unique_tokens',
 'n_non_stop_words',
 'self_reference_avg_sharess',
 'rate_positive_words',
 'global_subjectivity',
 'avg_positive_polarity',
 'LDA_01',
 'LDA_04',
 'LDA_02',
 'kw_max_max',
 'kw_avg_max',
 'kw_avg_min',
 'kw_max_avg',
 'is_weekend',
 'global_rate_positive_words',
 'rate_negative_words',
 'min_negative_polarity',
 'max_negative_polarity',
 'abs_title_sentiment_polarity']