Import libraries and the data

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
prediktiv = pd.read_csv('prediktiv_data.csv', index_col = 'id' )

Start by exploring the dataset

In [3]:
prediktiv.info()

# The dataset exists of 35 columns and 2930 rows.
#I can also see that feature11, feature12 and feature15 seems to have less values than the rest.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2930 entries, 1 to 2930
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   target       2930 non-null   int64  
 1   feature01    2929 non-null   float64
 2   feature02    2929 non-null   float64
 3   feature03    2930 non-null   int64  
 4   feature04    2929 non-null   float64
 5   feature05    2930 non-null   int64  
 6   feature06    2930 non-null   int64  
 7   feature07    2930 non-null   int64  
 8   feature08    2930 non-null   int64  
 9   feature09    2930 non-null   int64  
 10  feature10    2930 non-null   int64  
 11  feature10.1  2930 non-null   int64  
 12  feature11    198 non-null    object 
 13  feature12    572 non-null    object 
 14  feature13    2929 non-null   object 
 15  feature14    2930 non-null   int64  
 16  feature15    1508 non-null   object 
 17  feature16    2930 non-null   object 
 18  feature17    2930 non-null   int64  
 19  featur

TypeError: '<=' not supported between instances of 'int' and 'numpy.str_'

In [None]:
#Sum the NAN-values in feature11, feature12 and feature15
prediktiv['feature11'].isnull().sum()

In [None]:
prediktiv['feature12'].isnull().sum()

In [None]:
prediktiv['feature15'].isnull().sum()

In [20]:
#both feature11, feature12 and feature15 has a lot of NAN-values so I will start by removing them. 

prediktiv.drop(['feature11', 'feature12', 'feature15'], axis = 1, inplace = True)

In [None]:
#Lets look at the categorical features
print(prediktiv['feature13'].unique())
print(prediktiv['feature16'].unique())


In [21]:
#Lets create some dummy-variables from thoose features
prediktiv = pd.get_dummies(prediktiv, columns =['feature13', 'feature16'])

In [None]:
#There is still some NAN-values but I will just replace them with '0' since it is only a few of them in some features
print(prediktiv['feature01'].value_counts())
prediktiv['feature01'].fillna(0, inplace= True)

print(prediktiv['feature02'].value_counts())
prediktiv['feature02'].fillna(0, inplace= True)

print(prediktiv['feature04'].value_counts())
prediktiv['feature04'].fillna(0, inplace= True)


In [None]:
prediktiv.isnull().sum()

In [None]:
#Now I want too search for multicollinarity (meaning: high correlatons between the features, making them unsuitable for analysis)
corr = prediktiv.drop('target', axis = 1)
sns.heatmap(corr.corr(), center = 0);
abs(corr.corr()) > 0.8

#I want to keep the features with a correlation that is lower than 0.8
#This means that I will keep the features that is 'false' in the table below
#The diagonal 'True' is the correlation between each feature to themselves 


In [24]:
#There was 4 features with high correlation among them and I will drop theese features
prediktiv.drop(['feature01', 'feature02', 'feature05', 'feature07'], axis = 1, inplace = True)


In [None]:
#Now lets to plot the features that are left to search for outliers

fig, ax = plt.subplots(12, figsize = (15,6))
ax[0].scatter(x = prediktiv['feature03'], y = prediktiv['target'])
ax[1].scatter(x = prediktiv['feature04'], y = prediktiv['target'])
ax[2].scatter(x = prediktiv['feature06'], y = prediktiv['target'])
ax[3].scatter(x = prediktiv['feature08'], y = prediktiv['target'])
ax[4].scatter(x = prediktiv['feature09'], y = prediktiv['target'])
ax[5].scatter(x = prediktiv['feature10.1'], y = prediktiv['target'])
ax[6].scatter(x = prediktiv['feature14'], y = prediktiv['target'])
ax[7].scatter(x = prediktiv['feature17'], y = prediktiv['target'])
ax[8].scatter(x = prediktiv['feature18'], y = prediktiv['target'])
ax[9].scatter(x = prediktiv['feature19'], y = prediktiv['target'])
ax[10].scatter(x = prediktiv['feature20'], y = prediktiv['target'])


In [27]:
#Remove outliers (based on the scatterplots) that is bigger than 3 std. 

from scipy import stats
prediktiv = prediktiv[(np.abs(stats.zscore(prediktiv.feature03)) < 3)]
prediktiv = prediktiv[(np.abs(stats.zscore(prediktiv.feature04)) < 3)]


In [None]:
#I will do the same for the target so the assumtions for regression-models is fulfilled
#An assumption for regression-models is that the data should be normalised. 
#Here the target-feature is used with the log-function
#This step might be skipped if the model is not a regression since the log 'pushes' the data to a normal-distribution, which sometimes is not necessary.
prediktiv = prediktiv[(np.abs(stats.zscore(prediktiv.target)) < 3)]
prediktiv.target = np.log(prediktiv.target)
fig, axes = plt.subplots(2, figsize =(10, 6))
sns.distplot(ax = axes[0], a =prediktiv.target);
sns.boxplot(ax = axes[1], x = prediktiv.target);


In [None]:
#There is some outliers in target as well, I will get rid of that in the same way as for the other features
prediktiv = prediktiv[(np.abs(stats.zscore(prediktiv.target)) < 3)]
fig, axes = plt.subplots(2, figsize =(10, 6))
sns.distplot(ax = axes[0], a =prediktiv.target);
sns.boxplot(ax = axes[1], x = prediktiv.target);

In [None]:
prediktiv.info()

Summation  

In [None]:
#There is 26 features lef
#The categorical variables are dummys
#There is no NAN-values left in the dataset
#There is no multicollinarity between the features

#This fulfills the assumptions for my models:
#There has to be homoskedasticity (homogenic variance within each feature)
#There can not be any correaltion between the features (multicollinarity)
#If the features has a normal distribution one can make assupmtions about the paramters and it will be easier to work with
#The target y has to be in some way correlated with the feature x, which was displayed in the scatterplots.

#I want to use the 10 best features in my models:
top_features = prediktiv.corr().loc['target'].apply(np.abs).sort_values(ascending=False).index[1:11]
top_features = list(top_features)
top_features

Models using the ten best features

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

Create training and test-set

In [34]:
x = prediktiv[top_features]
y = prediktiv['target']
robust_scaler = RobustScaler()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 124)

#This is a holdout cross validation which keeps 20% of the original data as "testdata", which is kept to evaluate the model.
# The 80% that is not test-data is used for training the model to find the right parameter and prediction. 


In [None]:
print(x.columns)

Preparing a new dataframe 'df' for analysis

In [36]:
df = pd.DataFrame(index = ['MSE', 'RMSE', 'MAE'], 
                       columns = ['NULL', 'MLR', 'KNN', 'LASSO'])


#MSE is short for Mean squared Error
#This is the squared average error between each observed value and it estimated value 
#MSE is the sum of al theese errors, which means that if the MSE is low: the model is good 

#RMSE is short for Root mean squared error
#It is what it sounds like, the root of MSE.
#RMSE has the same scale as the observation since it is not squared 
#As in MSE, it is the sum of errors and therefore a low value is good


#MAE is short for Mean absolute error
#This is the average error between the observed value and estimated value
#MAE is not squared, and has the same scale as the observations 
#MAE < RMSE, since RMSE uses already squared values
#RMSE is therefore desirable when there might be big errors 

First, the NULL-modell which predicts the average of the target

In [None]:
y_pred_null_modell = y_train.mean()
df.loc['MSE', 'NULL'] = mean_squared_error(y_pred=np.repeat(y_pred_null_modell, y_train.size), y_true = y_train)
df.loc['MSE', 'NULL'] = mean_squared_error(y_pred=np.repeat(y_pred_null_modell, y_test.size), y_true = y_test)

df.loc['RMSE','NULL'] = mean_squared_error(y_pred=np.repeat(y_pred_null_modell, y_train.size), y_true=y_train, squared=False)
df.loc['RMSE','NULL'] = mean_squared_error(y_pred=np.repeat(y_pred_null_modell, y_test.size), y_true=y_test, squared=False)

df.loc['MAE','NULL'] = mean_absolute_error(y_pred=np.repeat(y_pred_null_modell, y_train.size), y_true=y_train)
df.loc['MAE','NULL'] = mean_absolute_error(y_pred=np.repeat(y_pred_null_modell, y_test.size), y_true=y_test)
print(y_pred_null_modell)


#This is the average predicted value for y when y_train. One can compare this value with the test-model to see the actual average value in y_test.


Then, a multiple linear regression

In [38]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

df.loc['MSE', 'MLR'] = mean_squared_error(y_pred=lin_reg.predict(x_train), y_true=y_train)
df.loc['MSE', 'MLR'] = mean_squared_error(y_pred=lin_reg.predict(x_test), y_true = y_test)

df.loc['RMSE','MLR'] = mean_squared_error(y_pred=lin_reg.predict(x_train), y_true=y_train, squared = False)
df.loc['RMSE','MLR'] = mean_squared_error(y_pred=lin_reg.predict(x_test), y_true=y_test, squared = False)

df.loc['MAE','MLR'] = mean_absolute_error(y_pred=lin_reg.predict(x_train), y_true=y_train)
df.loc['MAE','MLR'] = mean_absolute_error(y_pred=lin_reg.predict(x_test), y_true=y_test)


#MLR are used to describe the relationships between features, dependent and independent
#The MLR are used to predict the target-value for certain independent variables

#https://www.scribbr.com/statistics/multiple-linear-regression/

Then, the KNN (K-nearest neighbor)-model

In [39]:
from sklearn.neighbors import KNeighborsRegressor
knn_model = KNeighborsRegressor(n_neighbors = 10, weights = 'distance', metric = 'euclidean')
knn_model.fit(x_train, y_train)

df.loc['MSE','KNN'] = mean_squared_error(y_pred=knn_model.predict(x_train), y_true=y_train)
df.loc['MSE','KNN'] = mean_squared_error(y_pred=knn_model.predict(x_test), y_true=y_test)

df.loc['RMSE','KNN'] = mean_squared_error(y_pred=knn_model.predict(x_train), y_true=y_train, squared = False)
df.loc['RMSE','KNN'] = mean_squared_error(y_pred=knn_model.predict(x_test), y_true=y_test, squared = False)

df.loc['MAE','KNN'] = mean_absolute_error(y_pred=knn_model.predict(x_train), y_true=y_train)
df.loc['MAE','KNN'] = mean_absolute_error(y_pred=knn_model.predict(x_test), y_true=y_test)

#Knn is a cluster-method which uses the k nearest value 
#In a regression the model predicts the best output value from the nearest values in point
#In this way one can make predictions from choosed distance, in this case the euclidean.
#Connection the predictions creates a regression and the MSE etc. 
#A high k-value avoids overfitting but it can also lose some predictive power on the margins 

#https://towardsdatascience.com/the-basics-knn-for-classification-and-regression-c1e8a6c955

And last, the Lasso-model

In [None]:
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=0.05)
lasso_model.fit(x_train, y_train)

df.loc['MSE','LASSO'] = mean_squared_error(y_pred=lasso_model.predict(x_train), y_true=y_train)
df.loc['MSE','LASSO'] = mean_squared_error(y_pred=lasso_model.predict(x_test), y_true=y_test)

df.loc['RMSE','LASSO'] = mean_squared_error(y_pred=lasso_model.predict(x_train), y_true=y_train, squared= False)
df.loc['RMSE','LASSO'] = mean_squared_error(y_pred=lasso_model.predict(x_test), y_true=y_test, squared = False)

df.loc['MAE','LASSO'] = mean_absolute_error(y_pred=lasso_model.predict(x_train), y_true=y_train)
df.loc['MAE','LASSO'] = mean_absolute_error(y_pred=lasso_model.predict(x_test), y_true=y_test)


#Lasso-model are shrinking the values towards a central point.
#The regression 'punishes' features with high correaltion and sets them to 0
#Large error means that the model might eliminate some coefficients
#Since the ten best features has been choosen for the model, lasso might not be needed

print('Features choosen by Lasso:\n')
for i, var in enumerate(x.columns[lasso_model.coef_>0]):
    print("{}.{}".format(i+1, var))

# https://www.statisticshowto.com/lasso-regression/

In [None]:
df

In [None]:
fig, ax = plt.subplots(figsize=(6,4))
df.T.plot(kind='barh', ax=ax)
ax.set_title('Error metrics for NULL, MLR, KNN and LASSO-models');

RESULT

In [283]:
#In each of theese models the outcome is based on finding the ordinary least squares.
#This means finding the value between the observed and estimated value
#This value is an error, as explained before, and this error must be minimized for the model to be considered as good.


#For this data the lowest value was predicted in the multiple regression model, which tells us that this model has the best prediction for our target Y ('target'), based on selected features.
#The LASSO- and KNN-model did also predict small value for MSE but failed at RMSE and MAE, which has significantly higher values
#Lasso decided to keep only 5 of the features, which also must be included in the analysis since this is only half of the selected features.

