In [0]:
import warnings

from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')

# Read raw data from the file
import numpy as np
import time
import pandas              
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import datasets, linear_model

# Google Colab support: `files` is only needed by the optional upload calls
# that are commented out below, so a missing google.colab package (i.e. any
# non-Colab environment) must not stop the notebook from running.
try:
    from google.colab import files
except ImportError:
    files = None
import io

# On Colab, uncomment the upload call to copy train.csv into the session first.
#uploaded = files.upload()
dataset = pandas.read_csv('train.csv')

# Read test dataset (same optional upload step applies on Colab).
#uploaded1 = files.upload()
dataset_test = pandas.read_csv('test.csv')

In [0]:
# Set the test-set ids aside; pop() returns the column and removes it from
# the frame in place, so dataset_test keeps only feature columns.
ID = dataset_test.pop('id')

# Show every row and every column when a frame is displayed (no truncation).
for display_option in ('display.max_rows', 'display.max_columns'):
    pandas.set_option(display_option, None)

# Discard the first column of the training frame by position
# (presumably its id column, mirroring the test set -- TODO confirm).
dataset = dataset.iloc[:, 1:]

# **Combining Test and Train Data for One-Hot Encoding**

In [31]:


# Print the shape of the dataset
print(dataset.shape)

# Number of leading columns that are handled as categorical features below;
# everything from this column index onwards is sliced off as non-categorical.
split = 116

# Considering continuous features
size = 15

# Create a dataframe with only the columns from index `split` onwards
data = dataset.iloc[:, split:]

# Train on log(1 + loss); the model cells undo this with expm1 before scoring.
# NOTE: this overwrites the column, so re-running the cell applies the
# transform a second time -- run it once per kernel session.
dataset["loss"] = np.log1p(dataset["loss"])
corr_list = []

# Storing names of the columns
cols = dataset.columns

# For each categorical column, the union of the values seen in the train and
# the test data, so that encoders fitted on it cover both sets.
labels = []

for i in range(0, split):
    train = dataset[cols[i]].unique()
    test = dataset_test[cols[i]].unique()
    labels.append(list(set(train) | set(test)))
    

(188318, 131)


# Converting Categorical Data to Continuous Data

In [32]:
# Convert categorical data to numerical values: label-encode each categorical
# column, then expand the integer codes into one-hot indicator columns.

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

categories = []
for i in range(0, split):
    # Label encode against the combined train+test label set, so the codes
    # run over 0 .. len(labels[i]) - 1 for this column.
    label_encoder = LabelEncoder()
    label_encoder.fit(labels[i])
    feature = label_encoder.transform(dataset.iloc[:, i])

    # One-hot encode: row `code` of the identity matrix is the indicator
    # vector for that code. This yields the same dense float matrix with
    # len(labels[i]) columns that OneHotEncoder(sparse=False, n_values=...)
    # produced, without depending on keyword arguments that newer
    # scikit-learn releases no longer accept.
    feature = np.eye(len(labels[i]))[feature]
    categories.append(feature)

# Place the per-column indicator matrices side by side in one 2D array
categories_encoded = np.column_stack(categories)

# Concatenate encoded attributes with the remaining (non-categorical) columns
dataset_encoded = np.concatenate((categories_encoded, dataset.iloc[:, split:].values), axis=1)

# Printing shape of final encoded features
print(np.shape(dataset_encoded))

# Free the large intermediates; only dataset_encoded is needed from here on
del categories
del feature
del categories_encoded

# Separate the features from the target: the target is taken from the last
# column (index 1190 in the recorded run, whose encoded shape is (188318, 1191)).
# Assumes "loss" is the final column of `dataset` -- TODO confirm.
training_data = dataset_encoded[:, :-1]
training_labels = dataset_encoded[:, -1]

# Split into a training set and a held-out test set: 90% train, 10% test
training_features, test_features, training_target, test_target = train_test_split(
    training_data, training_labels, test_size=0.1, random_state=0)









(188318, 1191)


# Implementing Linear Regression

In [12]:

from sklearn.linear_model import LinearRegression
total = 0
total1=0
# fit_intercept takes a real boolean; the string 'True' only worked by being
# truthy and is rejected by scikit-learn releases that validate parameters.
regr = LinearRegression(fit_intercept=True, n_jobs=-1)

# Passing features and target value to fit function to perform Linear Regression
regr.fit(training_features, training_target)

# Predicting the output value using test data features
y_pred = regr.predict(test_features)

# Calculating mean absolute error on the original loss scale: expm1 undoes the
# log1p applied to the target. Arguments are given as (y_true, y_pred).
e1 = mean_absolute_error(np.expm1(test_target), np.expm1(y_pred))

print(e1)



1277.676563289273
1277.676563289273


# Implementing Ridge Regression

In [13]:

# fit_intercept takes a real boolean; the string 'True' only worked by being
# truthy and is rejected by scikit-learn releases that validate parameters.
regr1 = linear_model.Ridge(alpha=1.0, fit_intercept=True)

# Passing features and target value to perform Ridge Regression
regr1.fit(training_features, training_target)

# Predicting the output value using test data features and calculating the
# mean absolute error on the original loss scale (expm1 undoes the log1p
# applied to the target).
mean_error1 = mean_absolute_error(np.expm1(test_target), np.expm1(regr1.predict(test_features)))

print(mean_error1)


(18832,)
1267.536993086767


# Implementing Lasso Regression

In [14]:

# fit_intercept takes a real boolean; the string 'True' only worked by being
# truthy and is rejected by scikit-learn releases that validate parameters.
regr2 = linear_model.Lasso(alpha=0.001, fit_intercept=True)

# Passing features and target value to perform Lasso Regression
regr2.fit(training_features, training_target)

# Predicting the output value using test data features and calculating the
# mean absolute error on the original loss scale (expm1 undoes the log1p
# applied to the target).
mean_error2 = mean_absolute_error(np.expm1(test_target), np.expm1(regr2.predict(test_features)))

print(mean_error2)


(18832,)
1262.5314595059162


# Implementing AdaBoost Regressor

In [9]:
from sklearn.ensemble import AdaBoostRegressor

# Change n_estimators to try a different ensemble size. The random_state is
# fixed (same seed as the train/test split) so repeated runs give the same
# score instead of a different one each time.
regr4 = AdaBoostRegressor(n_estimators=10, random_state=0)

# Passing features and target value to perform AdaBoost Regression
regr4.fit(training_features, training_target)

# Predicting the output value using test data features and calculating the
# mean absolute error on the original loss scale (expm1 undoes the log1p
# applied to the target).
result = mean_absolute_error(np.expm1(test_target), np.expm1(regr4.predict(test_features)))

print(result)

 



1567.1384491815927
1641.5385344705896
842.3515577316284


# Implementing KNN Regression

In [10]:
from sklearn.neighbors import KNeighborsRegressor

# Change n_neighbors to try a different neighbourhood size
regr5 = KNeighborsRegressor(n_neighbors=8, n_jobs=-1)

# Passing features and target value to perform KNN Regression
regr5.fit(training_features, training_target)

# Predicting the output value using test data features and calculating the
# mean absolute error on the original loss scale (expm1 undoes the log1p
# applied to the target).
result = mean_absolute_error(np.expm1(test_target), np.expm1(regr5.predict(test_features)))

print(result)


1419.564929748002
5007.80005812645


# Implementing XGBoost Regressor

In [0]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor

regr6 = XGBRegressor(n_estimators=1000)

# Passing features and target value to perform XGBoost Regression
regr6.fit(training_features, training_target)

# Predicting the output value using test data features and calculating the
# mean absolute error on the original loss scale (expm1 undoes the log1p
# applied to the target). The score must come from regr6, the model fitted in
# this cell -- not from the AdaBoost model regr4.
result = mean_absolute_error(np.expm1(test_target), np.expm1(regr6.predict(test_features)))

print(result)

# Plotting a graph that shows the 10 most important features
from xgboost import plot_importance
plot_importance(regr6, max_num_features=10)


      
        
  