# COMP 6321 PROJECT - Team 16
## Regression Problems

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>
<div style="border-bottom: 3px solid black"></div>

# 1. Importing Required Libraries

The code block below will import all the libraries and dependencies for regression problems.

**Run the code cell below** 

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.svm               # For SVC
import sklearn.metrics           # For accuracy_score
import sklearn.model_selection   # For GridSearchCV and RandomizedSearchCV
import scipy
import scipy.stats               # For reciprocal distribution
import warnings
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>
<div style="border-bottom: 3px solid black"></div>

# 2. Loading All Datasets

The code block below will load all the datasets for regression problems.


**Dataset Mapper**
1. Wine Quality -> RP_1
2. Communities and Crime -> RP_2
3. QSAR aquatic toxicity -> RP_3
4. Parkinson Speech -> RP_4
5. Facebook metrics -> RP_5
6. Bike Sharing (use hour data) -> RP_6
7. Student Performance (use just student-por.csv if you do not know how to merge the math grades) -> RP_7
8. Concrete Compressive Strength -> RP_8
9. SGEMM GPU kernel performance -> RP_9
10. Merck Molecular Activity Challenge (from Kaggle) -> RP_10 (loaded below as two separate datasets, RP_101 and RP_102)

In [2]:
# Seed NumPy's global random generator so code drawing from it is repeatable.
np.random.seed(23)

# Four independent, initially empty accumulators.
# NOTE(review): presumably train/test scores for two metrics, one entry per
# model in MList — confirm against the cells that append to them.
M1Tr, M1Te, M2Tr, M2Te = [], [], [], []

# Display names of the regression models.
MList = [
    'SVM with RBF Kernel',
    'SVM with Linear Kernel',
    'Decision Tree Regressor',
    'Random Forest Regressor',
    'AdaBoost Regressor',
    'Gaussian Process Regressor',
    'Linear Regressor',
    'Ridge Regressor',
]

def splitData(data):
    """
    Splits the data into Features (X) and Labels (y).

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the labels; every other column is
        treated as a feature.

    Returns
    -------
    X : pandas.DataFrame
        All columns of ``data`` except the last one.
    y : pandas.Series
        The last column of ``data``.
    """
    # Positional slicing, so this works whether columns are named or numbered.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return X, y

def getTrainTestData(data, test_size=0.3, random_state=23):
    """
    Splits data into Training Set and Testing Set.
    Size Ratio of Train:Test is 70:30 by default.

    Parameters
    ----------
    data : pandas.DataFrame
        Table handed to ``splitData`` to obtain features and labels.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.3.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split is repeatable.
        Defaults to 23.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = splitData(data)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
    

def convertCategorical(df):
    """
    Converts categorical features by encoding.

    Every column whose dtype is ``object`` is replaced by the integer codes
    that ``LabelEncoder.fit_transform`` produces for it. Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. The encoded columns are written back into ``df``
        itself, so the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be re-assigned.
    """
    categorical_cols = df.columns[df.dtypes == object].tolist()
    if not categorical_cols:
        # Nothing to encode: skip the empty column assignment altogether.
        return df
    # Use a fresh encoder per column; the classes fitted on one column are
    # never meaningful for another.
    df[categorical_cols] = df[categorical_cols].apply(
        lambda col: LabelEncoder().fit_transform(col)
    )
    return df

def minMax(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])


In [3]:
# Every dataset below is read from RP_Data/ and split into train/test parts.
# Most of them are then standardized with a StandardScaler that is fitted on
# the training split only and applied to both splits.
# For a quick per-column range check of any frame, run e.g. `RP_1.apply(minMax)`.

# Wine Quality Data | 11 Features | 6497 Samples
# Red and white wines come in two files; the first line of each is skipped
# and the rows are stacked into one frame.
RP_1_red = pd.read_csv("RP_Data/winequality-red.csv", sep=";", header=None, skiprows=1)
RP_1_white = pd.read_csv("RP_Data/winequality-white.csv", sep=";", header=None, skiprows=1)
RP_1 = pd.concat([RP_1_red, RP_1_white])
RP_1_X_train, RP_1_X_test, RP_1_y_train, RP_1_y_test = getTrainTestData(RP_1)
scaler = StandardScaler().fit(RP_1_X_train)
RP_1_X_train = scaler.transform(RP_1_X_train)
RP_1_X_test = scaler.transform(RP_1_X_test)


# Communities and Crime Data | 127 Features | 1994 Samples
RP_2 = pd.read_csv('RP_Data/communities.data', header=None)
RP_2 = RP_2.drop([0, 1, 2, 3, 4], axis=1)  # first 5 columns are non-predictive
RP_2 = RP_2.replace('?', 0)  # '?' entries (presumably missing values) become 0
# NOTE(review): RP_2 is not standardized, unlike most datasets in this cell —
# confirm this is intended.
RP_2_X_train, RP_2_X_test, RP_2_y_train, RP_2_y_test = getTrainTestData(RP_2)


# QSAR Aquatic Toxicity Data | 8 Features | 546 Samples
RP_3 = pd.read_csv("RP_Data/qsar_aquatic_toxicity.csv", sep=";", header=None)
RP_3_X_train, RP_3_X_test, RP_3_y_train, RP_3_y_test = getTrainTestData(RP_3)
scaler = StandardScaler().fit(RP_3_X_train)
RP_3_X_train = scaler.transform(RP_3_X_train)
RP_3_X_test = scaler.transform(RP_3_X_test)


# Parkinson Speech Data | 14 Features | 690 Samples
RP_4 = pd.read_csv("RP_Data/parkinsons_train_data.txt", sep=",", header=None)
RP_4 = RP_4.drop([28], axis=1)  # removed last column. categorical output
RP_4_X_train, RP_4_X_test, RP_4_y_train, RP_4_y_test = getTrainTestData(RP_4)
scaler = StandardScaler().fit(RP_4_X_train)
RP_4_X_train = scaler.transform(RP_4_X_train)
RP_4_X_test = scaler.transform(RP_4_X_test)


# Facebook Data | 18 Features | 500 Samples
RP_5 = pd.read_csv("RP_Data/dataset_Facebook.csv", sep=";")
RP_5 = convertCategorical(RP_5)
RP_5 = RP_5.fillna(RP_5.mean())  # fill remaining gaps with the column mean
RP_5_X_train, RP_5_X_test, RP_5_y_train, RP_5_y_test = getTrainTestData(RP_5)
scaler = StandardScaler().fit(RP_5_X_train)
RP_5_X_train = scaler.transform(RP_5_X_train)
RP_5_X_test = scaler.transform(RP_5_X_test)


# Bike Sharing Hours Data | 16 Features | 17379 Samples
RP_6 = pd.read_csv("RP_Data/hour.csv", sep=",")
RP_6 = RP_6.drop(['instant'], axis=1)  # drop the 'instant' column (treated as non-predictive)
RP_6 = convertCategorical(RP_6)
# NOTE(review): every column except the last is used as a feature. If hour.csv
# still has the UCI `casual` and `registered` columns, they add up to the `cnt`
# target — confirm that keeping them is intended.
RP_6_X_train, RP_6_X_test, RP_6_y_train, RP_6_y_test = getTrainTestData(RP_6)
scaler = StandardScaler().fit(RP_6_X_train)
RP_6_X_train = scaler.transform(RP_6_X_train)
RP_6_X_test = scaler.transform(RP_6_X_test)


# Student Performance Data (student-por.csv) | 32 Features
# The sample count in the original note was garbled; UCI lists 649 rows for
# student-por.csv — TODO confirm.
RP_7 = pd.read_csv("RP_Data/student-por.csv", sep=";")
RP_7 = convertCategorical(RP_7)
# NOTE(review): RP_7 is not standardized, unlike most datasets in this cell —
# confirm this is intended.
RP_7_X_train, RP_7_X_test, RP_7_y_train, RP_7_y_test = getTrainTestData(RP_7)


# Concrete Data | 8 Features | 1030 Samples
RP_8 = pd.read_excel(r'RP_Data/Concrete_Data.xls', skiprows=1, header=None)
RP_8_X_train, RP_8_X_test, RP_8_y_train, RP_8_y_test = getTrainTestData(RP_8)
scaler = StandardScaler().fit(RP_8_X_train)
RP_8_X_train = scaler.transform(RP_8_X_train)
RP_8_X_test = scaler.transform(RP_8_X_test)


# SGEMM GPU kernel performance Data | 14 Features | 241600 Samples
RP_9 = pd.read_csv("RP_Data/sgemm_product.csv", sep=",")
# The target is the average of the four run-time columns, which are the last
# four columns at this point; the individual runs are then dropped.
RP_9['Run (ms)'] = RP_9.iloc[:, -4:].sum(axis=1)/4
RP_9 = RP_9.drop(['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)'], axis=1)
RP_9_X_train, RP_9_X_test, RP_9_y_train, RP_9_y_test = getTrainTestData(RP_9)
scaler = StandardScaler().fit(RP_9_X_train)
RP_9_X_train = scaler.transform(RP_9_X_train)
RP_9_X_test = scaler.transform(RP_9_X_test)


# Merck Molecular Dataset 1 | 5877 Features | 8716 Samples
# np.load on an .npz returns an archive object that keeps the file open; the
# `with` block closes it as soon as both arrays have been read out.
with np.load('RP_Data/File1.npz') as npzfile:
    RP_101_X = npzfile['arr_0']
    RP_101_y = npzfile['arr_1']
RP_101_X_train, RP_101_X_test, RP_101_y_train, RP_101_y_test = sklearn.model_selection.train_test_split(
    RP_101_X, RP_101_y, test_size=0.3, random_state=23
)


# Merck Molecular Dataset 2 | 4306 Features | sample count not recorded — TODO fill in
with np.load('RP_Data/File2.npz') as npzfile:
    RP_102_X = npzfile['arr_0']
    RP_102_y = npzfile['arr_1']
RP_102_X_train, RP_102_X_test, RP_102_y_train, RP_102_y_test = sklearn.model_selection.train_test_split(
    RP_102_X, RP_102_y, test_size=0.3, random_state=23
)

print('Regression Data Loaded Successfully.')

Regression Data Loaded Successfully.
