In [15]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
import math
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import preprocessing


DERMATOLOGY DATA

In [16]:
filename = 'dermatology_data.txt'
data = np.loadtxt(filename, delimiter=',', skiprows=1, dtype=str)
data= pd.DataFrame(data)
missing_values=np.where(data.iloc[:,:]=='?')
data=data.drop(missing_values[0])
data.astype('int')


features=data.columns[0:34]
X = data.loc[:,features]
y = data.loc[:,34].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=42)

In [17]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    #print('Model Performance')
    print('Improved Accuracy = %',metrics.accuracy_score(test_labels, predictions))

Decision Tree for Dermatology Data

In [18]:
model1 = DecisionTreeClassifier(random_state=0)
model1.fit(X_train, y_train)
y_test_pred=model1.predict(X_test)
#Score the model
print("Accuracy for base model DT:",metrics.accuracy_score(y_test, y_test_pred))

Accuracy for base model DT: 0.9166666666666666


Hyperparameter Tuning for DT 

In [19]:
parameters = {'max_depth':[2,3,8,10], 'min_samples_leaf':[2,3,4,5]}

tuning1 = GridSearchCV(estimator =DecisionTreeClassifier(random_state=10), 
            param_grid = parameters, scoring='accuracy',n_jobs=4, cv=5)
tuning1.fit(X_train,y_train)

print(tuning1.best_params_)
print("Best score of the best model on training data:",tuning1.best_score_)
best_model1 = tuning1.best_estimator_
best_model1_accuracy1 = evaluate(best_model1, X_test,y_test)

{'max_depth': 8, 'min_samples_leaf': 3}
Best score of the best model on training data: 0.96
Improved Accuracy = % 0.9629629629629629


Random Forest for Dermatology Data

In [20]:
model2=RandomForestClassifier(random_state=0)
model2.fit(X_train, y_train)
y_train_pred2=model2.predict(X_train)
y_test_pred2=model2.predict(X_test)

# Score the model
print("Accuracy for base model RF:",metrics.accuracy_score(y_test, y_test_pred2))

Accuracy for base model RF: 0.9814814814814815


Hyperparameter Tuning for RF

In [21]:
a=len(X.columns)
b=round(math.sqrt(len(X.columns)))
c=round((len(X.columns))/3)
parameters = {'max_features': [a,b,c]}

tuning2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=500,min_samples_leaf=5, random_state=0), 
            param_grid = parameters, scoring='accuracy',n_jobs=4, cv=5)
tuning2.fit(X_train,y_train)

print(tuning2.best_params_)
print("Best score of the best model on training data:",tuning2.best_score_)

best_model2 = tuning2.best_estimator_
best_model2_accuracy2 = evaluate(best_model2, X_test,y_test)

{'max_features': 6}
Best score of the best model on training data: 0.9677551020408164
Improved Accuracy = % 0.9722222222222222


Penalized Regression for Dermatology Data

In [22]:
C = np.logspace(-4, 1, 10) 
for c in C:
    model3 = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    model3.fit(X_train, y_train)
    #y_pred=model3.predict(X_test)
    print('C:', c)
    #print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy for RPA:', model3.score(X_train, y_train))
    print('Test accuracy for RPA:', model3.score(X_test, y_test))
    print('')

C: 0.0001
Training accuracy for RPA: 0.3453815261044177
Test accuracy for RPA: 0.23148148148148148

C: 0.00035938136638046257
Training accuracy for RPA: 0.3453815261044177
Test accuracy for RPA: 0.23148148148148148

C: 0.001291549665014884
Training accuracy for RPA: 0.3453815261044177
Test accuracy for RPA: 0.23148148148148148

C: 0.004641588833612782
Training accuracy for RPA: 0.3453815261044177
Test accuracy for RPA: 0.23148148148148148

C: 0.016681005372000592
Training accuracy for RPA: 0.6265060240963856
Test accuracy for RPA: 0.48148148148148145

C: 0.05994842503189409
Training accuracy for RPA: 0.8875502008032129
Test accuracy for RPA: 0.8333333333333334

C: 0.21544346900318845
Training accuracy for RPA: 0.9678714859437751
Test accuracy for RPA: 0.9814814814814815

C: 0.7742636826811278
Training accuracy for RPA: 0.9799196787148594
Test accuracy for RPA: 0.9907407407407407

C: 2.782559402207126
Training accuracy for RPA: 0.9879518072289156
Test accuracy for RPA: 0.990740740740740

Gradient Boosting for Dermatology Data

In [23]:
model4 = GradientBoostingClassifier(random_state=(0))
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
print("Accuracy for base model SGB:",metrics.accuracy_score(y_test, y_pred))

Accuracy for base model SGB: 0.9629629629629629


In [24]:
parameters = {'learning_rate':[0.2,0.15,0.1,0.05,0.01], 'n_estimators':[100,250,500,750]}

tuning3 = GridSearchCV(estimator =GradientBoostingClassifier(max_features='sqrt', random_state=10), 
            param_grid = parameters, scoring='accuracy',n_jobs=4, cv=5)
tuning3.fit(X_train,y_train)

print(tuning3.best_params_)
print("Best score of the best model on training data:",tuning3.best_score_)

{'learning_rate': 0.01, 'n_estimators': 250}
Best score of the best model on training data: 0.9798367346938776


In [25]:
max_depth = {'max_depth':[2,3,4,5] }
tuning4 = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.01,n_estimators=250,max_features='sqrt', random_state=10), 
            param_grid = max_depth, scoring='accuracy',n_jobs=4, cv=5)
tuning4.fit(X_train,y_train)
print(tuning4.best_params_)
print("Best score of the best model on training data:",tuning4.best_score_)

{'max_depth': 3}
Best score of the best model on training data: 0.9798367346938776


In [26]:
best_model = GradientBoostingClassifier(learning_rate=0.01,n_estimators=250,max_features='sqrt',max_depth=3,random_state=10)
best_model.fit(X_train,y_train)
best_accuracy = evaluate(best_model, X_test,y_test)

Improved Accuracy = % 0.9722222222222222


ABSENTEEISM DATA

In [27]:
data=pd.read_csv("Absenteeism_at_work.csv",delimiter=';')


data.iloc[:,1]=data.iloc[:,1].astype('category')
data.iloc[:,3:5]=data.iloc[:,3:5].astype('category')
data.iloc[:,11:13]=data.iloc[:,11:13].astype('category')
data.iloc[:,14:16]=data.iloc[:,14:16].astype('category')
zero_target_values=np.where(data['Absenteeism time in hours']==0)
data.loc[zero_target_values[0],'Absenteeism time in hours']=0.00001

features=data.columns[1:20]
X = data.loc[:,features]
y = data.loc[:,'Absenteeism time in hours'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=42)

In [28]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print('Model Performance')
    print('Improved RMSE Error:',sqrt(mean_squared_error(test_labels,predictions)))

Decision Tree for Absenteeism Data

In [29]:
model1 = DecisionTreeRegressor(random_state=(10))
model1.fit(X_train, y_train)
y_test_pred=model1.predict(X_test)
#y_test_pred=y_test_pred.round()
print('RMSE for base model DT:',sqrt(mean_squared_error(y_test,y_test_pred)))

RMSE for base model DT: 17.467906364322907


Hyperparameters Tuning for DT

In [30]:
parameters = {'max_depth':[2,3,8,10], 'min_samples_leaf':[2,3,4,5]}

tuning1 = GridSearchCV(estimator = DecisionTreeRegressor(random_state = 10), 
            param_grid = parameters,n_jobs=4, cv=5)
tuning1.fit(X_train,y_train)

print(tuning1.best_params_)
print("Best score of the best model on training data:",tuning1.best_score_)
best_model1 = tuning1.best_estimator_
best_model1_accuracy1 = evaluate(best_model1, X_test,y_test)

{'max_depth': 3, 'min_samples_leaf': 2}
Best score of the best model on training data: 0.19692653399256793
Model Performance
Improved RMSE Error: 15.427723367254165


Random Forest for Absenteeism Data

In [31]:
model2 = RandomForestRegressor(random_state=(0))
model2.fit(X_train, y_train)
y_test_pred2=model2.predict(X_test)
#y_test_pred2=y_test_pred2.round()
print('RMSE for base model RF:',sqrt(mean_squared_error(y_test,y_test_pred2))) 

RMSE for base model RF: 15.012515148727063


Hyperparameters Tuning for RF

In [32]:
a=len(X.columns)
b=round(math.sqrt(len(X.columns)))
c=round((len(X.columns))/3)
parameters = {'max_features': [a,b,c]}

tuning2 = GridSearchCV(estimator=RandomForestRegressor(n_estimators=500,min_samples_leaf=5, random_state=0), 
            param_grid = parameters,n_jobs=4, cv=5)
tuning2.fit(X_train,y_train)

print(tuning2.best_params_)
print("Best score of the best model on training data:",tuning2.best_score_)
best_random = tuning2.best_estimator_
random_accuracy2 = evaluate(best_random, X_test,y_test)

{'max_features': 19}
Best score of the best model on training data: 0.22091346815003457
Model Performance
Improved RMSE Error: 13.921751297593506


PRA for Absenteeism Data

In [34]:
alphas = np.logspace(-4, 1, 10)
lassocv = linear_model.LassoCV(alphas=alphas,cv=5, random_state=0, max_iter = 2000)
lassocv.fit(X_train, y_train)
lassocv_score_on_train = lassocv.score(X_train, y_train)
lassocv_score_on_test = lassocv.score(X_test, y_test)
lassocv_alphas = lassocv.alphas_
lassocv_alpha = lassocv.alpha_
best_lasso = linear_model.Lasso(alpha=lassocv_alpha)
best_lasso.fit(X_train, y_train)
y_test_pred3=best_lasso.predict(X_test)
print('RMSE for PRA:',sqrt(mean_squared_error(y_test,y_test_pred3)))
print("best alpha:",lassocv_alpha)

RMSE for PRA: 13.576178783371669
best alpha: 0.05994842503189409


Gradient Boosting for Absenteeism Data

In [35]:
model4 = GradientBoostingRegressor(random_state=(0))
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
print("RMSE for base model SGB:",sqrt(mean_squared_error(y_test,y_test_pred3)))

RMSE for base model SGB: 13.576178783371669


Hyperparameters Tuning for GB

In [36]:
parameters = {'learning_rate':[0.2,0.15,0.1,0.05,0.01], 'n_estimators':[100,250,500,750]}

tuning3= GridSearchCV(estimator = model4, 
            param_grid = parameters,n_jobs=4, cv=5)
tuning3.fit(X_train,y_train)

print(tuning3.best_params_)
print("Best score of the best model on training data:",tuning3.best_score_)

{'learning_rate': 0.01, 'n_estimators': 250}
Best score of the best model on training data: 0.2182308659422282


In [37]:
max_depth = {'max_depth':[2,3,4,5] }
tuning4 = GridSearchCV(estimator =GradientBoostingRegressor(learning_rate=0.01,n_estimators=250, random_state=0), 
            param_grid = max_depth,n_jobs=4, cv=5)
tuning4.fit(X_train,y_train)
print(tuning4.best_params_)
print("Best score of the best model on training data:",tuning4.best_score_)

{'max_depth': 3}
Best score of the best model on training data: 0.2182308659422282


In [38]:
best_model = GradientBoostingRegressor(learning_rate=0.01,n_estimators=250,max_depth=3,random_state=0)
best_model.fit(X_train,y_train)
best_accuracy = evaluate(best_model, X_test,y_test)

Model Performance
Improved RMSE Error: 14.833180992106856


ENERGY DATA

In [49]:
data=pd.read_csv("energydata_complete.csv")

names=data.columns[1:28]
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(data.iloc[:,1:28])
data = pd.DataFrame(d, columns=names)

features=data.columns[1:28]
X = data.loc[:,features]
y = data.loc[:,'Appliances'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=42)

In [50]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print('Model Performance')
    print('Improved RMSE Error:',sqrt(mean_squared_error(test_labels,predictions)))

Decision Tree for Energy Data

In [51]:
model1 = DecisionTreeRegressor(random_state=(0))
model1.fit(X_train, y_train)
y_test_pred=model1.predict(X_test)
print('RMSE for base model DT:',sqrt(mean_squared_error(y_test,y_test_pred)))

RMSE for base model DT: 0.08948136705626476


Hyperparameters Tuning for DT

In [54]:
parameters = {'max_depth':[2,3,8,10], 'min_samples_leaf':[2,3,4,5]}

tuning1 = GridSearchCV(estimator = DecisionTreeRegressor(random_state = 10), 
            param_grid = parameters,n_jobs=4, cv=5)
tuning1.fit(X_train,y_train)

print(tuning1.best_params_)
best_model1 = tuning1.best_estimator_
best_model_accuracy1 = evaluate(best_model1, X_test,y_test)

{'max_depth': 10, 'min_samples_leaf': 5}
Model Performance
Improved RMSE Error: 0.08323374247961722


Random Forest for Energy Data

In [55]:
model2 = RandomForestRegressor(random_state=(0))
model2.fit(X_train, y_train)
y_test_pred2=model2.predict(X_test)
print('RMSE for base model RF:',sqrt(mean_squared_error(y_test,y_test_pred2))) 

RMSE for base model RF: 0.06622667194489702


Hyperparameters Tuning for RF

In [57]:
a=len(X.columns)
b=round(math.sqrt(len(X.columns)))
c=round((len(X.columns))/3)
parameters = {'max_features': [a,b,c]}

tuning2 = GridSearchCV(estimator=RandomForestRegressor(n_estimators=500,min_samples_leaf=5, random_state=0), 
            param_grid = parameters,n_jobs=4, cv=5)
tuning2.fit(X_train,y_train)

print(tuning2.best_params_)

best_model2 = tuning2.best_estimator_
best_model_accuracy2 = evaluate(best_model2, X_test,y_test)

{'max_features': 9}
Model Performance
Improved RMSE Error: 0.06950470010017393


PRA for Energy Data

In [58]:
alphas = np.logspace(-4, 1, 10)
lassocv = linear_model.LassoCV(alphas=alphas,cv=5, random_state=0, max_iter = 2000)
lassocv.fit(X_train, y_train)
lassocv_score_on_train = lassocv.score(X_train, y_train)
lassocv_score_on_test = lassocv.score(X_test, y_test)
lassocv_alphas = lassocv.alphas_
lassocv_alpha = lassocv.alpha_
best_lasso = linear_model.Lasso(alpha=lassocv_alpha)
best_lasso.fit(X_train, y_train)
y_test_pred3=best_lasso.predict(X_test)
print('RMSE for PRA:',sqrt(mean_squared_error(y_test,y_test_pred3))) 
print(lassocv_alpha)

RMSE for PRA: 0.08719947670931776
0.0001


Gradient Boosting for Energy Data

In [59]:
model4 = GradientBoostingRegressor(random_state=(0))
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
print("RMSE for base model SGB:",sqrt(mean_squared_error(y_test,y_test_pred3)))

RMSE for base model SGB: 0.08719947670931776


Hyperparameters Tuning for GB

In [60]:
parameters = {'learning_rate':[0.2,0.15,0.1,0.05,0.01], 'n_estimators':[100,250,500,750]}

tuning3= GridSearchCV(estimator = model4, 
            param_grid = parameters,n_jobs=4, cv=5)
tuning3.fit(X_train,y_train)

print(tuning3.best_params_)


{'learning_rate': 0.15, 'n_estimators': 750}


In [61]:
max_depth = {'max_depth':[2,3,4,5] }
tuning4 = GridSearchCV(estimator =GradientBoostingRegressor(learning_rate = tuning3.best_params_['learning_rate'], n_estimators = tuning3.best_params_['n_estimators'], random_state=0), 
            param_grid = max_depth,n_jobs=4, cv=5)
tuning4.fit(X_train,y_train)
print(tuning4.best_params_)

{'max_depth': 5}


In [62]:
best_model4 = GradientBoostingRegressor(learning_rate = tuning3.best_params_['learning_rate'], n_estimators = tuning3.best_params_['n_estimators'], max_depth = tuning4.best_params_['max_depth'],random_state=0)
best_model4.fit(X_train,y_train)
best_accuracy = evaluate(best_model4, X_test,y_test)

Model Performance
Improved RMSE Error: 0.06658608892560446


ONLINE NEWS POPULARITY 

In [63]:
data=pd.read_csv("OnlineNewsPopularity.csv")
shares_below_1400=np.where(data[' shares']<1400)
shares_above_1400=np.where(data[' shares']>=1400)
data.loc[shares_below_1400[0],' shares']=0
data.loc[shares_above_1400[0],' shares']=1
names=data.columns[1:61]
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(data.iloc[:,1:61])
data = pd.DataFrame(d, columns=names)
data.iloc[:,12:18]=data.iloc[:,12:18].astype('category')
data.iloc[:,30:38]=data.iloc[:,30:38].astype('category')

features=data.columns[:59]
X = data[features]
y = data.loc[:,' shares']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=42)
y_test=y_test.to_numpy()
y_train=y_train.to_numpy()

In [64]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print('Model Performance')
    print('Improved Accuracy = {:0.2f}%.',metrics.accuracy_score(test_labels, predictions))

Decision Tree for Online News Popularity

In [65]:
model1 = DecisionTreeClassifier(random_state=0)
model1.fit(X_train, y_train)
y_test_pred=model1.predict(X_test)
#Score the model
print("Accuracy for base model DT:",metrics.accuracy_score(y_test, y_test_pred))

Accuracy for base model DT: 0.5875231209012948


Hyperparameters Tuning for DT

In [66]:
parameters = {'max_depth':[2,3,8,10], 'min_samples_leaf':[2,3,4,5]}

tuning1 = GridSearchCV(estimator = DecisionTreeClassifier(random_state = 0), 
            param_grid = parameters, scoring='accuracy',n_jobs=4, cv=5)
tuning1.fit(X_train,y_train)

print(tuning1.best_params_)
print("Best score of the best model on training data:",tuning1.best_score_)
best_model1 = tuning1.best_estimator_
best_model1_accuracy1 = evaluate(best_model1, X_test,y_test)

{'max_depth': 8, 'min_samples_leaf': 4}
Best score of the best model on training data: 0.6388468468468467
Model Performance
Improved Accuracy = {:0.2f}%. 0.633176391457878


Random Forest for Online News Popularity

In [67]:
model2=RandomForestClassifier(random_state=0)
model2.fit(X_train, y_train)
y_train_pred2=model2.predict(X_train)
y_test_pred2=model2.predict(X_test)

# Score the model
print("Accuracy for RF:",metrics.accuracy_score(y_test, y_test_pred2))

Accuracy for RF: 0.6657138052799731


Hyperparameters Tuning for RF

In [68]:
a=len(X.columns)
b=round(math.sqrt(len(X.columns)))
c=round((len(X.columns))/3)
parameters = {'max_features': [a,b,c]}

tuning2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=500,min_samples_leaf=5, random_state=0), 
            param_grid = parameters, scoring='accuracy',n_jobs=4, cv=5)
tuning2.fit(X_train,y_train)

print(tuning2.best_params_)
print("Best score of the best model on training data:",tuning2.best_score_)

best_random = tuning2.best_estimator_
random_accuracy2 = evaluate(best_random, X_test,y_test)

{'max_features': 8}
Best score of the best model on training data: 0.6756756756756757
Model Performance
Improved Accuracy = {:0.2f}%. 0.6742895577602153


RPA for Online News Popularity

In [69]:
C = np.logspace(-4, 1, 10) 
for c in C:
    model3 = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    model3.fit(X_train, y_train)
    print('C:', c)
    #print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy for RPA:', model3.score(X_train, y_train))
    print('Test accuracy for RPA:', model3.score(X_test, y_test))
    print('')


C: 0.0001
Training accuracy for RPA: 0.4648288288288288
Test accuracy for RPA: 0.47006894232386076

C: 0.00035938136638046257
Training accuracy for RPA: 0.4648288288288288
Test accuracy for RPA: 0.47006894232386076

C: 0.001291549665014884
Training accuracy for RPA: 0.5351711711711712
Test accuracy for RPA: 0.5299310576761392

C: 0.004641588833612782
Training accuracy for RPA: 0.624072072072072
Test accuracy for RPA: 0.6217420548175551

C: 0.016681005372000592
Training accuracy for RPA: 0.6346666666666667
Test accuracy for RPA: 0.6320834033966706

C: 0.05994842503189409
Training accuracy for RPA: 0.6467027027027027
Test accuracy for RPA: 0.6422565999663696

C: 0.21544346900318845
Training accuracy for RPA: 0.6523603603603604
Test accuracy for RPA: 0.6454514881452833

C: 0.7742636826811278
Training accuracy for RPA: 0.6556036036036036
Test accuracy for RPA: 0.6488986043383218

C: 2.782559402207126
Training accuracy for RPA: 0.6568648648648648
Test accuracy for RPA: 0.6510005044560282

C

Gradient Boosting for Online News Popularity

In [70]:
model4 = GradientBoostingClassifier(random_state=(0))
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
print("Accuracy for base model SGB:",metrics.accuracy_score(y_test, y_pred))

Accuracy for base model SGB: 0.6658819572893896


Hyperparameters Tuning for GB

In [71]:
parameters = {'learning_rate':[0.2,0.15,0.1,0.05,0.01], 'n_estimators':[100,250,500,750]}

tuning3= GridSearchCV(estimator =GradientBoostingClassifier(max_features='sqrt', random_state=10), 
            param_grid = parameters, scoring='accuracy',n_jobs=4, cv=5)
tuning3.fit(X_train,y_train)

print(tuning3.best_params_)
print("Best score of the best model on training data:",tuning3.best_score_)

{'learning_rate': 0.1, 'n_estimators': 250}
Best score of the best model on training data: 0.6780180180180182


In [72]:
max_depth = {'max_depth':[2,3,4,5] }
tuning4 = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.1,n_estimators=250,max_features='sqrt', random_state=10), 
            param_grid = max_depth, scoring='accuracy',n_jobs=4, cv=5)
tuning4.fit(X_train,y_train)
print(tuning4.best_params_)
print("Best score of the best model on training data:",tuning4.best_score_)

{'max_depth': 3}
Best score of the best model on training data: 0.6780180180180182


In [73]:
best_model = GradientBoostingClassifier(learning_rate=0.1,n_estimators=250,max_features='sqrt',max_depth=3,random_state=10)
best_model.fit(X_train,y_train)
best_accuracy = evaluate(best_model, X_test,y_test)


Model Performance
Improved Accuracy = {:0.2f}%. 0.6705061375483437
