In [1]:
import numpy as np   
from sklearn.linear_model import LogisticRegression
import pandas as pd    
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

In [2]:
df = pd.read_excel("ECAP Dataset.xlsx")  
df.shape

(154, 12)

In [3]:
df.dtypes

Corner Angle            int64
Temperature             int64
Speed                 float64
Friction Factor       float64
Temp of die             int64
Material               object
Initial Density       float64
Back Pressure           int64
Max Load              float64
Final Volume          float64
Final Density         float64
Density Percentage    float64
dtype: object

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Corner Angle        154 non-null    int64  
 1   Temperature         154 non-null    int64  
 2   Speed               154 non-null    float64
 3   Friction Factor     154 non-null    float64
 4   Temp of die         154 non-null    int64  
 5   Material            154 non-null    object 
 6   Initial Density     154 non-null    float64
 7   Back Pressure       154 non-null    int64  
 8   Max Load            154 non-null    float64
 9   Final Volume        154 non-null    float64
 10  Final Density       154 non-null    float64
 11  Density Percentage  154 non-null    float64
dtypes: float64(7), int64(4), object(1)
memory usage: 14.6+ KB


In [5]:
df.head(15)

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Material,Initial Density,Back Pressure,Max Load,Final Volume,Final Density,Density Percentage
0,10,300,1.0,0.05,20,TI-10V-2Fe-3Al,0.5,200,22381.2,3217.26,0.783275,78.327521
1,30,150,0.5,0.05,200,TI-10V-2Fe-3Al,0.8,200,109092.6,3698.1003,1.09029,109.028952
2,40,300,1.0,0.05,20,TI-10V-2Fe-3Al,0.5,200,24382.8,3181.59,0.792057,79.20568
3,0,300,1.0,0.01,20,TI-10V-2Fe-3Al,0.5,200,31172.4,2819.67,0.893722,89.372161
4,0,300,1.0,0.1,20,TI-10V-2Fe-3Al,0.5,200,90846.0,2304.63,1.093451,109.345101
5,60,100,2.0,0.05,200,TI-13C11Cr3Al,0.8,200,85752.84,3731.0646,1.080657,108.065671
6,0,300,2.0,0.05,20,TI-10V-2Fe-3Al,0.5,200,89893.2,2322.03,1.085257,108.52573
7,20,300,1.0,0.07,20,TI-10V-2Fe-3Al,0.5,200,44000.4,2775.3,0.90801,90.800994
8,80,250,1.0,0.05,50,TI-13C11Cr3Al,0.5,200,23928.0,3188.55,0.790328,79.032789
9,0,200,1.0,0.05,20,TI-10V-2Fe-3Al,0.5,200,101228.4,2335.95,1.07879,107.879021


In [6]:
df.describe(include='all')

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Material,Initial Density,Back Pressure,Max Load,Final Volume,Final Density,Density Percentage
count,154.0,154.0,154.0,154.0,154.0,154,154.0,154.0,154.0,154.0,154.0,154.0
unique,,,,,,2,,,,,,
top,,,,,,TI-10V-2Fe-3Al,,,,,,
freq,,,,,,93,,,,,,
mean,39.350649,293.116883,1.122078,0.043247,147.532468,,0.620779,96.103896,56389.804442,3398.704736,0.920729,92.072928
std,30.312648,152.527805,0.638447,0.017818,166.140588,,0.134638,69.67067,32898.220035,535.361519,0.122979,12.297873
min,0.0,30.0,0.2,0.01,20.0,,0.5,0.0,9885.62,2304.63,0.67651,67.651007
25%,10.0,200.0,0.5,0.03,20.0,,0.5,12.5,28823.40175,3086.47125,0.806426,80.642633
50%,40.0,300.0,1.0,0.05,60.0,,0.6,100.0,49896.545,3404.6095,0.927336,92.733597
75%,70.0,400.0,1.0,0.05,200.0,,0.7,150.0,83671.55625,3832.674,1.0335,103.350016


In [7]:
mat=pd.get_dummies(df['Material'])
mat.head()

Unnamed: 0,TI-10V-2Fe-3Al,TI-13C11Cr3Al
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [8]:
df=pd.concat([df,mat],axis=1)
df.head()

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Material,Initial Density,Back Pressure,Max Load,Final Volume,Final Density,Density Percentage,TI-10V-2Fe-3Al,TI-13C11Cr3Al
0,10,300,1.0,0.05,20,TI-10V-2Fe-3Al,0.5,200,22381.2,3217.26,0.783275,78.327521,1,0
1,30,150,0.5,0.05,200,TI-10V-2Fe-3Al,0.8,200,109092.6,3698.1003,1.09029,109.028952,1,0
2,40,300,1.0,0.05,20,TI-10V-2Fe-3Al,0.5,200,24382.8,3181.59,0.792057,79.20568,1,0
3,0,300,1.0,0.01,20,TI-10V-2Fe-3Al,0.5,200,31172.4,2819.67,0.893722,89.372161,1,0
4,0,300,1.0,0.1,20,TI-10V-2Fe-3Al,0.5,200,90846.0,2304.63,1.093451,109.345101,1,0


In [9]:
df.drop(["Material","Final Density","Max Load","Final Volume"],axis=1,inplace=True)
df.head()

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Initial Density,Back Pressure,Density Percentage,TI-10V-2Fe-3Al,TI-13C11Cr3Al
0,10,300,1.0,0.05,20,0.5,200,78.327521,1,0
1,30,150,0.5,0.05,200,0.8,200,109.028952,1,0
2,40,300,1.0,0.05,20,0.5,200,79.20568,1,0
3,0,300,1.0,0.01,20,0.5,200,89.372161,1,0
4,0,300,1.0,0.1,20,0.5,200,109.345101,1,0


In [10]:
corr=df.corr()
corr

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Initial Density,Back Pressure,Density Percentage,TI-10V-2Fe-3Al,TI-13C11Cr3Al
Corner Angle,1.0,0.054865,0.093282,-0.039634,-0.257675,-0.007883,0.004984,-0.521151,-0.215147,0.215147
Temperature,0.054865,1.0,-0.039237,-0.081185,-0.05832,-0.227234,-0.163375,-0.198703,0.327496,-0.327496
Speed,0.093282,-0.039237,1.0,0.016639,0.1659,0.112483,-0.097971,0.000893,-0.078305,0.078305
Friction Factor,-0.039634,-0.081185,0.016639,1.0,0.009347,-0.140006,0.120819,0.005227,-0.016407,0.016407
Temp of die,-0.257675,-0.05832,0.1659,0.009347,1.0,0.367543,-0.010153,0.345633,-0.222123,0.222123
Initial Density,-0.007883,-0.227234,0.112483,-0.140006,0.367543,1.0,0.008687,0.613716,-0.141718,0.141718
Back Pressure,0.004984,-0.163375,-0.097971,0.120819,-0.010153,0.008687,1.0,0.322873,-0.035879,0.035879
Density Percentage,-0.521151,-0.198703,0.000893,0.005227,0.345633,0.613716,0.322873,1.0,-0.040466,0.040466
TI-10V-2Fe-3Al,-0.215147,0.327496,-0.078305,-0.016407,-0.222123,-0.141718,-0.035879,-0.040466,1.0,-1.0
TI-13C11Cr3Al,0.215147,-0.327496,0.078305,0.016407,0.222123,0.141718,0.035879,0.040466,-1.0,1.0


In [11]:
X=df.drop("Density Percentage",axis=1)
Y=df["Density Percentage"]

In [12]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.30,random_state=25)

## Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
reg_model = LinearRegression()
reg_model.fit(X_train, Y_train)

LinearRegression()

In [15]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, reg_model.coef_[0]))

The coefficient for Corner Angle is -0.1961029169349726
The coefficient for Temperature is -0.1961029169349726
The coefficient for Speed  is -0.1961029169349726
The coefficient for Friction Factor is -0.1961029169349726
The coefficient for Temp of die is -0.1961029169349726
The coefficient for Initial Density is -0.1961029169349726
The coefficient for Back Pressure is -0.1961029169349726
The coefficient for TI-10V-2Fe-3Al is -0.1961029169349726
The coefficient for TI-13C11Cr3Al is -0.1961029169349726


In [16]:
intercept = reg_model.intercept_
print("The intercept for our model is {}".format(intercept))

The intercept for our model is 57.91325196714428


In [133]:
reg_model.score(X_train, Y_train) 


0.7624741348248505

In [134]:
reg_model.score(X_test, Y_test) 


0.7053475373677629

In [135]:
lr_tr=reg_model.score(X_train, Y_train) 
lr_ts=reg_model.score(X_test, Y_test)

In [None]:
#test scores are low, seems to be a bit over fitting

In [19]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
X_test2 = poly.fit_transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, Y_train)

Y_pred = poly_clf.predict(X_test2)


print(poly_clf.score(X_train2, Y_train))

0.8886174982824112


In [20]:
print(poly_clf.score(X_test2, Y_test)) 

0.5360542528380752


In [21]:
print(X_train.shape)
print(X_train2.shape)

(107, 9)
(107, 46)


In [None]:
# polynomial regression is very overfitting
# even by increasing the variables from 9 to 46, test score has decresed

#### From this we can conclude Linear regression is not a suitable model

## Gausian NB

In [22]:
from sklearn.naive_bayes import GaussianNB

In [23]:
df['Density Percentage'] = df['Density Percentage'].astype(int)

In [24]:
df.head()

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Initial Density,Back Pressure,Density Percentage,TI-10V-2Fe-3Al,TI-13C11Cr3Al
0,10,300,1.0,0.05,20,0.5,200,78,1,0
1,30,150,0.5,0.05,200,0.8,200,109,1,0
2,40,300,1.0,0.05,20,0.5,200,79,1,0
3,0,300,1.0,0.01,20,0.5,200,89,1,0
4,0,300,1.0,0.1,20,0.5,200,109,1,0


In [25]:
X=df.drop("Density Percentage",axis=1)
Y=df["Density Percentage"]

In [26]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.30,random_state=25)

In [27]:
NB_model = GaussianNB()

NB_model.fit(X_train, Y_train.ravel())

GaussianNB()

In [28]:
NB_model.score(X_train, Y_train) 

0.5233644859813084

In [29]:
NB_model.score(X_test, Y_test)

0.1276595744680851

In [30]:
# Very poor test score

## KNN

In [31]:
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import zscore

In [32]:
XScaled = X.apply(zscore) #SCALING
XScaled.describe()

Unnamed: 0,Corner Angle,Temperature,Speed,Friction Factor,Temp of die,Initial Density,Back Pressure,TI-10V-2Fe-3Al,TI-13C11Cr3Al
count,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0
mean,7.137148000000001e-17,1.441848e-18,-2.162772e-16,-7.137148000000001e-17,-9.372013000000001e-17,1.845566e-16,3.820897e-16,4.037175e-17,-1.066968e-16
std,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263
min,-1.302395,-1.73067,-1.448965,-1.871957,-0.7701222,-0.8999906,-1.383903,-1.234743,-0.8098852
25%,-0.9714232,-0.612483,-0.9775408,-0.745858,-0.7701222,-0.8999906,-1.203902,-1.234743,-0.8098852
50%,0.02149166,0.0452742,-0.1918347,0.3802413,-0.5285768,-0.1548371,0.05610418,0.8098852,-0.8098852
75%,1.014407,0.7030314,-0.1918347,0.3802413,0.3168324,0.5903164,0.7761078,0.8098852,1.234743
max,1.67635,2.018546,2.95099,3.19549,2.430355,2.080624,1.496111,0.8098852,1.234743


In [33]:
NNH = KNeighborsRegressor(n_neighbors= 13 , weights = 'distance' ,metric='euclidean')

In [34]:
NNH.fit(X_train,Y_train)

KNeighborsRegressor(metric='euclidean', n_neighbors=13, weights='distance')

In [35]:
predicted_labels = NNH.predict(X_test)
NNH.score(X_test,Y_test)

0.4933331977047164

In [40]:
# Trying with different n_neighbors

NNH = KNeighborsRegressor(n_neighbors= 25 , weights = 'distance' ,metric='euclidean') 
NNH.fit(X_train,Y_train)
predicted_labels = NNH.predict(X_test)
NNH.score(X_test,Y_test)

# no much significant change

0.4757179994186319

In [41]:
# Trying with weights = 'uniform'

NNH = KNeighborsRegressor(n_neighbors= 25 , weights = 'uniform' ,metric='euclidean') 
NNH.fit(X_train,Y_train)
predicted_labels = NNH.predict(X_test)
NNH.score(X_test,Y_test)

# score drops

0.2538889383618539

In [43]:
# Trying with weights = 'uniform'

NNH = KNeighborsRegressor(n_neighbors= 25 , weights = 'distance') 
NNH.fit(X_train,Y_train)
predicted_labels = NNH.predict(X_test)
NNH.score(X_test,Y_test)

# score drops

0.4757179994186319

In [None]:
# score doesnt improve much
# Test score is always less than 0.50 

## SVM

In [44]:
from sklearn import svm

In [45]:
from sklearn.svm import SVR

In [52]:
reg = svm.SVR(kernel='rbf',gamma='auto', C=1)
reg.fit(X_train,Y_train)

SVR(C=1, gamma='auto')

In [53]:
predicted_labels = reg.predict(X_test)
reg.score(X_test,Y_test)

0.05013861822979271

In [54]:
reg = svm.SVR(kernel='linear',gamma='auto', C=1)
reg.fit(X_train,Y_train)


SVR(C=1, gamma='auto', kernel='linear')

In [55]:
predicted_labels = reg.predict(X_test)
reg.score(X_test,Y_test)

0.5623676420447816

In [56]:
reg.score(X_train, Y_train) 

0.46375110671550235

In [57]:
reg.score(X_test, Y_test)

0.5623676420447816

In [None]:
# The Performance of SVM is poor with RFG and its decent with Linear kernal. it takes a lot of time for poly.

## Decision Tree

In [139]:
from sklearn.tree import DecisionTreeRegressor

In [140]:
dTree= DecisionTreeRegressor(criterion='mse',splitter='best',random_state=25,max_depth=5)

In [141]:
dTree.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=5, random_state=25)

In [142]:
#The decision Tree algorithm is always over fitting, almost with all max_depth. 

print(dTree.score(X_train,Y_train)) 
print(dTree.score(X_test,Y_test))

dTree_tr=dTree.score(X_train,Y_train)
dTree_ts=dTree.score(X_test,Y_test)

0.9263885212113262
0.7526922843345308


In [143]:
# Trying criterion='mae'

dTree= DecisionTreeRegressor(criterion='mae',splitter='best',random_state=25,max_depth=5)

# MSE is better than MAE

In [144]:
dTree.fit(X_train,Y_train)

DecisionTreeRegressor(criterion='mae', max_depth=5, random_state=25)

In [145]:
print(dTree.score(X_train,Y_train)) 
print(dTree.score(X_test,Y_test))

0.9167759518913043
0.6885342249543114


In [146]:
# Trying differnt max_depth

dTree= DecisionTreeRegressor(criterion='mse',splitter='best',random_state=25,max_depth=3)
dTree.fit(X_train,Y_train)
print(dTree.score(X_train,Y_train)) 
print(dTree.score(X_test,Y_test))

# Over fitting for all values

0.8134029686962362
0.5150915869296614


## Bagging

In [151]:
from sklearn.ensemble import BaggingRegressor

In [152]:
bgr= BaggingRegressor (n_estimators=9,base_estimator=dTree,random_state=25)
bgr=bgr.fit(X_train,Y_train)
print(bgr.score(X_train,Y_train))
print(bgr.score(X_test,Y_test))

0.8501319708628148
0.7017869363127922


In [153]:
#Trying different n_estimators

bgr= BaggingRegressor (n_estimators=20,base_estimator=dTree,random_state=25)
bgr=bgr.fit(X_train,Y_train)
print(bgr.score(X_train,Y_train))
print(bgr.score(X_test,Y_test))

bgr_tr=bgr.score(X_train,Y_train)
bgr_ts=bgr.score(X_test,Y_test)

# There is no much improvement

0.8508541474996767
0.7089845027088151


## AdaBoost

In [154]:
from sklearn.ensemble import AdaBoostRegressor

In [155]:
adr= AdaBoostRegressor (n_estimators=10,random_state=25)
adr=adr.fit(X_train,Y_train)
print(adr.score(X_train,Y_train))
print(adr.score(X_test,Y_test))

0.8977258446711428
0.7600364641496202


In [156]:
#Trying different n_estimators

adr= AdaBoostRegressor (n_estimators=4,random_state=25)
adr=adr.fit(X_train,Y_train)
print(adr.score(X_train,Y_train))
print(adr.score(X_test,Y_test))

adr_tr=adr.score(X_train,Y_train)
adr_ts=adr.score(X_test,Y_test)

# Still the model is overfitting

0.8444243837443042
0.7535404222650581


## GradientBoosting

In [157]:
from sklearn.ensemble import GradientBoostingRegressor

In [158]:
gbr= GradientBoostingRegressor (n_estimators=10,random_state=25)
gbr=gbr.fit(X_train,Y_train)
print(gbr.score(X_train,Y_train))
print(gbr.score(X_test,Y_test))

0.7441795930509159
0.6005801036090233


In [159]:
#Trying different n_estimators

gbr= GradientBoostingRegressor (n_estimators=17,random_state=25)
gbr=gbr.fit(X_train,Y_train)
print(gbr.score(X_train,Y_train))
print(gbr.score(X_test,Y_test))

gbr_tr=gbr.score(X_train,Y_train)
gbr_ts=gbr.score(X_test,Y_test)

# Test score improves with n_estimators but still its over fitting

0.8688577770397937
0.7246411427760943


## RandomForest

In [117]:
from sklearn.ensemble import RandomForestRegressor

In [160]:
rfr= RandomForestRegressor (n_estimators=10,random_state=25,max_features=5)
rfr=rfr.fit(X_train,Y_train)
print(rfr.score(X_train,Y_train))
print(rfr.score(X_test,Y_test))

rfr_tr=rfr.score(X_train,Y_train)
rfr_ts=rfr.score(X_test,Y_test)

0.9576953713807893
0.7380775664023801


In [161]:
#Trying different n_estimators

rfr= RandomForestRegressor (n_estimators=12,random_state=25,max_features=5)
rfr=rfr.fit(X_train,Y_train)
print(rfr.score(X_train,Y_train))
print(rfr.score(X_test,Y_test))

# no significant change

0.95609924218454
0.7291885427958499


In [162]:
#Trying different max_features

rfr= RandomForestRegressor (n_estimators=10,random_state=25,max_features=4)
rfr=rfr.fit(X_train,Y_train)
print(rfr.score(X_train,Y_train))
print(rfr.score(X_test,Y_test))

# no significant change

0.95358760468418
0.7175618446524278


In [165]:
score_res = pd.DataFrame({'Model':['Linear Regression','Bagging','AdaBoost','GradientBoosting','Decision Tree','RandomForest'],
                          'Train Score':[lr_tr,bgr_tr, adr_tr, gbr_tr,dTree_tr,rfr_tr],
                         'Test Score':[lr_ts,bgr_ts, adr_ts, gbr_ts,dTree_ts,rfr_ts]
                         })
score_res

Unnamed: 0,Model,Train Score,Test Score
0,Linear Regression,0.762474,0.705348
1,Bagging,0.850854,0.708985
2,AdaBoost,0.844424,0.75354
3,GradientBoosting,0.868858,0.724641
4,Decision Tree,0.926389,0.752692
5,RandomForest,0.957695,0.738078


In [None]:
# Linear regression is fits the best and all others are overfitting