In [1]:
# From Abhay Deshpande ___ https://github.com/d-abhay
# "1. Simple Linear Regression & 2. Multiple Linear Regression"
# importing libraries and reading data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('C:/Users/ABHAY/Desktop/carbon_nanotubes.csv', sep=';')
df.head()

Unnamed: 0,Chiral indice n,Chiral indice m,Initial atomic coordinate u,Initial atomic coordinate v,Initial atomic coordinate w,Calculated atomic coordinates u',Calculated atomic coordinates v',Calculated atomic coordinates w'
0,2,1,679005,701318,17033,721039,730232,17014
1,2,1,717298,642129,231319,738414,65675,232369
2,2,1,489336,303751,88462,477676,263221,88712
3,2,1,413957,632996,40843,408823,657897,39796
4,2,1,334292,543401,15989,303349,558807,157373


In [3]:
# checking for null values or missing data
df.isna().sum()

Chiral indice n                     0
Chiral indice m                     0
Initial atomic coordinate u         0
Initial atomic coordinate v         0
Initial atomic coordinate w         0
Calculated atomic coordinates u'    0
Calculated atomic coordinates v'    0
Calculated atomic coordinates w'    0
dtype: int64

In [4]:
# creating proper dataframe by replacing , by . in columns
for column in df.columns:
    if df[column].dtype=='object':
        df[column] = [float(string.replace(',', '.')) for string in df[column]]
        
df.head()

Unnamed: 0,Chiral indice n,Chiral indice m,Initial atomic coordinate u,Initial atomic coordinate v,Initial atomic coordinate w,Calculated atomic coordinates u',Calculated atomic coordinates v',Calculated atomic coordinates w'
0,2,1,0.679005,0.701318,0.017033,0.721039,0.730232,0.017014
1,2,1,0.717298,0.642129,0.231319,0.738414,0.65675,0.232369
2,2,1,0.489336,0.303751,0.088462,0.477676,0.263221,0.088712
3,2,1,0.413957,0.632996,0.040843,0.408823,0.657897,0.039796
4,2,1,0.334292,0.543401,0.15989,0.303349,0.558807,0.157373


In [5]:
#1. Simple Linear Regression
#Simple linear regression has only one x and one y variable. 
#It is an approach for predicting a quantitative response using a single feature.
#It establishes the relationship between two variables using a straight line.
#Linear regression attempts to draw a line that comes closest to the data by finding the slope and intercept that define the line and minimize regression errors.
#Formula: Y = β0 + β1X + e
#Y = Dependent variable / Target variable
#β0 = Intercept of the regression line 
#β1 = Slope of the regression lime which tells whether the line is increasing or decreasing
#X = Independent variable / Predictor variable
#e = Error

# Performing Linear regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [6]:
#Setting the value for X and Y
x = df[['Initial atomic coordinate u']]
y = df["Calculated atomic coordinates u'"]

In [7]:
# Performing train(70%) test(30%) split on data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 100)

In [8]:
# fitting linear regression
slr= LinearRegression()  
slr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
#Printing the model coefficients
print('Intercept: ', slr.intercept_)
print('Coefficient:', slr.coef_)

Intercept:  -0.007620208210471491
Coefficient: [1.01532402]


In [10]:
print('Regression Equation: calculated coordinate = -0.007620208210471491 + 1.01532402 * Initial coordinate')

Regression Equation: calculated coordinate = -0.007620208210471491 + 1.01532402 * Initial coordinate


In [11]:
#Prediction of Test and Training set result  
y_pred_slr= slr.predict(x_test)  
x_pred_slr= slr.predict(x_train)  

In [12]:
print("Prediction for test set: {}".format(y_pred_slr))

Prediction for test set: [0.87918715 0.85358169 0.86297445 ... 0.89750765 0.83210353 0.16364262]


In [13]:
#Actual value and the predicted value
slr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_slr})
slr_diff

Unnamed: 0,Actual value,Predicted value
6153,0.879448,0.879187
3330,0.853764,0.853582
1431,0.867942,0.862974
9372,0.908340,0.910221
8552,0.275026,0.273920
...,...,...
3352,0.213956,0.218991
10709,0.609650,0.603929
6471,0.899588,0.897508
6175,0.834665,0.832104


In [14]:
#Predict for any value
slr.predict([[0.213956]])

array([0.20961446])

In [15]:
#Predict for any value
slr.predict([[0.834665]])

array([0.83983522])

In [16]:
# print the R-squared value for the model
from sklearn.metrics import accuracy_score
print('R squared value of the model: {:.2f}'.format(slr.score(x,y)*100))

R squared value of the model: 99.98


In [17]:
# Conclusion: 99.98% of the data fit the regression model

In [18]:
# 0 means the model is perfect. Therefore the value should be as close to 0 as possible
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_slr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_slr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_slr))

print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

Mean Absolute Error: 0.002546929415174603
Mean Square Error: 1.4763829106831505e-05
Root Mean Square Error: 0.0038423728484923876


In [19]:
# 2. Multiple Linear Regression
# Multiple linear regression has one y and two or more x variables.
# It is an extension of Simple Linear regression as it takes more than one predictor variable to predict the response variable.
# Multiple Linear Regression is one of the important regression algorithms which models the linear relationship between a single dependent continuous variable and more than one independent variable.
# Formula: Y = β0 + β1X1 + β2X2 + β3X3 + ... + βnXn + e
#Y = Dependent variable / Target variable
#β0 = Intercept of the regression line 
#β1, β2,..βn = Slope of the regression lime which tells whether the line is increasing or decreasing
#X1, X2,..Xn = Independent variables / Predictor variables
#e = Error

In [20]:
# splitting dataset
x = df[[ 'Chiral indice n', 'Chiral indice m', 'Initial atomic coordinate u', 'Initial atomic coordinate v', 'Initial atomic coordinate w']]
y = df["Calculated atomic coordinates u'"]

In [21]:
#Splitting the dataset
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.3, random_state=100)  

In [22]:
#Fitting the Multiple Linear Regression model
mlr = LinearRegression()  
mlr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
#Intercept and Coefficient
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
list(zip(x, mlr.coef_))

Intercept:  -0.0083583249320629
Coefficients:


[('Chiral indice n', -1.7672908677824462e-05),
 ('Chiral indice m', -3.272386056223359e-05),
 ('Initial atomic coordinate u', 1.0140464514822374),
 ('Initial atomic coordinate v', 0.002573201665501266),
 ('Initial atomic coordinate w', 0.0006925513631312467)]

In [24]:
#Prediction of test set
#Predicting the Test and Train set result 
y_pred_mlr= mlr.predict(x_test)  
x_pred_mlr= mlr.predict(x_train)  

In [25]:
print("Prediction for test set: {}".format(y_pred_mlr))

Prediction for test set: [0.8789083  0.85408367 0.86310268 ... 0.89713564 0.83285662 0.16364472]


In [26]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff

Unnamed: 0,Actual value,Predicted value
6153,0.879448,0.878908
3330,0.853764,0.854084
1431,0.867942,0.863103
9372,0.908340,0.910553
8552,0.275026,0.275007
...,...,...
3352,0.213956,0.219556
10709,0.609650,0.603094
6471,0.899588,0.897136
6175,0.834665,0.832857


In [27]:
# print the R-squared value for the model
print('R squared value of the model: {:.2f}'.format(mlr.score(x,y)*100))

R squared value of the model: 99.98


In [28]:
# Conclusion: 99.98% of the data fit the multiple regression model

In [29]:
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(x,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 99.98
Mean Absolute Error: 0.0025519380929540252
Mean Square Error: 1.4525684644319132e-05
Root Mean Square Error: 0.0038112576197784284


In [30]:
# final conclusion :
#"looking at RMSE, RMSE(MLR) < RMSE(SLR) hence multiple regression performs better than Simple linear regression for the given data"

In [31]:
#### 3. Support Vector Regression  ####

In [32]:
#1 Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [33]:
#2 Preparing the dataset
x = df[[ 'Chiral indice n', 'Chiral indice m', 'Initial atomic coordinate u', 'Initial atomic coordinate v', 'Initial atomic coordinate w']]
y = df["Calculated atomic coordinates u'"]

In [34]:
#Splitting the dataset
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, random_state=32)  

In [35]:
# By rbf kernel

In [36]:
#4 Fitting the Support Vector Regression Model to the dataset
# Create your support vector regressor here
from sklearn.svm import SVR
# most important SVR parameter is Kernel type. It can be #linear,polynomial or gaussian SVR. We have a non-linear condition #so we can select polynomial or gaussian but here we select RBF(a #gaussian type) kernel.
regressor = SVR(kernel='rbf')
regressor.fit(x_train,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [37]:
#Prediction of test set
#Predicting the Test and Train set result 
y_pred_svr= regressor.predict(x_test)  
x_pred_svr= regressor.predict(x_train)  

In [38]:
print("Prediction for test set: {}".format(y_pred_svr))

Prediction for test set: [0.2338916  0.23748837 0.13010587 ... 0.79573453 0.45944467 0.18173325]


In [39]:
#Actual value and the predicted value
svr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_svr})
svr_diff

Unnamed: 0,Actual value,Predicted value
9316,0.217121,0.233892
1377,0.159996,0.237488
5593,0.076168,0.130106
10026,0.372940,0.438357
7733,0.807135,0.772334
...,...,...
8437,0.069784,0.120831
5261,0.673356,0.695705
6175,0.834665,0.795735
10674,0.508235,0.459445


In [40]:
# print the R-squared value for the model
print('R squared value of the model: {:.2f}'.format(regressor.score(x,y)*100))

R squared value of the model: 96.03


In [41]:
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_svr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_svr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_svr))
print('R squared: {:.2f}'.format(regressor.score(x,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 96.03
Mean Absolute Error: 0.05143861486111229
Mean Square Error: 0.003318413972179569
Root Mean Square Error: 0.057605676562119894


In [42]:
# By linear kernel

In [43]:
#4 Fitting the Support Vector Regression Model to the dataset
# Create your support vector regressor here
from sklearn.svm import SVR
# most important SVR parameter is Kernel type. It can be #linear,polynomial or gaussian SVR. We have a non-linear condition #so we can select polynomial or gaussian but here we select RBF(a #gaussian type) kernel.
regressor = SVR(kernel='linear')
regressor.fit(x_train,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [44]:
#Prediction of test set
#Predicting the Test and Train set result 
y_pred_svr= regressor.predict(x_test)  
x_pred_svr= regressor.predict(x_train) 

In [45]:
print("Prediction for test set: {}".format(y_pred_svr))

Prediction for test set: [0.2516303  0.24165858 0.15953374 ... 0.78134215 0.46728727 0.20757687]


In [46]:
#Actual value and the predicted value
svr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_svr})
svr_diff

Unnamed: 0,Actual value,Predicted value
9316,0.217121,0.251630
1377,0.159996,0.241659
5593,0.076168,0.159534
10026,0.372940,0.428212
7733,0.807135,0.763524
...,...,...
8437,0.069784,0.163610
5261,0.673356,0.666961
6175,0.834665,0.781342
10674,0.508235,0.467287


In [47]:
# print the R-squared value for the model
print('R squared value of the model: {:.2f}'.format(regressor.score(x,y)*100))

R squared value of the model: 95.34


In [48]:
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_svr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_svr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_svr))
print('R squared: {:.2f}'.format(regressor.score(x,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 95.34
Mean Absolute Error: 0.05652009569691158
Mean Square Error: 0.003936467948246942
Root Mean Square Error: 0.06274127786590693


In [49]:
# By polynomial kernel

In [50]:
#4 Fitting the Support Vector Regression Model to the dataset
# Create your support vector regressor here
from sklearn.svm import SVR
# most important SVR parameter is Kernel type. It can be #linear,polynomial or gaussian SVR. We have a non-linear condition #so we can select polynomial or gaussian but here we select RBF(a #gaussian type) kernel.
regressor = SVR(kernel='poly')
regressor.fit(x_train,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [51]:
#Prediction of test set
#Predicting the Test and Train set result 
y_pred_svr= regressor.predict(x_test)  
x_pred_svr= regressor.predict(x_train) 

In [52]:
print("Prediction for test set: {}".format(y_pred_svr))

Prediction for test set: [0.22499145 0.32224967 0.12160895 ... 0.82824897 0.43634407 0.30857633]


In [53]:
#Actual value and the predicted value
svr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_svr})
svr_diff

Unnamed: 0,Actual value,Predicted value
9316,0.217121,0.224991
1377,0.159996,0.322250
5593,0.076168,0.121609
10026,0.372940,0.271908
7733,0.807135,0.819555
...,...,...
8437,0.069784,0.097871
5261,0.673356,0.680181
6175,0.834665,0.828249
10674,0.508235,0.436344


In [54]:
# print the R-squared value for the model
print('R squared value of the model: {:.2f}'.format(regressor.score(x,y)*100))

R squared value of the model: 90.70


In [55]:
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_svr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_svr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_svr))
print('R squared: {:.2f}'.format(regressor.score(x,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 90.70
Mean Absolute Error: 0.06868089495889608
Mean Square Error: 0.007748037028886417
Root Mean Square Error: 0.08802293467549475


In [56]:
# By polynomial degree 2 kernel

In [57]:
#4 Fitting the Support Vector Regression Model to the dataset
# Create your support vector regressor here
from sklearn.svm import SVR
# most important SVR parameter is Kernel type. It can be #linear,polynomial or gaussian SVR. We have a non-linear condition #so we can select polynomial or gaussian but here we select RBF(a #gaussian type) kernel.
regressor = SVR(C=1.0, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
regressor.fit(x_train,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [58]:
#Prediction of test set
#Predicting the Test and Train set result 
y_pred_svr= regressor.predict(x_test)  
x_pred_svr= regressor.predict(x_train) 

In [59]:
print("Prediction for test set: {}".format(y_pred_svr))

Prediction for test set: [0.20473557 0.2785188  0.0938074  ... 0.80301878 0.47071073 0.25269657]


In [60]:
#Actual value and the predicted value
svr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_svr})
svr_diff

Unnamed: 0,Actual value,Predicted value
9316,0.217121,0.204736
1377,0.159996,0.278519
5593,0.076168,0.093807
10026,0.372940,0.315936
7733,0.807135,0.786592
...,...,...
8437,0.069784,0.054440
5261,0.673356,0.699206
6175,0.834665,0.803019
10674,0.508235,0.470711


In [61]:
# print the R-squared value for the model
print('R squared value of the model: {:.2f}'.format(regressor.score(x,y)*100))

R squared value of the model: 96.02


In [62]:
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_svr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_svr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_svr))
print('R squared: {:.2f}'.format(regressor.score(x,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 96.02
Mean Absolute Error: 0.043883136685631166
Mean Square Error: 0.003287629799209227
Root Mean Square Error: 0.057337856597619925


In [63]:
# Final Conclusion
# Technique                 |  R^2  |   RMSE
# SVM(rbf)                  | 96.03 | 0.05760
# SVM(linear)               | 95.34 | 0.06274
# SVM(2rd order polynomial) | 96.02 | 0.05733
# SVM(3rd order polynomial) | 90.70 | 0.08802
# From R-square and RMSE we can conclude that SVM with rbf kernel and SVM with 2rd order polynomial kernel will perform better. 