In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_csv('ai4i2020.csv')

In [3]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
# prof_rep = ProfileReport(df)

In [5]:
# prof_rep.to_file('Aircraft.html')

In [6]:
y = df[['Air temperature [K]']]

In [7]:
y

Unnamed: 0,Air temperature [K]
0,298.1
1,298.2
2,298.1
3,298.2
4,298.2
...,...
9995,298.8
9996,298.9
9997,299.0
9998,299.0


In [8]:
X = df[['Process temperature [K]' , 'Rotational speed [rpm]', 'Torque [Nm]','Tool wear [min]','Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']]

In [9]:
X

Unnamed: 0,Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,308.6,1551,42.8,0,0,0,0,0,0,0
1,308.7,1408,46.3,3,0,0,0,0,0,0
2,308.5,1498,49.4,5,0,0,0,0,0,0
3,308.6,1433,39.5,7,0,0,0,0,0,0
4,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,308.4,1604,29.5,14,0,0,0,0,0,0
9996,308.4,1632,31.8,17,0,0,0,0,0,0
9997,308.6,1645,33.4,22,0,0,0,0,0,0
9998,308.7,1408,48.5,25,0,0,0,0,0,0


In [10]:
scaler = StandardScaler()

In [11]:
arr1 = scaler.fit_transform(X)

In [12]:
arr1

array([[-0.94735989,  0.06818514,  0.28219976, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.72947151,  0.63330802, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-1.01476077, -0.22744984,  0.94428963, ..., -0.09793424,
        -0.09948362, -0.04363046],
       ...,
       [-0.94735989,  0.59251888, -0.66077672, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.72947151,  0.85400464, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.2162938 ,  0.02137647, ..., -0.09793424,
        -0.09948362, -0.04363046]])

In [13]:
df1 = pd.DataFrame(arr1)

In [14]:
# df1.profile_report()

In [15]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [16]:
arr1.shape

(10000, 10)

In [17]:
vif_df = pd.DataFrame()

In [18]:
vif_df['vif'] = [variance_inflation_factor(arr1, i) for i in range(arr1.shape[1])]

In [19]:
vif_df['features'] = X.columns

In [20]:
vif_df

Unnamed: 0,vif,features
0,1.004799,Process temperature [K]
1,5.171592,Rotational speed [rpm]
2,5.236156,Torque [Nm]
3,1.039958,Tool wear [min]
4,11.829612,Machine failure
5,2.433058,TWF
6,4.597022,HDF
7,3.623946,PWF
8,3.3476,OSF
9,1.002015,RNF


In [21]:
X.drop(columns='Machine failure', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [22]:
X

Unnamed: 0,Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,308.6,1551,42.8,0,0,0,0,0,0
1,308.7,1408,46.3,3,0,0,0,0,0
2,308.5,1498,49.4,5,0,0,0,0,0
3,308.6,1433,39.5,7,0,0,0,0,0
4,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
9995,308.4,1604,29.5,14,0,0,0,0,0
9996,308.4,1632,31.8,17,0,0,0,0,0
9997,308.6,1645,33.4,22,0,0,0,0,0
9998,308.7,1408,48.5,25,0,0,0,0,0


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=49)

In [24]:
reg = LinearRegression().fit(X_train, y_train)

In [25]:
pickle.dump(reg, open('Aircraft Temperature Prediction.pickle', 'wb'))

In [26]:
reg.intercept_

array([-64.4905531])

In [27]:
reg.coef_

array([[ 1.17461697e+00,  2.13138613e-04,  1.20050246e-04,
         1.03827728e-04,  5.12947212e-02,  1.70754142e+00,
         1.51100495e-02, -1.56702535e-01,  2.11818967e-02]])

In [28]:
# 0.7865159378244511, 0.7865549609896199, 0.7865386528191797
reg.score(X_test, y_test)

0.7864814507907821

In [29]:
# Adjusted r-square
def adj_rsquared(x, y):
    r2 = reg.score(x, y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [30]:
# 0.7852264367777531, 0.7854097159781624, 0.7855371585629694
adj_rsquared(X_test, y_test)

0.7851917414331425

In [31]:
# 298.1
reg.predict([[308.6,1551,42.8,0,0,0,0,0,0]])



array([[298.3319594]])

### lasso

In [32]:
def adj_r_squared(x, y, score):
    r2 = score
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [33]:
lassocv = LassoCV(alphas=None, cv=10, max_iter=20000000, normalize=True)
lassocv.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


LassoCV(cv=10, max_iter=20000000, normalize=True)

In [34]:
lassocv.alpha_

7.65308550697581e-05

In [35]:
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)

Lasso(alpha=7.65308550697581e-05)

In [36]:
# 0.786474137005257, 0.7865211519941765, 0.7865211533108607
lasso_score = lasso.score(X_test, y_test)
lasso_score

0.7864741241831289

In [37]:
adj_r_squared(X_test, y_test, lasso_score)

0.7851843705708123

In [38]:
# 298.1
lasso.predict([[308.6,1551,42.8,0,0,0,0,0,0]])



array([298.33239596])

### Ridge

In [39]:
ridgecv = RidgeCV(alphas=np.random.uniform(0, 10, 50), cv=10, normalize=True)
ridgecv.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

RidgeCV(alphas=array([6.86305676, 5.93829021, 6.17871842, 0.95743683, 0.9771666 ,
       9.95957916, 3.92164716, 1.2402723 , 3.61117786, 4.0916913 ,
       5.69159998, 7.1403093 , 4.90287931, 6.44925323, 8.8914857 ,
       2.57275153, 6.93459823, 8.45017911, 4.29568437, 2.73080701,
       9.42310186, 9.84234116, 8.26535197, 8.56324009, 4.64333469,
       6.5968536 , 0.57373291, 2.60542908, 3.28490296, 9.14147872,
       6.04793727, 4.19663553, 4.80528723, 0.77220268, 9.42112982,
       9.99951369, 0.81751251, 7.83020935, 5.30763015, 1.38526926,
       4.61808979, 5.61540645, 4.48522358, 0.89155228, 6.49768177,
       8.76392242, 9.31823267, 5.75013174, 5.09424343, 1.16100855]),
        cv=10, normalize=True)

In [40]:
ridgecv.alpha_

0.5737329106969002

In [41]:
ridge_lr = Ridge(alpha=ridgecv.alpha_)
ridge_lr.fit(X_train, y_train)

Ridge(alpha=0.5737329106969002)

In [42]:
# 0.7865092915614106, 0.7865546207593792, 0.7865397842751567
ridge_score = ridge_lr.score(X_test, y_test)
ridge_score

0.7864868672562633

In [43]:
adj_r_squared(X_test, y_test, ridge_score)

0.7851971906155293

In [44]:
# 298.1
ridge_lr.predict([[308.6,1551,42.8,0,0,0,0,0,0]])



array([[298.33211876]])

### Elastic

In [45]:
# , max_iter=20000000
elasticcv = ElasticNetCV(alphas=None, cv = 5)
elasticcv.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


ElasticNetCV(cv=5)

In [46]:
elasticcv.alpha_

0.02304341722076116

In [47]:
elasticcv.l1_ratio_

0.5

In [48]:
elastic_lr = ElasticNet(alpha=elasticcv.alpha_, l1_ratio=elasticcv.l1_ratio_)
elastic_lr.fit(X_train, y_train)

ElasticNet(alpha=0.02304341722076116)

In [49]:
# 0.7794304010093688, 0.7857925248743414, 0.7855133071933493
elastic_score = elastic_lr.score(X_test, y_test)
elastic_score

0.7790900652107762

In [50]:
adj_r_squared(X_test, y_test, elastic_score)

0.7777557098999688

In [51]:
# 298.1
elastic_lr.predict([[308.6,1551,42.8,0,0,0,0,0,0]])



array([298.35926593])

#### Standard Scalar

In [52]:
X

Unnamed: 0,Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,308.6,1551,42.8,0,0,0,0,0,0
1,308.7,1408,46.3,3,0,0,0,0,0
2,308.5,1498,49.4,5,0,0,0,0,0
3,308.6,1433,39.5,7,0,0,0,0,0
4,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
9995,308.4,1604,29.5,14,0,0,0,0,0
9996,308.4,1632,31.8,17,0,0,0,0,0
9997,308.6,1645,33.4,22,0,0,0,0,0
9998,308.7,1408,48.5,25,0,0,0,0,0


In [53]:
arr2 = scaler.fit_transform(X)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(arr2, y, test_size=0.15, random_state=49)

In [55]:
reg1 = LinearRegression().fit(X_train, y_train)

In [56]:
ls = reg1.score(X_test, y_test)

In [57]:
adj_r_squared(X_test, y_test, ls)

0.7851917414331415

In [58]:
test1 = scaler.transform([[308.6,1551,42.8,0,0,0,0,0,0]])



In [59]:
# 298.1
reg1.predict(test1)

array([[298.3319594]])

### Using Process Temperature

In [60]:
pt_x = X[['Process temperature [K]']]

In [61]:
pt_x

Unnamed: 0,Process temperature [K]
0,308.6
1,308.7
2,308.5
3,308.6
4,308.7
...,...
9995,308.4
9996,308.4
9997,308.6
9998,308.7


In [62]:
X_train, X_test, y_train, y_test = train_test_split(pt_x, y, test_size=0.15, random_state=49)

In [63]:
reg3 = LinearRegression().fit(X_train, y_train)

In [64]:
r3 = reg3.score(X_test, y_test)

In [65]:
adj_r_squared(X_test, y_test, r3)

0.7758271375865506

In [66]:
# 298.1
reg3.predict([[308.6]])



array([[298.34671639]])

### using Process temperature [K]	Rotational speed [rpm]	Torque [Nm]	Tool wear [min]

In [67]:
n_x = X[['Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']]

In [68]:
X_train, X_test, y_train, y_test = train_test_split(n_x, y, test_size=0.15, random_state=49)
reg3 = LinearRegression().fit(X_train, y_train)

In [69]:
r3 = reg3.score(X_test, y_test)
adj_r_squared(X_test, y_test, r3)

0.7752543899366874

In [70]:
# 298.1
reg3.predict([[308.6,1551,42.8,0,]])



array([[298.34737339]])

### using Process temperature [K] Rotational speed [rpm] Torque [Nm]

In [71]:
nn_x = X[['Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]']]

In [72]:
X_train, X_test, y_train, y_test = train_test_split(nn_x, y, test_size=0.15, random_state=49)
reg5 = LinearRegression().fit(X_train, y_train)

In [73]:
r5 = reg5.score(X_test, y_test)
adj_r_squared(X_test, y_test, r3)

0.775404620959457

In [74]:
# 298.1
reg5.predict([[308.6,1551,42.8]])



array([[298.35488117]])