In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation / FutureWarning
# messages from pandas and scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the transformer dataset and discard the 'ID' column in a single chained
# expression, then preview the first rows.
df = pd.read_csv('UG-Transformer-37.5kVA.csv').drop(columns=['ID'])
df.head()

Unnamed: 0,Age,InfraredScanResults,VisualConditions,Pad,Loading,HealthIndex
0,45,0.9,Serious Defects,Serious Defects,53.26,0.8
1,23,0.04,Moderate Defects,Moderate Defects,21.3,4.0
2,16,0.01,Minor Defects,Moderate Defects,24.9,5.0
3,46,0.99,Serious Defects,Serious Defects,55.59,0.8
4,33,0.22,Significant Defects,Serious Defects,34.46,2.1


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Age,InfraredScanResults,Loading,HealthIndex
count,4640.0,4640.0,4640.0,4640.0
mean,31.323276,0.35697,36.869252,2.595409
std,11.606625,0.357618,10.920859,1.44578
min,15.0,0.01,18.0,0.1
25%,18.0,0.01,27.5675,1.3
50%,33.0,0.21,36.925,2.6
75%,43.0,0.68,46.0225,3.9
max,46.0,1.0,56.0,5.0


In [4]:
# Column names remaining after 'ID' was dropped.
df.columns

Index(['Age', 'InfraredScanResults', 'VisualConditions', 'Pad', 'Loading',
       'HealthIndex'],
      dtype='object')

In [5]:
# Range and average of the Loading column.
# Each label now names the statistic it actually prints; previously all three
# lines were labelled "Maximum Loading".
print("Maximum Loading :", df['Loading'].max())
print("Minimum Loading :", df['Loading'].min())
print("Mean Loading :", df['Loading'].mean())

Maximum Loading : 56.0
Maximum Loading : 18.0
Maximum Loading : 36.86925215517246


In [6]:
# Frequency of each category in the Pad column.
df['Pad'].value_counts()

Serious Defects        2474
Moderate Defects       1437
Significant Defects     729
Name: Pad, dtype: int64

In [7]:
# Frequency of each category in the VisualConditions column.
df['VisualConditions'].value_counts()

Serious Defects        1928
Significant Defects    1008
Minor Defects           990
Moderate Defects        714
Name: VisualConditions, dtype: int64

In [8]:
# Count missing values per column.
df.isnull().sum()

Age                    0
InfraredScanResults    0
VisualConditions       0
Pad                    0
Loading                0
HealthIndex            0
dtype: int64

In [9]:
# Encode the two categorical condition columns as integer codes.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment), is deprecated by pandas, and no longer updates `df`
# once Copy-on-Write is enabled.
#
# `.astype("int64")` makes the resulting dtype explicit rather than relying on
# `replace` to infer it, and raises if a category is missing from the mapping
# instead of silently leaving a string behind.
#
# NOTE(review): the code values are kept exactly as they were so the exported
# model stays compatible, but they do not look monotonic in severity (e.g.
# "Moderate Defects" is coded below "Minor Defects") — confirm this is intended.
visual_condition_codes = {
    "Moderate Defects": 0,
    "Minor Defects": 1,
    "Significant Defects": 2,
    "Serious Defects": 3,
}
pad_codes = {
    "Significant Defects": 0,
    "Moderate Defects": 1,
    "Serious Defects": 2,
}

df["VisualConditions"] = df["VisualConditions"].replace(visual_condition_codes).astype("int64")
df["Pad"] = df["Pad"].replace(pad_codes).astype("int64")

In [10]:
# Preview the frame again to check that VisualConditions and Pad now hold
# integer codes.
df.head()

Unnamed: 0,Age,InfraredScanResults,VisualConditions,Pad,Loading,HealthIndex
0,45,0.9,3,2,53.26,0.8
1,23,0.04,0,1,21.3,4.0
2,16,0.01,1,1,24.9,5.0
3,46,0.99,3,2,55.59,0.8
4,33,0.22,2,2,34.46,2.1


In [11]:
# Feature matrix: every column except the target, as a plain NumPy array.
X = df.drop('HealthIndex', axis='columns').values
# Target: natural log of HealthIndex, so the model is trained in log space.
y = np.log(df.loc[:, 'HealthIndex'])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=2,
)

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

In [14]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR

In [15]:
# Inspect the training feature matrix (df's columns minus HealthIndex).
X_train

array([[1.600e+01, 1.000e-02, 0.000e+00, 1.000e+00, 4.108e+01],
       [4.600e+01, 9.200e-01, 3.000e+00, 2.000e+00, 4.762e+01],
       [4.600e+01, 9.600e-01, 3.000e+00, 2.000e+00, 5.600e+01],
       ...,
       [3.800e+01, 3.900e-01, 3.000e+00, 2.000e+00, 3.331e+01],
       [3.300e+01, 2.000e-01, 2.000e+00, 2.000e+00, 1.924e+01],
       [4.500e+01, 8.900e-01, 3.000e+00, 2.000e+00, 3.124e+01]])

### Linear regression

In [16]:
# Fit a linear-regression baseline on the training split. `fit` returns the
# estimator itself, so construction and fitting are chained.
pipe = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split. The target is log(HealthIndex), so both
# metrics below are expressed in log units, not raw HealthIndex units.
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 score', r2)
print('MAE', mae)

R2 score 0.8131902733193437
MAE 0.23392612846754798


### Exporting the Model

In [17]:
import pickle

# Persist the encoded DataFrame and the fitted model for later use.
# `with` blocks guarantee each file is flushed and closed as soon as the dump
# finishes; the bare `open(...)` calls used before left both handles open.
# NOTE: the model was trained on log(HealthIndex), so consumers of 'pipe.pkl'
# must apply np.exp to its predictions to recover HealthIndex.
with open('df.pkl', 'wb') as df_file:
    pickle.dump(df, df_file)

with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)