In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv('data.csv')

In [5]:
data.head()

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year,name,Kilometer_per_liter
0,1,8,307.0,130,3504,12.0,1970,chevrolet chevelle malibu,7.652587
1,1,8,350.0,165,3693,11.5,1970,buick skylark 320,6.377156
2,1,8,318.0,150,3436,11.0,1970,plymouth satellite,7.652587
3,1,8,304.0,150,3433,12.0,1970,amc rebel sst,6.802299
4,1,8,302.0,140,3449,10.5,1970,ford torino,7.227443


### Pre processing the data

In [6]:
data.shape

(398, 9)

In [7]:
data.dtypes

origin                   int64
cylinders                int64
displacement           float64
horsepower              object
weight                   int64
acceleration           float64
year                     int64
name                    object
Kilometer_per_liter    float64
dtype: object

In [8]:
data.isna().sum()

origin                 0
cylinders              0
displacement           0
horsepower             0
weight                 0
acceleration           0
year                   0
name                   0
Kilometer_per_liter    0
dtype: int64

In [9]:
data.replace('?', np.nan, inplace= True)

In [10]:
data.isna().sum()

origin                 0
cylinders              0
displacement           0
horsepower             6
weight                 0
acceleration           0
year                   0
name                   0
Kilometer_per_liter    0
dtype: int64

In [11]:
data['horsepower'] = data['horsepower'].astype('float')

In [12]:
data.dtypes

origin                   int64
cylinders                int64
displacement           float64
horsepower             float64
weight                   int64
acceleration           float64
year                     int64
name                    object
Kilometer_per_liter    float64
dtype: object

In [13]:
data['horsepower'].mean()

104.46938775510205

In [14]:
data['horsepower'].fillna(int(data['horsepower'].mean()), inplace=True)

In [15]:
data.isna().sum()

origin                 0
cylinders              0
displacement           0
horsepower             0
weight                 0
acceleration           0
year                   0
name                   0
Kilometer_per_liter    0
dtype: int64

In [16]:
data.columns

Index(['origin', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'name', 'Kilometer_per_liter'],
      dtype='object')

In [17]:
X = data.values[:, :-2]
Y = data.values[:, -1]
Y = Y.astype(int)

In [18]:
X.shape

(398, 7)

In [19]:
Y.shape

(398,)

In [20]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
X=scaler.fit_transform(X)
#scaler.fit(X)
#X=scaler.transform(X)
print(X)

[[-0.71514478  1.49819126  1.0906037  ...  0.63086987 -1.29549834
  -1.62742629]
 [-0.71514478  1.49819126  1.5035143  ...  0.85433297 -1.47703779
  -1.62742629]
 [-0.71514478  1.49819126  1.19623199 ...  0.55047045 -1.65857724
  -1.62742629]
 ...
 [-0.71514478 -0.85632057 -0.56103873 ... -0.79858454 -1.4407299
   1.62198339]
 [-0.71514478 -0.85632057 -0.70507731 ... -0.40841088  1.10082237
   1.62198339]
 [-0.71514478 -0.85632057 -0.71467988 ... -0.29608816  1.39128549
   1.62198339]]


### Running the basic model

In [21]:
from sklearn.model_selection import train_test_split

#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
random_state=10)

In [46]:
#predicting using the Decision_Tree_Classifier
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree=DecisionTreeClassifier(criterion="gini",random_state=10, max_depth = 10, min_samples_leaf = 3)

#fit the model on the data and predict the values
model_DecisionTree.fit(X_train,Y_train)
Y_pred=model_DecisionTree.predict(X_test)
#print(Y_pred)
print(list(zip(Y_pred)))

[(16,), (11,), (14,), (12,), (12,), (14,), (9,), (7,), (12,), (9,), (10,), (11,), (16,), (9,), (7,), (9,), (8,), (5,), (5,), (12,), (7,), (5,), (14,), (9,), (11,), (9,), (9,), (8,), (12,), (11,), (5,), (5,), (11,), (3,), (10,), (11,), (16,), (7,), (7,), (5,), (12,), (12,), (5,), (9,), (14,), (13,), (7,), (6,), (11,), (5,), (8,), (8,), (11,), (6,), (14,), (14,), (5,), (5,), (12,), (11,), (9,), (8,), (16,), (6,), (13,), (7,), (8,), (7,), (6,), (9,), (8,), (14,), (8,), (8,), (12,), (14,), (5,), (5,), (5,), (4,)]


### Evolution of model

In [47]:
from sklearn.metrics import r2_score,mean_squared_error
import numpy as np

r2=r2_score(Y_test,Y_pred)
print("R-squared:",r2)

rmse=np.sqrt(mean_squared_error(Y_test,Y_pred))
print("RMSE:",rmse)

adjusted_r_squared = 1 - (1-r2)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
print("Adj R-square:",adjusted_r_squared)

R-squared: 0.7941507311586051
RMSE: 1.5124483462254175
Adj R-square: 0.7904560006922211
