### Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Loading the Dataset

In [2]:
df=pd.read_csv('Dataset/car_performance.csv')

### Data Analysis

In [3]:
df.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [4]:
df.shape

(398, 9)

In [5]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.1+ KB


In [7]:
df.nunique()

mpg             129
cylinders         5
displacement     82
horsepower       93
weight          351
acceleration     95
model year       13
origin            3
car name        305
dtype: int64

In [8]:
df.origin.unique()

array([1, 3, 2], dtype=int64)

### Handiling the Missing Values

In [9]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [10]:
# There is no Null Value in the data set

### Lable encoding

In [11]:
# There is no Categorial value other than the car name (car name is not used for the performance predecting so we can drop the car name column), so we need not to do the label encoding

### Droping the car name column

In [10]:
df=df.iloc[:,:-1]

In [11]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


### Splitting the dataset into dependent and independent Variable

In [12]:
x=df.iloc[:,1:]

In [13]:
y=df.iloc[:,0]

In [14]:
x.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1


In [15]:
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

### Splitting the dataset into train and test

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [17]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((318, 7), (80, 7), (318,), (80,))

### Normalizing the values

In [18]:
from sklearn.preprocessing import StandardScaler
sd = StandardScaler()
x_train=sd.fit_transform(x_train)
x_test=sd.fit_transform(x_test)

In [19]:
x_train

array([[ 0.31499549,  0.54322636, -0.12519295, ...,  0.52209821,
        -0.53136828, -0.72041176],
       [-0.87043134, -1.02074502, -1.17631789, ...,  1.17205258,
         1.10387475,  1.75624704],
       [-0.87043134, -0.70017946, -0.49308668, ...,  0.16101245,
        -1.07644929, -0.72041176],
       ...,
       [ 0.31499549,  0.36837241, -0.12519295, ..., -0.92224482,
        -1.3489898 , -0.72041176],
       [ 0.31499549,  0.30037366,  0.00619767, ...,  0.34155533,
        -0.53136828, -0.72041176],
       [-0.87043134, -1.0013168 , -0.99237103, ..., -0.63337621,
         1.10387475,  1.75624704]])

# Model Building

### Implementing the RandomForestRegression Algorithm

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
rf = RandomForestRegressor(n_estimators=30,random_state=0)

In [22]:
rf.fit(x_train,y_train)

## Predicting the Value

In [23]:
y_pred = rf.predict(x_test)

In [24]:
y_pred

array([12.96666667, 23.62      , 20.19      , 25.13333333, 34.95      ,
       12.96666667, 27.96666667, 24.57333333, 20.29      , 29.84333333,
       26.67666667, 37.32666667, 36.93666667, 18.56333333, 14.        ,
       13.8       , 28.88      , 12.96666667, 32.9       , 27.61333333,
       19.08333333, 30.11333333, 28.04666667, 33.18666667, 22.01333333,
       31.57666667, 30.41333333, 30.45      , 12.53333333, 13.36666667,
       31.70666667, 14.6       , 13.7       , 20.9       , 26.74333333,
       14.58333333, 14.43333333, 25.20666667, 14.61666667, 28.01666667,
       19.40333333, 20.70333333, 25.84333333, 13.86666667, 33.04666667,
       23.03666667, 20.64      , 19.38      , 14.38333333, 30.1       ,
       23.22666667, 28.06      , 33.30333333, 25.58333333, 26.85      ,
       22.66      , 21.31333333, 24.57      , 23.16333333, 27.5       ,
       16.25333333, 23.80333333, 16.54333333, 26.98333333, 17.71      ,
       13.66666667, 27.33333333, 13.13333333, 26.51666667, 32.79

## Model Evaluation

In [25]:
from sklearn.metrics import r2_score,mean_squared_error

In [26]:
acc = r2_score(y_test, y_pred)

In [27]:
acc

0.7899942495532728

In [28]:
err=np.sqrt(mean_squared_error(y_test,y_pred))

In [29]:
err

3.609919435850797

### Exporting the model

In [30]:
import pickle

In [31]:
pickle.dump(rf,open('RFregression.pkl','wb'))