# Machine Learning Basics

Machine Learning is a branch of AI that learns patterns from data.

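Types
1. Supervised machine learning: learns from labeled data, i.e. features X together with a target y
2. Unsupervised machine learning: learns from unlabeled data, i.e. features X without a target y
3. Reinforcement learning: learns by trial and error from rewards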






# Load the required libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


# Load the data

In [4]:
# Read the dataset from a CSV file in the current working directory,
# then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="auto-mpg.csv")
data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [5]:
# Count the missing values in each column.
data.isna().sum(axis="index")

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

# Split into X and y

In [6]:
# Features: every column except the target ("mpg") and the "car name" column.
X = data.drop(columns=["mpg", "car name"])
# Target: kept as a one-column DataFrame rather than a Series.
y = data.loc[:, ["mpg"]]


In [7]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1


In [8]:
# Preview the first five rows of the target.
y.head(n=5)

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


# Split into train and test sets 

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)



In [10]:
# Row counts of the four split pieces, in the order X_train, X_test, y_train, y_test.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

313 79 313 79


In [11]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
6,8,454.0,220,4354,9.0,70,1
292,4,86.0,65,1975,15.2,79,3
121,4,121.0,110,2660,14.0,73,2
170,4,90.0,71,2223,16.5,75,2
107,4,97.0,88,2279,19.0,73,3


In [16]:
# Natural log of three training columns, computed directly with NumPy.
# NOTE: despite its name, log_hp holds the logs of horsepower, displacement
# and weight, not horsepower alone.
log_hp = pd.DataFrame(
    data=np.log(X_train[["horsepower", "displacement", "weight"]]),
)
log_hp.head(n=5)

Unnamed: 0,horsepower,displacement,weight
6,5.393628,6.118097,8.37885
292,4.174387,4.454347,7.588324
121,4.70048,4.795791,7.886081
170,4.26268,4.49981,7.706613
107,4.477337,4.574711,7.731492


In [13]:
# Columns to log-transform, listed in the same order as the names that their
# transformed versions will carry.
cols_to_log = [
    "displacement",
    "horsepower",
    "weight",
]
new_cols_names = [
    "log_disp",
    "log_horsep",
    "log_wt",
]


In [17]:
from sklearn.preprocessing import FunctionTransformer

# Wrap np.log in a scikit-learn transformer and apply it to the selected
# training columns. The result is put back into a DataFrame that carries the
# new column names and the original training index.
log_transformer = FunctionTransformer(func=np.log, validate=True)
X_train_log = pd.DataFrame(
    data=log_transformer.fit_transform(X_train[cols_to_log]),
    index=X_train.index,
    columns=new_cols_names,
)
X_train_log.head(n=5)

Unnamed: 0,log_disp,log_horsep,log_wt
6,6.118097,5.393628,8.37885
292,4.454347,4.174387,7.588324
121,4.795791,4.70048,7.886081
170,4.49981,4.26268,7.706613
107,4.574711,4.477337,7.731492


In [18]:
# Apply the transformer that was already fitted on the training split.
# The test split is only transformed, never used to (re)fit a preprocessing
# step, so that nothing about the test data can influence the pipeline.
X_test_log = pd.DataFrame(
    log_transformer.transform(X_test[cols_to_log]),
    columns=new_cols_names,
    index=X_test.index,
)
X_test_log.head()

Unnamed: 0,log_disp,log_horsep,log_wt
220,5.560682,4.70048,8.308938
245,4.442651,4.248495,7.635304
134,5.710427,4.941642,8.328693
147,4.787492,4.574711,7.819636
390,4.787492,4.369448,7.872836


# OneHotEncoding

In [20]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder that drops the first category of each feature and returns
# a dense array instead of a sparse matrix.
ohe = OneHotEncoder(
    drop="first",
    sparse_output=False,
)


In [26]:
# Fit the encoder on the training "origin" column and encode it. The encoder
# is fitted before get_feature_names_out() is called, so the generated column
# names are available; the training index is preserved.
X_train_ohe = pd.DataFrame(
    data=ohe.fit_transform(X_train[["origin"]]),
    columns=ohe.get_feature_names_out(),
    index=X_train.index,
)

In [27]:
# Preview the first five rows of the encoded training "origin" columns.
X_train_ohe.head(n=5)


Unnamed: 0,origin_2,origin_3
6,0.0,0.0
292,0.0,1.0
121,1.0,0.0
170,1.0,0.0
107,0.0,1.0


In [28]:
# Encode the test split with the encoder that was fitted on the training
# split. Refitting on the test data (fit_transform) would let the test set
# define its own categories, so the dropped level and the output columns could
# differ from the training encoding. transform() keeps both splits aligned.
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[["origin"]]),
    columns=ohe.get_feature_names_out(),
    index=X_test.index,
)
X_test_ohe.head()

Unnamed: 0,origin_2,origin_3
220,0.0,0.0
245,0.0,1.0
134,0.0,0.0
147,0.0,1.0
390,0.0,0.0


# Combined DF

In [29]:
# Columns carried over unchanged (neither log-transformed nor one-hot encoded).
existing_cols = ["cylinders", "acceleration", "model year"]
X_train_ex, X_test_ex = X_train[existing_cols], X_test[existing_cols]

X_test_ex.head(n=5)

Unnamed: 0,cylinders,acceleration,model year
220,8,19.0,77
245,4,18.6,78
134,8,14.0,74
147,4,15.0,74
390,4,18.6,82


In [30]:
# Assemble the final feature matrices by placing the untouched, log-transformed
# and one-hot-encoded column groups side by side, aligned on the row index.
X_train_final = pd.concat(
    objs=[X_train_ex, X_train_log, X_train_ohe],
    axis="columns",
)
X_test_final = pd.concat(
    objs=[X_test_ex, X_test_log, X_test_ohe],
    axis="columns",
)
X_train_final.head(n=5)

Unnamed: 0,cylinders,acceleration,model year,log_disp,log_horsep,log_wt,origin_2,origin_3
6,8,9.0,70,6.118097,5.393628,8.37885,0.0,0.0
292,4,15.2,79,4.454347,4.174387,7.588324,0.0,1.0
121,4,14.0,73,4.795791,4.70048,7.886081,1.0,0.0
170,4,16.5,75,4.49981,4.26268,7.706613,1.0,0.0
107,4,19.0,73,4.574711,4.477337,7.731492,0.0,1.0


In [31]:
# Preview the first five rows of the assembled test features.
X_test_final.head(n=5)

Unnamed: 0,cylinders,acceleration,model year,log_disp,log_horsep,log_wt,origin_2,origin_3
220,8,19.0,77,5.560682,4.70048,8.308938,0.0,0.0
245,4,18.6,78,4.442651,4.248495,7.635304,0.0,1.0
134,8,14.0,74,5.710427,4.941642,8.328693,0.0,0.0
147,4,15.0,74,4.787492,4.574711,7.819636,0.0,1.0
390,4,18.6,82,4.787492,4.369448,7.872836,0.0,0.0


In [32]:
# Preview the first five rows of the training target.
y_train.head(n=5)

Unnamed: 0,mpg
6,14.0
292,34.1
121,24.0
170,25.0
107,20.0


In [33]:
# Preview the first five rows of the test target.
y_test.head(n=5)

Unnamed: 0,mpg
220,17.0
245,39.4
134,16.0
147,24.0
390,28.0
