In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from the libraries used below; consider narrowing it.
warnings.filterwarnings('ignore')
import seaborn as sns

In [3]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int
        Number of leading principal axes (largest-variance directions) to keep.

    Attributes
    ----------
    components : ndarray of shape (n_components, n_features) or None
        Principal axes stored as rows, ordered by decreasing eigenvalue.
        ``None`` until ``fit`` has been called.
    mean : ndarray of shape (n_features,) or None
        Per-feature mean of the data passed to ``fit``; ``None`` before fitting.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        """Learn the feature means and the principal axes of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, one sample per row.
        """
        X = np.asarray(X, dtype=float)

        # Mean centering
        self.mean = np.mean(X, axis=0)
        centered = X - self.mean

        # rowvar=False: columns are the variables, rows the observations.
        # np.cov squeezes a single-feature result to a 0-d array, so force 2-D
        # to keep the decomposition below valid for one-column input.
        cov = np.atleast_2d(np.cov(centered, rowvar=False))

        # A covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues and orthonormal eigenvectors (general-purpose eig may
        # return complex dtype or non-orthogonal vectors for repeated
        # eigenvalues).
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Eigenvector i is column [:, i]; order columns by decreasing
        # eigenvalue and store the leading ones as rows.
        order = np.argsort(eigenvalues)[::-1]
        self.components = eigenvectors[:, order[:self.n_components]].T

    def transform(self, X):
        """Project ``X`` onto the principal axes learned by ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to project; centered with the mean stored by ``fit``.

        Returns
        -------
        ndarray of shape (n_samples, n_components)
            Coordinates of each sample along the stored principal axes.
        """
        centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(centered, self.components.T)

In [4]:
# Load the car-price dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine as written; consider a configurable
# data directory.
df= pd.read_csv('C:/Users/user/Desktop/IVY WORK BOOK/MACHINE LEARNING/Python Datasets/Regression Datasets/CarPricesData.csv')

In [9]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986,Diesel,90,1,0,2000.0,3,1165.0
1,13750,23.0,72937,Diesel,90,1,0,2000.0,3,1165.0
2,13950,24.0,41711,Diesel,90,1,0,2000.0,3,1165.0
3,14950,26.0,48000,Diesel,90,0,0,2000.0,3,1165.0
4,13750,30.0,38500,Diesel,90,0,0,2000.0,3,1170.0


In [8]:
# Number of missing values in each column.
df.isna().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

In [7]:
# Discard every row that has at least one missing value.
# Note: this rebinds df, so the unfiltered frame is no longer reachable.
df = df.dropna(axis=0, how='any')

In [10]:
# List the column labels currently present in the frame.
df.columns

Index(['Price', 'Age', 'KM', 'FuelType', 'HP', 'MetColor', 'Automatic', 'CC',
       'Doors', 'Weight'],
      dtype='object')

In [18]:
# Numeric feature columns fed to PCA (FuelType is not included).
predictors = [
    'Age', 'KM', 'HP', 'MetColor',
    'Automatic', 'CC', 'Doors', 'Weight',
]

# Kept as a one-element list so that y comes out 2-D (one column).
target = ['Price']

# Plain NumPy arrays for the feature matrix and the target column.
X = df[predictors].to_numpy()
y = df[target].to_numpy()

In [19]:
# Fit a two-component PCA on the feature matrix, then project that same
# matrix onto the learned axes.
# NOTE(review): X is used unscaled; columns such as KM (tens of thousands)
# and MetColor (0/1) differ hugely in magnitude, so the leading axes will
# presumably be dominated by the large-scale columns. Consider standardizing.
pca = PCA(n_components=2)
pca.fit(X)
projected_X = pca.transform(X)

In [20]:
# Compare dimensions before and after projection: the row count should be
# unchanged and the column count should equal the number of kept components.
print("Shape of X", X.shape)
print("Shape of Projected X", projected_X.shape)

Shape of X (1426, 8)
Shape of Projected X (1426, 2)


In [15]:
# Summarize column dtypes and non-null counts.
# NOTE(review): the recorded output has no FuelType column because the drop
# cell further down (execution count 14) ran before this one (15); on a
# top-to-bottom run FuelType would still be listed here.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1435
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1426 non-null   int64  
 1   Age        1426 non-null   float64
 2   KM         1426 non-null   int64  
 3   HP         1426 non-null   int64  
 4   MetColor   1426 non-null   int64  
 5   Automatic  1426 non-null   int64  
 6   CC         1426 non-null   float64
 7   Doors      1426 non-null   int64  
 8   Weight     1426 non-null   float64
dtypes: float64(3), int64(6)
memory usage: 111.4 KB


In [14]:
# Remove the categorical FuelType column, leaving only numeric columns.
# errors='ignore' makes the cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
df = df.drop(columns='FuelType', errors='ignore')