# Data cleaning and preprocessing with Pandas

## Importing Dataset and Required Libraries

In [2]:
import numpy as np
import pandas as pd
import random as rd

# Load dataset
data = pd.read_csv("car.csv")
print(f"\nShape Of Data is :{data.shape}\n")
print(f"\nFirst Five Values {data.head()}\n")


Shape Of Data is :(4345, 9)


First Five Values            Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  



1. Identify missing values in the DataFrame.

In [3]:
# Printing the missing values column wise
print(data.isnull().sum())

# We have total 4345 values in the DataFrame in which 322 are missing values. Since missing values are less , so we can drop them

Brand             0
Price           172
Body              0
Mileage           0
EngineV         150
Engine Type       0
Registration      0
Year              0
Model             0
dtype: int64


2. Drop rows with any missing values.

In [4]:

droppedRowData= data.dropna(axis=0) # axis = 0 mean drop rows with missing values. I will use this DataFrame for further tasks

# Printing the shape of newly created DataFrame
print(f"\nThe Shape of droppedRowData : {droppedRowData.shape}\n")

# Printing shapes tells that it drop 320 missing values row wise. We is correct as we can see from above cell , there are 322 values missing . 

# Printing the head() of the DataFrame
print(f"The Head Of droppedRowData:\n {droppedRowData.head()}")


The Shape of droppedRowData : (4025, 9)

The Head Of droppedRowData:
            Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  


3. Drop columns with any missing values.

In [5]:
droppedColumnData= data.dropna(axis=1) # axis = 1 mean drop columns with missing values

# Printing the shape of newly created DataFrame
print(f"\nThe Shape of droppedColumnDataa : {droppedColumnData.shape}\n")

# Printing the shape shows that two columns with missing values were dropped. This can be seen above where only two columns have missing values.

# Printing the head() of the DataFrame
print(f"The Head Of droppedColumnData:\n {droppedColumnData.head()}")


The Shape of droppedColumnDataa : (4345, 7)

The Head Of droppedColumnData:
            Brand       Body  Mileage Engine Type Registration  Year  \
0            BMW      sedan      277      Petrol          yes  1991   
1  Mercedes-Benz        van      427      Diesel          yes  1999   
2  Mercedes-Benz      sedan      358         Gas          yes  2003   
3           Audi  crossover      240      Petrol          yes  2007   
4         Toyota  crossover      120      Petrol          yes  2011   

          Model  
0           320  
1  Sprinter 212  
2         S 500  
3            Q7  
4         Rav 4  


3. Fill missing values with a specific value

In [6]:
#There are several different ways to fill up missing values , such as mean of the columns , or median of the columns.

#Here i am filling up missing values with a mean of the columns

filledData = data.copy() # Making another copy of dataset so that i can apply futher task on orignal data
# Finding mean of the columns
priceMean = filledData["Price"].mean()
print(priceMean)
engineVmean = filledData["EngineV"].mean()
print(engineVmean)

# Filling missing values in Price and EngineV with their respective means

filledData["Price"] = filledData["Price"].fillna(priceMean)
filledData["EngineV"] = filledData["EngineV"].fillna(engineVmean)

# Now Checking for missing values

print(f"The Values Of Nan after Filling Up\n{filledData.isnull().sum()}")



19418.746935058713
2.79073420738975
The Values Of Nan after Filling Up
Brand           0
Price           0
Body            0
Mileage         0
EngineV         0
Engine Type     0
Registration    0
Year            0
Model           0
dtype: int64


5. Fill missing values using forward fill and backward fill methods.

In [7]:
# Forward Fill method fill mising values with previous values
forwardFilledData = data.copy() # Making Copy of dataset
print(forwardFilledData.isnull().sum())
print(f"\nRows From 10 to 20 Before Forward Filling\n")
print(forwardFilledData.iloc[10:20]) # As We Can see 17 index is Nan
forwardFilledData.ffill(axis= 0 , inplace = True) # Row Wise Forward Fill
print(f"\n Rows From 10 to 20 After Forward Filling\n")
print(forwardFilledData.iloc[10:20]) # After Forward Fill, It fills missing value with the previous column value


# Forward Fill method fill mising values with previous values
backwardFilledData = data.copy() # Making Copy of dataset
print(backwardFilledData.isnull().sum())
print(f"\nRows From 10 to 20 Before Backward Filling\n")
print(backwardFilledData.iloc[10:20]) # As We Can see 17 index is Nan
backwardFilledData.bfill(axis= 0 , inplace = True) # Row Wise Forward Fill
print(f"\n Rows From 10 to 20 After Backward Filling\n")
print(backwardFilledData.iloc[10:20]) # After Forward Fill, It fills missing value with the previous column value



Brand             0
Price           172
Body              0
Mileage           0
EngineV         150
Engine Type       0
Registration      0
Year              0
Model             0
dtype: int64

Rows From 10 to 20 Before Forward Filling



            Brand    Price       Body  Mileage  EngineV Engine Type  \
10        Renault  11950.0      vagon      177     1.50      Diesel   
11        Renault   2500.0      sedan      260     1.79      Petrol   
12           Audi   9500.0      vagon      165     2.70         Gas   
13     Volkswagen  10500.0      sedan      100     1.80      Petrol   
14         Toyota  16000.0  crossover      250     4.70         Gas   
15        Renault   8600.0      hatch       84     1.50      Diesel   
16            BMW   2990.0      other      203     2.00      Petrol   
17  Mercedes-Benz      NaN        van      240     2.20      Diesel   
18         Toyota  26500.0  crossover       21     2.00      Petrol   
19           Audi   3500.0      vagon      250     2.50      Diesel   

   Registration  Year             Model  
10          yes  2011            Megane  
11          yes  1994                19  
12          yes  2003        A6 Allroad  
13          yes  2008         Passat B6  
14      

6. Interpolate missing values.

Q: What is interpolation ? 

Ans: Interpolation is a process of determining the unknown values that lie in between the known data points.

Formula for linear interpolation:

$ y = y_1 + (x - x_1) \frac{y_2 - y_1}{x_2 - x_1}$

In [8]:
interpolatedData= data.copy()

interpolatedData["Price"] = interpolatedData["Price"].interpolate(method='linear')
print(interpolatedData.iloc[10:20]) # As We Can see 17 index is Nan

#Note There are more methods of interplotion like 'spline', 'cubicspline',etc.

            Brand    Price       Body  Mileage  EngineV Engine Type  \
10        Renault  11950.0      vagon      177     1.50      Diesel   
11        Renault   2500.0      sedan      260     1.79      Petrol   
12           Audi   9500.0      vagon      165     2.70         Gas   
13     Volkswagen  10500.0      sedan      100     1.80      Petrol   
14         Toyota  16000.0  crossover      250     4.70         Gas   
15        Renault   8600.0      hatch       84     1.50      Diesel   
16            BMW   2990.0      other      203     2.00      Petrol   
17  Mercedes-Benz  14745.0        van      240     2.20      Diesel   
18         Toyota  26500.0  crossover       21     2.00      Petrol   
19           Audi   3500.0      vagon      250     2.50      Diesel   

   Registration  Year             Model  
10          yes  2011            Megane  
11          yes  1994                19  
12          yes  2003        A6 Allroad  
13          yes  2008         Passat B6  
14      

7. Convert a column to a different data type.



In [9]:
convertedColumnData =droppedRowData.copy()

# Converting Price from float to integer 

# Before converting float to integer
print(f"\nDatatype of column 'Price' Before Conversion\n")
print(convertedColumnData["Price"].dtype)

# Converting Price from float to integer
print(f"\nDatatype of column 'Price' After Conversion\n")
convertedColumnData["Price"] = convertedColumnData["Price"].astype(int)
print(convertedColumnData["Price"].dtype)


Datatype of column 'Price' Before Conversion

float64

Datatype of column 'Price' After Conversion

int64


8. Apply a function to transform the values of a column.

In [10]:
transformedData = droppedRowData.copy()
transformedData["Price"] = transformedData["Price"].astype(int)

#Converting Price from Dollar to PKR
transformedData["Price"] = transformedData['Price'].transform(lambda value: value * 278) # 278 is current rate of dollar
print(transformedData["Price"])
# Now Prices Transformed into PKR

#Now Coverting Registration yes and no to 1 and 0 . It converted String to int
transformedData["Registration"] = transformedData["Registration"].transform(lambda value : 1 if value == "yes" else 0)
print(transformedData["Registration"].iloc[10:20])
print(transformedData["Registration"].dtype) 






0        1167600
1        2196200
2        3697400
3        6394000
4        5087400
          ...   
4339     4976200
4340    34750000
4341     1807000
4342     2224000
4344     3753000
Name: Price, Length: 4025, dtype: int64
10    1
11    1
12    1
13    1
14    1
15    1
16    0
18    1
19    0
20    1
Name: Registration, dtype: int64
int64


9. Normalize a column using Min-Max scaling.

Why we normalize data ?

When a dataset has values of different columns at drastically different scales, it gets tough to analyze the trends and patterns and comparison of the features or columns. So, in cases where all the columns have a significant difference in their scales, are needed to be modified in such a way that all those values fall into the same scale. This process is called Scaling.There are two most common techniques of how to scale columns of Pandas dataframe – Min-Max Normalization and Standardization.





#### Formula for Min-Max Scaling

$ x' = \frac{x - min(x)}{max(x) - min(x)} $



In [20]:
#Making Dataset copy for Min-Max Scaling
normalizedData = droppedRowData.copy()

#Seperating all the columns containing values other than int and float values 
seperatedData = normalizedData.drop(columns=[ "Brand","Body","Engine Type", "Registration" ,"Year" ,"Model"])

#Applying the MinMax Scaling
print("\nNormalized Data\n")
normalizedData[seperatedData.columns] = seperatedData.transform(lambda x : (x - x.min()) / (x.max() - x.min()))
normalizedData

#Now we have normalized all the columns except Brand, Body, Engine Type, Registration, Year, Model.




Normalized Data



Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
0,BMW,0.012024,sedan,0.282653,0.014086,Petrol,yes,1991,320
1,Mercedes-Benz,0.024382,van,0.435714,0.023141,Diesel,yes,1999,Sprinter 212
2,Mercedes-Benz,0.042418,sedan,0.365306,0.044270,Gas,yes,2003,S 500
3,Audi,0.074816,crossover,0.244898,0.036221,Petrol,yes,2007,Q7
4,Toyota,0.059118,crossover,0.122449,0.014086,Petrol,yes,2011,Rav 4
...,...,...,...,...,...,...,...,...,...
4339,Toyota,0.057782,sedan,0.035714,0.010061,Petrol,yes,2014,Corolla
4340,Mercedes-Benz,0.415498,sedan,0.009184,0.024147,Diesel,yes,2014,S 350
4341,BMW,0.019706,sedan,0.001020,0.029178,Petrol,yes,1999,535
4342,BMW,0.024716,sedan,0.197959,0.014086,Petrol,yes,1985,520


10. Standardize a column (z-score normalization).

Z-score normalization is a statistical method used to standardize a dataset by converting it to a z-score. Z-score is calculated as:

$ z = \frac{x - \mu}{\sigma} $

Where:

- $ x $ is the value to be standardized

- $\mu $ is the mean of the dataset

- $\sigma $ is the standard deviation of the dataset



In [19]:
#Making Dataset copy for Z-score standardization
standardizedData = droppedRowData.copy()

#Seperating all the columns containing values other than int and float values 
# seperatedData = standardizedData.drop(columns=[ "Brand","Body","Engine Type", "Registration" ,"Year" ,"Model"])
#Now I found an easier way to do this , so i comment above line

#Applying the Z-score standardization
print(seperatedData.columns)
standardizedData[['Price', 'Mileage', 'EngineV']] = seperatedData.transform(lambda x : (x - x.mean()) / x.std())
standardizedData


Index(['Price', 'Mileage', 'EngineV'], dtype='object')


Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
0,BMW,-0.594688,sedan,1.097037,-0.154902,Petrol,yes,1991,320
1,Mercedes-Benz,-0.451365,van,2.547788,0.027434,Diesel,yes,1999,Sprinter 212
2,Mercedes-Benz,-0.242190,sedan,1.880443,0.452885,Gas,yes,2003,S 500
3,Audi,0.133550,crossover,0.739185,0.290809,Petrol,yes,2007,Q7
4,Toyota,-0.048509,crossover,-0.421416,-0.154902,Petrol,yes,2011,Rav 4
...,...,...,...,...,...,...,...,...,...
4339,Toyota,-0.064004,sedan,-1.243508,-0.235940,Petrol,yes,2014,Corolla
4340,Mercedes-Benz,4.084629,sedan,-1.494972,0.047694,Diesel,yes,2014,S 350
4341,BMW,-0.505595,sedan,-1.572345,0.148992,Petrol,yes,1999,535
4342,BMW,-0.447491,sedan,0.294288,-0.154902,Petrol,yes,1985,520


11. Identify duplicate rows in the DataFrame.


In [206]:
# Using orignal data to find duplicates

print(data.duplicated())

# Now i want to introduce duplicates rows in the dataframe. For this i want to make copy of orignal data
duplicatedData = data.copy()

# Adding duplicate rows at the end of the DataFrame

duplicatedData = pd.concat([duplicatedData,duplicatedData.loc[:4]],ignore_index=True)



# Now checking for duplicates
print("\nduplicatedData Head\n")
print(duplicatedData.head())
print("\nDuplicated Data Tail\n")
print(duplicatedData.tail())

# Now Starting 5 and ending 5 are same
print("\nDuplicated Data Rows\n")
print(duplicatedData.duplicated())



0       False
1       False
2       False
3       False
4       False
        ...  
4340    False
4341    False
4342    False
4343    False
4344    False
Length: 4345, dtype: bool

duplicatedData Head

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  

Duplicated Data Tail

              Brand    Price       Body  Mileage  EngineV Engine Type  \
4345            BMW   4200.0      sedan      277      2.0  

12. Drop duplicate rows.

In [143]:
duplicatedData.drop_duplicates(inplace=True)

# Now Checking for duplicates

print("\nDuplicated Data Head\n")
print(duplicatedData.head())
print("\nDuplicated Data Tail\n")
print(duplicatedData.tail())

# Now It Drops the duplicate rows


Duplicated Data Head

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  

Duplicated Data Tail

              Brand     Price   Body  Mileage  EngineV Engine Type  \
4340  Mercedes-Benz  125000.0  sedan        9      3.0      Diesel   
4341            BMW    6500.0  sedan        1      3.5      Petrol   
4342            BMW    8000.0  sedan      194      2.0      Petrol   
4343         Toyota   14200.0  

13. Drop duplicate rows based on specific columns.

In [207]:
duplicatedData= data.copy()
newData = duplicatedData.drop_duplicates(["Model"])

print(duplicatedData.shape)
print(newData.shape)
#As we can see it drops the duplicate. Rows count changes from 4345 to 312

# It saves one record for each model this can be verified by printing sum() of unique values in 'Model' column
print("\nUnique Models\n")
print(len(duplicatedData["Model"].unique()))

# On Multiple Specific columns
print("\n After Applying on Multiple Specific Columns")
newData = duplicatedData.drop_duplicates(["Model","Year"])
print(newData.shape)
# Now it drops the those duplicate rows where model and year are same.



(4345, 9)
(312, 9)

Unique Models

312

 After Applying on Multiple Specific Columns
(1471, 9)


14. Convert all string values in a column to lowercase.

In [178]:
modifiedData = data.copy()

print("\nData Before Lowercasing\n")
print(modifiedData.head())
# Fist Find All the string columns
#object is for finding string
stringColumns = modifiedData.select_dtypes(include = ['object']).columns
print(stringColumns)


# Converting all string columns to lowercase
modifiedData[stringColumns] = modifiedData[stringColumns].apply(lambda x: x.str.lower())
print("\nData After Lowercasing\n")
print(modifiedData.head())


Data Before Lowercasing

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  
Index(['Brand', 'Body', 'Engine Type', 'Registration', 'Model'], dtype='object')

Data After Lowercasing

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            bmw   4200.0      sedan      277      2.0      petrol   
1  mercedes-benz   7900.0        van      427      2.9      diesel   
2  mercedes-ben

15. Remove leading and trailing spaces from string values in a column.

In [199]:
modifiedData = data.copy()

print("\nData Before Removing Leading and Trailing Spaces\n")
print(modifiedData.head())

# Fist Find All the string columns
stringColumns = modifiedData.select_dtypes(include=['object']).columns
print(stringColumns)

# Removing leading and trailing spaces
modifiedData[stringColumns] = modifiedData[stringColumns].apply(lambda x: x.str.strip())

print("\nData After Removing Leading and Trailing Spaces\n")

print(modifiedData.head())



Data Before Removing Leading and Trailing Spaces

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  
Index(['Brand', 'Body', 'Engine Type', 'Registration', 'Model'], dtype='object')

Data After Removing Leading and Trailing Spaces

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        va

16. Replace a specific substring in a column with another substring.

In [200]:
# I am using above used 'modifiedData'

print("\nData Before Replacing\n")
print(modifiedData.head())

# I am replacing 'Mercedes-Benz' to 'Mercedes Benz' Remove hyphens

modifiedData['Brand'] = modifiedData['Brand'].apply(lambda x : x.replace("Mercedes-Benz","Mercedes Benz"))
print("\nData After Replacing\n")
print(modifiedData.head())

# As we can see 'Mercedes-Benz' replaces with 'Mercedes Benz'


Data Before Replacing

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  

Data After Replacing

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  cro

17. Extract a substring from each value in a column

In [208]:
subExtractedData = droppedRowData.copy()

print("\nData Before Extracting Substring\n")
print(subExtractedData.head())

# I will make an another column of named Brand + Model in which 2 characters of Brand and 3 characters of model are inserted

subExtractedData['Brand + Model'] = subExtractedData['Brand'].str[0:3]+" "+ subExtractedData["Model"].str[0:3]
print("\nData After Extracting Substring\n")
print(subExtractedData.head())



Data Before Extracting Substring

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  

Data After Extracting Substring

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3       

18. Convert a column to datetime format.

In [223]:
import datetime
# Using orignal copy of data

DatedData = data.copy()

print("\nData Before Converting to Datetime\n")
print(DatedData.head())

# Converting 'Year' column to datetime format
DatedData["Year"] = pd.to_datetime(DatedData["Year"], format="%Y") + pd.offsets.DateOffset(months=0, days=0) 
print("\nData After Converting to Datetime\n")

print(DatedData.head())


Data Before Converting to Datetime

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  

Data After Converting to Datetime

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3   

19. Extract year, month, and day from a datetime column.

In [225]:
modifiedData = DatedData.copy()

print("\nData Before Extracting Year, Month, and Day\n")

print(modifiedData.head())

# Extracting Year, Month, and Day from 'Year' column
modifiedData["years"] = modifiedData["Year"].dt.year 

modifiedData["months"] = modifiedData["Year"].dt.month

modifiedData["days"] = modifiedData["Year"].dt.day

print("\nData After Extracting Year, Month, and Day\n")

print(modifiedData.head())

# Now As we can see new Columns are are added named as 'years','month' and 'day'



Data Before Extracting Year, Month, and Day

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration       Year         Model  
0          yes 1991-01-01           320  
1          yes 1999-01-01  Sprinter 212  
2          yes 2003-01-01         S 500  
3          yes 2007-01-01            Q7  
4          yes 2011-01-01         Rav 4  

Data After Extracting Year, Month, and Day

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0 

20. Filter rows based on a date range.

In [231]:
# There are multiple ways to filter rows based on date range.

modifiedData = DatedData.copy()

# Method 1 (Using .loc())
filteredData = modifiedData.loc[(modifiedData["Year"]>= "1996-01-01") & (modifiedData["Year"] < "2005-01-01")]
print(filteredData.head())
print(filteredData.shape)

# Method 2 (Using DateTimeIndex)

#This will get all the records in year == 1999
filteredData = modifiedData.loc[modifiedData["Year"].dt.year == 1999]
print(filteredData.head())
print(filteredData.shape)


            Brand    Price   Body  Mileage  EngineV Engine Type Registration  \
1   Mercedes-Benz   7900.0    van      427      2.9      Diesel          yes   
2   Mercedes-Benz  13300.0  sedan      358      5.0         Gas          yes   
6             BMW   6100.0  sedan      438      2.0         Gas          yes   
9      Volkswagen   1400.0  other      212      1.8         Gas           no   
12           Audi   9500.0  vagon      165      2.7         Gas          yes   

         Year         Model  
1  1999-01-01  Sprinter 212  
2  2003-01-01         S 500  
6  1997-01-01           320  
9  1999-01-01       Golf IV  
12 2003-01-01    A6 Allroad  
(1084, 9)
            Brand   Price   Body  Mileage  EngineV Engine Type Registration  \
1   Mercedes-Benz  7900.0    van      427      2.9      Diesel          yes   
9      Volkswagen  1400.0  other      212      1.8         Gas           no   
26           Audi  6500.0  sedan      330      2.4      Petrol          yes   
70           

21. Convert a categorical column to numerical using one-hot encoding

Why Encoding is needed ? 

Encoding categorical variables is an essential data preprocessing step for machine learning as most algorithms require numerical input.

What is one-hot encoding ? 

One-hot encoding creates binary columns for each category, with 1 in the column for the category record and 0 for all other records.



In [237]:
newData = droppedRowData.copy()

# One-hot encoding for categorical columns
# I am only working on column 'Body'
print("\nTotal numbers of categories\n")
print(newData["Body"].nunique())
print("\nCategories in 'Body'\n")
print(newData["Body"].unique()) 

# Creating dummies for 'Body'
encodedData = pd.get_dummies(newData,columns=['Body'])

print("\nData After One-hot Encoding\n")

print(encodedData.head())

#As we can see 4 6 new columns added to the encodedData along with previous columns from newData named Body_crossover,  Body_hatch,  Body_other,  Body_sedan,Body_vagon  Body_van


Total numbers of categories

6

Categories in 'Body'

['sedan' 'van' 'crossover' 'vagon' 'other' 'hatch']

Data After One-hot Encoding

           Brand    Price  Mileage  EngineV Engine Type Registration  Year  \
0            BMW   4200.0      277      2.0      Petrol          yes  1991   
1  Mercedes-Benz   7900.0      427      2.9      Diesel          yes  1999   
2  Mercedes-Benz  13300.0      358      5.0         Gas          yes  2003   
3           Audi  23000.0      240      4.2      Petrol          yes  2007   
4         Toyota  18300.0      120      2.0      Petrol          yes  2011   

          Model  Body_crossover  Body_hatch  Body_other  Body_sedan  \
0           320           False       False       False        True   
1  Sprinter 212           False       False       False       False   
2         S 500           False       False       False        True   
3            Q7            True       False       False       False   
4         Rav 4            True       F

22. Convert a categorical column to numerical using label encoding.

In [240]:
# using 'droppedRowData'
newData = droppedRowData.copy()

# Label encoding for categorical columns
# I am here also choosing column 'Body' for encoding

#in below line , first i change type to category than use cat.codes which assign unique integer number to each category
newData['Labels'] = newData['Body'].astype('category').cat.codes
newData.head()

# As we can see new column 'Labels' is added to the newData along with previous columns from newData.


Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model,Labels
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,3
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212,5
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500,3
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7,0
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4,0


23. Group values in a categorical column and create a new column with grouped categories

In [245]:
# We have total 6 types of car body types 'Van,Sedan,crossover,vagon,hatch and other'
# I am making 2 groups 1 is consist of 'sedan' and 'hatch' called 'Compacts'
# And 2nd group consist every other body type and will be called multipurpose

newData = droppedRowData.copy()

# Creating a new column 'Grouped Category' to hold the categorized body type
newData['Grouped Category'] = newData['Body'].replace({"sedan":"Compact",'hatch':"Compact","van":"Multipurpose","vagon":"Multipurpose","crossover":"Multipurpose","other":"Multipurpose"})
newData.head()

# As we can see new column 'Grouped Category' is added to the newData along with previous columns from newData.

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model,Grouped Category
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,Compact
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212,Multipurpose
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500,Compact
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7,Multipurpose
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4,Multipurpose


24. Merge two DataFrames based on a common column.

In [246]:
# Since i don't have 2 different datasets , so i wll make 2 copies and merge them
copy1 = newData.copy()
copy2 = droppedRowData.copy()

# Merging both copies on 'Brand' column

mergedData = pd.merge(copy1, copy2, on='Brand')
mergedData.head()

# This operation merges two datasets. Columns that exist in both datasets will have their names suffixed with '_x' for the first dataset and '_y' for the second dataset.
# The 'Grouped Category' column remains unchanged, as it is a distinct column that is only present in copy1.




Unnamed: 0,Brand,Price_x,Body_x,Mileage_x,EngineV_x,Engine Type_x,Registration_x,Year_x,Model_x,Grouped Category,Price_y,Body_y,Mileage_y,EngineV_y,Engine Type_y,Registration_y,Year_y,Model_y
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,Compact,4200.0,sedan,277,2.0,Petrol,yes,1991,320
1,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,Compact,6100.0,sedan,438,2.0,Gas,yes,1997,320
2,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,Compact,2990.0,other,203,2.0,Petrol,no,2001,318
3,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,Compact,21500.0,other,72,3.0,Petrol,yes,2007,Z4
4,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,Compact,28500.0,crossover,160,4.8,Gas,yes,2008,X5


25. Concatenate two DataFrames vertically.

In [254]:
concatenateData1 = droppedRowData.copy()
concatenateData2 = droppedRowData.copy()

# Concatenating both copies vertically

concatenatedData = pd.concat([concatenateData1, concatenateData2], ignore_index=True ,axis=0)
concatenatedData.shape
print(concatenatedData.head())
print(concatenatedData.tail())


# This operation combines two datasets vertically. The Number of records doubled.



           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
2          yes  2003         S 500  
3          yes  2007            Q7  
4          yes  2011         Rav 4  
              Brand     Price   Body  Mileage  EngineV Engine Type  \
8045         Toyota   17900.0  sedan       35      1.6      Petrol   
8046  Mercedes-Benz  125000.0  sedan        9      3.0      Diesel   
8047            BMW    6500.0  sedan        1      3.5      Petrol   
8048            BMW    8000.0  sedan      194      2.0      Petrol   
8049   

26. Concatenate two DataFrames horizontally.

In [257]:
concatenateData1=droppedRowData.copy()
concatenateData2=droppedRowData.copy()

# Concatenating both copies horizontally

concatenatedData = pd.concat([concatenateData1 , concatenateData2],axis= 1 , ignore_index= False)
print("\nThe Shape Of Dataframe\n")
print(concatenatedData.shape)

print("\nThe First Five Records After Concatenation\n")

print(concatenatedData.head())

# This operation combines two datasets horizontally. The Number of columns doubled.



The Shape Of Dataframe

(4025, 18)

The First Five Records After Concatenation

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model          Brand    Price       Body  \
0          yes  1991           320            BMW   4200.0      sedan   
1          yes  1999  Sprinter 212  Mercedes-Benz   7900.0        van   
2          yes  2003         S 500  Mercedes-Benz  13300.0      sedan   
3          yes  2007            Q7           Audi  23000.0  crossover   
4          yes  2011         Rav 4         Toyota  18300.0  crossover   

   Mileage  EngineV Engine Type Registration  Year         

27. Create a new column based on existing columns.

In [None]:
# This task already done above . I am just copy pasting that code

newData = droppedRowData.copy()

# Creating a new column 'Grouped Category' to hold the categorized body type
newData['Grouped Category'] = newData['Body'].replace({"sedan":"Compact",'hatch':"Compact","van":"Multipurpose","vagon":"Multipurpose","crossover":"Multipurpose","other":"Multipurpose"})
newData.head()

# in this code i created a "Grouped Category" column by using the existing column 'Body'.

28. Discretize a continuous column into bins.

In [266]:
binnedData = droppedRowData.copy()

# Discretizing 'Price' column into 5 equal bins

# Method 1 (Using qcut())
binnedData["PriceBins"] = pd.qcut(binnedData["Price"],q=5)

print("\nData After Binning\n")

print(binnedData.head())

print("\nFrequency Of Data In Each Bin\n")

print(binnedData["PriceBins"].value_counts())

# Method 2 (Using cut())

binnedData = droppedRowData.copy()

# Discretizing 'Price' column into 3 equal bins and label them 1,2,3

binnedData["PriceBins"] = pd.cut(binnedData["Price"], bins=3, labels=[1,2,3])

print("\nData After Binning\n")

print(binnedData.head())


Data After Binning

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan      358      5.0         Gas   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   

  Registration  Year         Model           PriceBins  
0          yes  1991           320   (599.999, 6000.0]  
1          yes  1999  Sprinter 212    (6000.0, 9400.0]  
2          yes  2003         S 500   (9400.0, 13999.0]  
3          yes  2007            Q7  (13999.0, 25500.0]  
4          yes  2011         Rav 4  (13999.0, 25500.0]  

Frequency Of Data In Each Bin

PriceBins
(599.999, 6000.0]      808
(9400.0, 13999.0]      807
(13999.0, 25500.0]     805
(6000.0, 9400.0]       804
(25500.0, 300000.0]    801
Name: count, dtype: int64

Data After B

29. Create polynomial features from existing numerical columns.


In [295]:
# Importing Important Libraries

from sklearn.preprocessing import PolynomialFeatures

# I am using 'Price' and 'Mileage' columns for creating polynomial features

realpolynomialData = droppedRowData.copy()

# Creating polynomial features with degree 2 (quadratic)
polynomialFeatures = PolynomialFeatures(degree=2)

# Fitting and transforming 'Price' and 'Mileage' columns
polyFeaturesData = polynomialFeatures.fit_transform(realpolynomialData[['Price', 'Mileage']])

# Getting the names of the new polynomial features
polyFeatureNames = polynomialFeatures.get_feature_names_out(['Price', 'Mileage'])

# Converting the transformed polynomial features into a DataFrame and dropping the original 'Price' and 'Mileage' columns
polynomialData = pd.DataFrame(polyFeaturesData, columns=polyFeatureNames)
polynomialData.drop(["Price","Mileage"],axis=1,inplace=True)

# Adding the polynomial features to the original DataFrame
polynomialData=pd.concat([realpolynomialData,polynomialData],axis = 1)

print("\nData After Polynomial Features\n")
print(polynomialData.head())



Data After Polynomial Features

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan    277.0      2.0      Petrol   
1  Mercedes-Benz   7900.0        van    427.0      2.9      Diesel   
2  Mercedes-Benz  13300.0      sedan    358.0      5.0         Gas   
3           Audi  23000.0  crossover    240.0      4.2      Petrol   
4         Toyota  18300.0  crossover    120.0      2.0      Petrol   

  Registration    Year         Model    1      Price^2  Price Mileage  \
0          yes  1991.0           320  1.0   17640000.0      1163400.0   
1          yes  1999.0  Sprinter 212  1.0   62410000.0      3373300.0   
2          yes  2003.0         S 500  1.0  176890000.0      4761400.0   
3          yes  2007.0            Q7  1.0  529000000.0      5520000.0   
4          yes  2011.0         Rav 4  1.0  334890000.0      2196000.0   

   Mileage^2  
0    76729.0  
1   182329.0  
2   128164.0  
3    57600.0  
4    14400.0  


# Extra Practice Questions

27. Sort a DataFrame by multiple columns.


In [296]:
# I am sorting the data first by 'Year' in descending order and then by 'Price' in ascending order

sortedData = droppedRowData.copy()
sortedData = sortedData.sort_values(by=['Year', 'Price'], ascending=[False, True])
print("\nData After Sorting\n")
print(sortedData.head())

# As we can see, rows are sorted first by 'Year' in descending order and then by 'Price' in ascending order.


Data After Sorting

           Brand     Price   Body  Mileage  EngineV Engine Type Registration  \
1147     Renault   9658.94  sedan        1      1.2      Petrol          yes   
2432      Toyota  10500.00  hatch       17      1.0      Petrol          yes   
1999     Renault  11403.99  sedan        0      1.2      Petrol          yes   
2801      Toyota  11707.80  hatch        1      1.0      Petrol          yes   
4080  Volkswagen  12114.35  sedan        0      1.6      Petrol          yes   

      Year  Model  
1147  2016  Logan  
2432  2016  Yaris  
1999  2016  Logan  
2801  2016  Yaris  
4080  2016   Polo  


28. Find the sum of values in a column.

In [297]:
# finding the sum of 'Price' column

sumOfPrice = droppedRowData['Price'].sum()
print("\nSum of Price\n")
print(sumOfPrice)



Sum of Price

78698039.96000001


29. Find the mean of values in a column.




In [302]:
# finding the mean of 'Price' column

meanOfPrice = droppedRowData['Price'].mean()
print("\nMean of Price\n")
print(meanOfPrice)


Mean of Price

19552.308064596276


30. Find the median of values in a column.



In [301]:
# finding the median of 'Price' column

medianOfPrice = droppedRowData['Price'].median()
print("\nMedian of Price\n")
print(medianOfPrice)


Median of Price

11500.0


31. Find the mode of values in a column.




In [314]:
#finding the mode of 'Price' column

modeOfPrice = droppedRowData['Price'].mode()
print("\nMode of Price\n")
print(modeOfPrice)


Mode of Price

0    6500.0
Name: Price, dtype: float64


32. Find the standard deviation of values in a column.


In [303]:

# finding the standard deviation of 'Price' column

stdDeviationOfPrice = droppedRowData['Price'].std()
print("\nStandard Deviation of Price\n")
print(stdDeviationOfPrice)



Standard Deviation of Price

25815.73498820784


33. Find the variance of values in a column.


In [304]:

# finding the variance of 'Price' column

varianceOfPrice = droppedRowData['Price'].var()
print("\nVariance of Price\n")
print(varianceOfPrice)



Variance of Price

666452172.9813786


34. Drop rows based on a condition.


In [305]:

# dropping the rows where 'Year' is less than 2010

droppedData = droppedRowData[droppedRowData['Year'] >= 2010]
print("\nDropped Data\n")
print(droppedData.head())



Dropped Data

            Brand     Price       Body  Mileage  EngineV Engine Type  \
4          Toyota   18300.0  crossover      120      2.0      Petrol   
5   Mercedes-Benz  199999.0  crossover        0      5.5      Petrol   
8         Renault   10799.0      vagon      193      1.5      Diesel   
10        Renault   11950.0      vagon      177      1.5      Diesel   
15        Renault    8600.0      hatch       84      1.5      Diesel   

   Registration  Year   Model  
4           yes  2011   Rav 4  
5           yes  2016  GLS 63  
8           yes  2012  Megane  
10          yes  2011  Megane  
15          yes  2012    Clio  


35. Drop columns based on a condition.


In [306]:

# I am dropping the 'Body' column

droppedColumnData = droppedRowData.drop('Body', axis=1)
print("\nDropped Column Data\n")
print(droppedColumnData.head())



Dropped Column Data

           Brand    Price  Mileage  EngineV Engine Type Registration  Year  \
0            BMW   4200.0      277      2.0      Petrol          yes  1991   
1  Mercedes-Benz   7900.0      427      2.9      Diesel          yes  1999   
2  Mercedes-Benz  13300.0      358      5.0         Gas          yes  2003   
3           Audi  23000.0      240      4.2      Petrol          yes  2007   
4         Toyota  18300.0      120      2.0      Petrol          yes  2011   

          Model  
0           320  
1  Sprinter 212  
2         S 500  
3            Q7  
4         Rav 4  


36. Calculate the sum, mean, median, variance, and standard deviation of values in a column grouped by another column.


In [317]:

newData = droppedRowData.copy()
groupedSum = newData.groupby('Brand')['Price'].sum()
groupedMean = newData.groupby('Brand')['Price'].mean()
groupedMedian = newData.groupby('Brand')['Price'].median()
groupedVariance = newData.groupby('Brand')['Price'].var()
groupedStandartDev = newData.groupby('Brand')['Price'].std()



print("\nGrouped Sum\n")
print(groupedSum)

print("\nGrouped Mean\n")
print(groupedMean)

print("\nGrouped Median\n")
print(groupedMedian)

print("\nGrouped Variance\n")
print(groupedVariance)


print("\nGrouped Standard Deviation\n")
print(groupedStandartDev)


Grouped Sum

Brand
Audi              7940770.00
BMW              15134836.25
Mercedes-Benz    25369636.00
Mitsubishi        3500830.30
Renault           3589669.92
Toyota           11614109.21
Volkswagen       11548188.28
Name: Price, dtype: float64

Grouped Mean

Brand
Audi             18906.595238
BMW              23648.181641
Mercedes-Benz    30825.803159
Mitsubishi       11403.356026
Renault           8066.673978
Toyota           22772.763157
Volkswagen       13122.941227
Name: Price, dtype: float64

Grouped Median

Brand
Audi             12100.00
BMW              14945.00
Mercedes-Benz    14500.00
Mitsubishi        9500.00
Renault           7650.00
Toyota           16586.25
Volkswagen       10000.00
Name: Price, dtype: float64

Grouped Mode

Brand
Audi              3500.0
BMW               6500.0
Mercedes-Benz     5500.0
Mitsubishi        9200.0
Renault          10500.0
Toyota           12500.0
Volkswagen        9000.0
Name: Price, dtype: float64

Grouped Variance

Brand
Audi    

37. Calculate the count of occurrences of each unique value in a column grouped by another column.



In [320]:
# finding the count of occurrences of each unique value in 'Brand' column grouped by 'Year'
groupedCountData = newData.groupby('Brand')['Year'].value_counts()
print("\nGrouped Count Data\n")
print(groupedCountData)


Grouped Count Data

Brand       Year
Audi        2007    28
            2003    27
            2013    27
            2010    25
            2011    23
                    ..
Volkswagen  1984     3
            1986     3
            1980     1
            1982     1
            1985     1
Name: count, Length: 225, dtype: int64


38. Calculate the correlation coefficient between two columns grouped by another column.



In [347]:
# finding correlation coefficient between every numeric column with every other numeric column

numericaColumns = droppedRowData.select_dtypes(include='number')

correlationWithPrice = numericaColumns.corr()['Price']
print("\nCorrelation with Price\n")
print(correlationWithPrice)

correlationWithMileage = numericaColumns.corr()['Mileage']
print("\nCorrelation with Mileage\n")
print(correlationWithMileage)

correlationWithYear = numericaColumns.corr()['Year']
print("\nCorrelation with Year\n")
print(correlationWithYear)

correlationWithEngineV = numericaColumns.corr()["EngineV"]
print("\nCorrelation with EngineV\n")
print(correlationWithEngineV)


Correlation with Price

Price      1.000000
Mileage   -0.473523
EngineV    0.058664
Year       0.485734
Name: Price, dtype: float64

Correlation with Mileage

Price     -0.473523
Mileage    1.000000
EngineV    0.024935
Year      -0.664573
Name: Mileage, dtype: float64

Correlation with Year

Price      0.485734
Mileage   -0.664573
EngineV   -0.030321
Year       1.000000
Name: Year, dtype: float64

Correlation with EngineV

Price      0.058664
Mileage    0.024935
EngineV    1.000000
Year      -0.030321
Name: EngineV, dtype: float64


39. Calculate the correlation coefficient between two columns grouped by another column.


In [358]:

# I am calculating the correlation coefficient between 'Price' and 'Year' column grouped by 'Brand'
newData = droppedRowData.copy()

groupedCorrelationData = newData.groupby('Brand').apply(lambda x : x[['Price', 'Year']].corr().iloc[0,1])

print("\nGrouped Correlation Data\n")

print(groupedCorrelationData)

#The correlation coefficients indicate that, generally, newer vehicles tend to be priced higher, but the degree to which this is true varies by brand.
# This information can be useful for understanding market trends and for making informed decisions in pricing, purchasing, or sales strategies for different brands.


Grouped Correlation Data

Brand
Audi             0.745610
BMW              0.784456
Mercedes-Benz    0.560679
Mitsubishi       0.667733
Renault          0.715568
Toyota           0.485143
Volkswagen       0.596831
dtype: float64


  groupedCorrelationData = newData.groupby('Brand').apply(lambda x : x[['Price', 'Year']].corr().iloc[0,1])
