<a href="https://colab.research.google.com/github/Binson-1/ML-proj/blob/master/machine_learning_proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Upload the dataset through Colab's file helper. The result is indexed by
# file name when the CSV is read (uploaded['data.csv']), so the uploaded file
# must be named exactly 'data.csv'.
from google.colab import files 


uploaded = files.upload()


1. Loading Data into Data Frame

In [None]:
import io

# The uploaded entry is wrapped in an in-memory binary buffer so pandas can
# parse it like a file on disk.
df = pd.read_csv(
    io.BytesIO(uploaded['data.csv']),
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Preview the last rows to check the end of the file was read correctly.
df.tail()

1. Checking types of data and basic summary stats

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Basic summary statistics of the frame's columns.
df.describe()

In [None]:
# (rows, columns) of the raw data, before any cleaning.
df.shape

2. Dropping irrelevant columns

In [None]:
# Remove columns that are rarely used or judged to have no effect on price.
df = df.drop(columns=['Engine Fuel Type', 'Number of Doors', 'Market Category'])
df.head()

3. Renaming the columns

In [None]:
# Shorter column names used throughout the rest of the notebook.
df = df.rename(
    columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price",
    }
)
df.head()

4. Dropping the duplicate rows

In [None]:
# Rows that are exact repeats of an earlier row (the first occurrence is not
# flagged by duplicated()).
duplicate_rows_df = df[df.duplicated()]
# Report the row count itself so the value matches the label; the original
# printed the whole (rows, columns) shape tuple.
print("No. of duplicate rows:", len(duplicate_rows_df))

In [None]:
# Keep only the first occurrence of each duplicated row, then preview.
df = df.drop_duplicates()
df.head()

In [None]:
# Shape after removing duplicate rows.
df.shape

In [None]:
# Non-null count per column; columns with a lower count contain missing values.
df.count()

4. Dropping the missing/null values

In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:
# Discard every row that has at least one missing value, then confirm that all
# columns now report the same non-null count.
df = df.dropna()
df.count()

5. Detecting Outliers

In [None]:
# Box plot of Price; points beyond the whiskers are outlier candidates.
sns.boxplot(x=df['Price'])

In [None]:
# Box plot of HP; points beyond the whiskers are outlier candidates.
sns.boxplot(x=df['HP'])

In [None]:
# Box plot of Cylinders; points beyond the whiskers are outlier candidates.
sns.boxplot(x=df['Cylinders'])

5. Removing Outliers

In [None]:
# Per-column quartiles for the IQR outlier rule.
# numeric_only=True limits the computation to numeric columns; without it,
# pandas 2.x tries to take quantiles of the non-numeric columns (e.g. 'Make')
# and raises a TypeError.
q1 = df.quantile(0.25, numeric_only=True)
q3 = df.quantile(0.75, numeric_only=True)
IQR = q3 - q1  # interquartile range of each numeric column
print(IQR)

In [None]:
# IQR rule: drop every row that has, in any numeric column, a value outside
# [q1 - 1.5 * IQR, q3 + 1.5 * IQR].
lower_fence = q1 - 1.5 * IQR
upper_fence = q3 + 1.5 * IQR

# Compare only the columns the quartiles were computed for. Comparing the whole
# frame (including text columns) against these Series relies on automatic
# alignment, which newer pandas versions reject with an error.
numeric_part = df[IQR.index]
is_outlier = ((numeric_part < lower_fence) | (numeric_part > upper_fence)).any(axis=1)

# .copy() gives an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[~is_outlier].copy()
df.shape

6. Top Car Brands

In [None]:
#percentage-cars/brand
counts = df['Make'].value_counts() * 100 / sum(df['Make'].value_counts())
#top 10
pop=counts.index[:10]
#graph
plt.figure(figsize=(10,5))
plt.barh(pop,width = counts[:10])
plt.title('Top 10 Brands')
plt.show()

6. Finding Average Prices

In [None]:
# Mean price for a fixed selection of makes.
selected_makes = [
    'Chevrolet', 'Ford', 'Volkswagen', 'Toyota', 'Dodge',
    'Nissan', 'GMC', 'Honda', 'Mazda',
]
is_selected = df['Make'].isin(selected_makes)
prices = df.loc[is_selected, ['Make', 'Price']].groupby('Make').mean()
print(prices)

7. Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns. Text columns are excluded
# explicitly because DataFrame.corr() in pandas 2.x no longer drops them
# silently and fails when it cannot convert them to float.
df.select_dtypes(include='number').corr()

In [None]:
# Author's observations from the heatmap:
#   - high correlation between (1) Cylinders & HP and (2) highway mpg & city mpg
#   - high anti-correlation between Cylinders and highway mpg
plt.figure(figsize=(10, 5))
# Restrict to numeric columns: DataFrame.corr() in pandas 2.x fails on text
# columns instead of dropping them silently.
c = df.select_dtypes(include='number').corr()
sns.heatmap(c, cmap="BrBG", annot=True)

8. Scatterplot

In [None]:
# Scatter plot of price against horsepower.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['HP'], df['Price'])
ax.set_xlabel('HP')
ax.set_ylabel('Price')
# The original referenced plt.show without calling it, which only echoed the
# function object as the cell output.
plt.show()

Which vehicle style segment is sold the most?

In [None]:
# Number of rows per body style, most frequent first.
body_ax = df['Vehicle Style'].value_counts().plot.bar(figsize=(10, 6))
body_ax.set_title("Cars sold by body")
body_ax.set_ylabel('No. of vehicles')
body_ax.set_xlabel('Body type')

In [None]:
#Vehicle Style type and Drive type analysis
sns.countplot(y = 'Vehicle Style',data = df,hue = 'Drive Mode')
plt.title("Vehicle type v/s Drive mode type")
plt.ylabel('Vehicle type')
plt.xlabel('Count of vehicles')

Creating a new column "price_group"

In [None]:
#Creating new column 'Price_group' and assign value based on ar price
df['price_group'] = pd.cut(df['Price'],[0,20000,40000,60000,80000,100000,600000],labels = ['<20K','20-39K','40-59K','60-79K','80-99K','>100k'],include_lowest = True)
df['price_group'] = df['price_group'].astype(object)

In [None]:
# Percentage of all rows that fall into each price group.
group_share = df['price_group'].value_counts() / len(df) * 100
share_ax = group_share.plot.bar(figsize=(10, 6))
share_ax.set_title("Price Group bar diagram")
share_ax.set_ylabel('% of vehicles')
share_ax.set_xlabel('Price Group')

9. ML Model

In [None]:
# Model inputs: six numeric predictors and the price as the target.
feature_columns = ['Popularity', 'Year', 'HP', 'Cylinders', 'MPG-H', 'MPG-C']
X = df[feature_columns].values
y = df['Price'].values

In [None]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1,1))

In [None]:
#splitting dataset-trainingset,test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [None]:
#fitting multiple linear regression to training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [None]:
#predicting test results
y_pred = regressor.predict(X_test)
plt.scatter(y_test,y_pred)

In [None]:
# Distribution of the residuals (actual minus predicted) on the test set.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# consider histplot/displot; confirm against the installed seaborn version.
sns.distplot((y_test-y_pred),bins = 50)