In [None]:
# Load the required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import prince

In [None]:
# Task 1 - Load a dataset and extract basic information

df = pd.read_csv('data.csv')

print(df.shape, "\n")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own instead of being wrapped in print() (which
# would append a stray "None" line to the output).
df.info()
print()
print(df.describe(), "\n")

# Number of distinct values in each column. The f-string also works for
# column labels that are not strings.
for col in df.columns:
    print(f"{col} {df[col].nunique()}")

In [None]:
# Task 2 - Clean your data set
# Starting here? Use Kernel > Restart and Run All so that every cell from the
# previous tasks has been executed first.

# Append a trailing space to the value in row 0, column 6 and print it before
# and after the change.
# NOTE(review): column 6 is presumably 'Transmission Type', given that the
# following cells inspect and strip that column - confirm against data.csv.
first_value = df.iloc[0, 6]
print(first_value)
df.iloc[0, 6] = first_value + ' '
print(df.iloc[0, 6])

In [None]:
# Row counts per 'Transmission Type': once through groupby/size (printed) and
# once through value_counts (shown as the cell output).
transmission_sizes = df.groupby(['Transmission Type']).size()
print(transmission_sizes, '\n')
df['Transmission Type'].value_counts()

In [None]:
def _strip_if_object(column):
    """Strip surrounding whitespace from object-dtype columns; pass others through."""
    if column.dtype == 'object':
        return column.str.strip()
    return column


df = df.apply(_strip_if_object)
# Re-check the category counts after stripping.
df['Transmission Type'].value_counts()

In [None]:
# Fraction of missing values per column, computed once and reused below.
missing_share = df.isna().mean()
print(missing_share)
missing_share.plot(kind='barh')
# Keep only the columns in which fewer than 30% of the values are missing.
df = df.loc[:, missing_share < 0.3]

In [None]:
# Turn literal 'NA' strings into real NaN values, then drop every row that
# still has at least one missing value.
df = df.replace('NA', np.nan).dropna()
df.info()

In [None]:
# Report how many fully duplicated rows exist, then remove them.
duplicate_count = df.duplicated().sum()
print(duplicate_count)
df = df.drop_duplicates()
df.info()

In [None]:
# Task 3 - Visualize patterns and outliers in your data set
# Starting here? Use Kernel > Restart and Run All so that every cell from the
# previous tasks has been executed first.

# Scatter plot of 'Engine HP' against 'city mpg', drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='Engine HP', y='city mpg', ax=ax)
plt.xticks(rotation=45)

In [None]:
# Count of rows per 'Transmission Type', split by 'Vehicle Style'.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    data=df,
    x='Transmission Type',
    hue='Vehicle Style',
    ax=ax,
)
plt.xticks(rotation=45)

In [None]:
# Distribution of 'Engine HP' within each 'Transmission Type'.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='Transmission Type', y='Engine HP', ax=ax)
plt.xticks(rotation=45)

In [None]:
# Names of all numeric columns; reused by later cells to build df_num.
numeric_cols = list(df.select_dtypes(include=np.number).columns)
# Pairwise scatter plots of the numeric columns, coloured by 'Number of Doors'.
sns.pairplot(
    data=df,
    vars=numeric_cols,
    hue="Number of Doors",
    palette='Set3',
)

In [None]:
# Task 4 - Find and visualize correlations in your data set
# Starting here? Use Kernel > Restart and Run All so that every cell from the
# previous tasks has been executed first.

# Numeric subset of the cleaned frame and its pairwise correlation matrix.
df_num = df[numeric_cols]
df_corr = df_num.corr()

# Print the matrix, then the per-column mean correlation (signed and absolute).
for summary in (df_corr, df_corr.mean(), df_corr.abs().mean()):
    print(summary, "\n")

In [None]:
# Heatmap of the correlation matrix on a diverging palette centred at 0,
# with the colour scale fixed to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(12, 8))
diverging_cmap = sns.diverging_palette(20, 220, n=200)
sns.heatmap(
    df_corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=diverging_cmap,
    square=True,
    ax=ax,
)
plt.xticks(rotation=45)

In [None]:
# Task 5 - Cluster your data set to identify similar groups
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All

print(df_num.columns.tolist())

# Standardize every numeric column (zero mean, unit variance) before clustering.
X = StandardScaler().fit_transform(df_num)

# init='random' picks the starting centroids at random, so the seed is fixed to
# make the cluster assignment reproducible under Restart and Run All.
# n_init is spelled out (10 restarts) so the result does not depend on the
# library's version-specific default.
kmeans = KMeans(n_clusters=5, init='random', n_init=10, random_state=42)
kmeans.fit(X)

# Cluster label for every row of X; the cell output lists the distinct labels.
pred = kmeans.predict(X)
np.unique(pred)

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# Positions (within df_num, and therefore within X) of the two features that
# are used as plot axes. Named once so points and centers cannot drift apart.
x_idx, y_idx = 1, 7

# Data points, coloured by their assigned cluster.
ax.scatter(X[:, x_idx], X[:, y_idx], c=pred, cmap='viridis')

# Cluster centers, drawn on top in grey.
centers = kmeans.cluster_centers_
ax.scatter(centers[:, x_idx], centers[:, y_idx], c='grey', s=50)

# Label the axes with the actual column names so the figure stands on its own.
ax.set_xlabel(f'{df_num.columns[x_idx]} (standardized)')
ax.set_ylabel(f'{df_num.columns[y_idx]} (standardized)')
ax.set_title('K-means clusters and cluster centers')
plt.show()

In [None]:
# Task 6 - Visualize your data set with principal component analysis (PCA)
# Starting here? Use Kernel > Restart and Run All so that every cell from the
# previous tasks has been executed first.

# A float n_components asks for enough components to reach that share of the
# variance (here 95%).
pca = PCA(n_components=0.95)

# Fit on the standardized matrix, then project the same matrix.
pcad = pca.fit(X).transform(X)

print(pca.explained_variance_ratio_)

In [None]:
# Scatter of the first two principal components.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(pcad[:, 0], pcad[:, 1])

In [None]:
# PCA through the prince library, which centers and scales the columns itself
# (rescale_with_mean / rescale_with_std), so it is fitted on the raw df_num.
# random_state is fixed so that the decomposition is reproducible across runs.
pca2 = prince.PCA(
    n_components=6,
    n_iter=3,
    rescale_with_mean=True,
    rescale_with_std=True,
    copy=True,
    engine='auto',
    random_state=42,
)
pca2 = pca2.fit(df_num)
# NOTE(review): explained_inertia_ and engine='auto' look like the older prince
# API - confirm the installed prince version still provides them.
pca2.explained_inertia_

In [None]:
# Plot the rows of df_num on components 0 and 1, coloured by the
# 'Transmission Type' of the corresponding row in df.
transmission_labels = df['Transmission Type']
ax = pca2.plot_row_coordinates(
    df_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=transmission_labels,
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)