In [2]:
!pip install umap-learn


Collecting umap-learn
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.11-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: umap-learn
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.5-py3-none-any.whl size=86832 sha256=1137d226dd0928dfae5b1ff32b91e218aaadffd7c2dc12ae04e172005d0d48f7
  Stored in directory: /root/.cache/pip/wheels/3a/70/07/428d2b58660a1a3b431db59b806a10da736612ebbc66c1bcc5
Successfully built umap-learn
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.11 umap-learn-0.5.5


In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [20]:
# Load dataset
df = pd.read_csv('/content/cars_ds_final.csv')

In [21]:
for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].mean(), inplace=True)

In [22]:
# Impute missing values with mode (for categorical columns)
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [23]:
# Handling categorical variables
df = pd.get_dummies(df)

In [24]:
# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

In [18]:
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [26]:
umap = UMAP(n_components=3, random_state=42)  # 3 components as an example
df_reduced = umap.fit_transform(df_scaled)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [27]:
kmeans = KMeans(n_clusters=3, random_state=42)  # 3 clusters as an example
clusters = kmeans.fit_predict(df_reduced)



In [28]:
# Feature Extraction for Classification
# Here we use PCA for simplicity, but this can be adjusted
pca = PCA(n_components=5)  # 5 components as an example
df_features = pca.fit_transform(df_scaled)

In [29]:
# Classification
X_train, X_test, y_train, y_test = train_test_split(df_features, clusters, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [30]:
# Predictions and Evaluation
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       305
           1       0.90      0.90      0.90        10
           2       0.90      0.90      0.90        68

    accuracy                           0.96       383
   macro avg       0.92      0.92      0.92       383
weighted avg       0.96      0.96      0.96       383



In [32]:
#  Save the reduced dataset and clusters for further analysis
df_reduced_with_clusters = pd.DataFrame(df_reduced, columns=['UMAP1', 'UMAP2', 'UMAP3'])
df_reduced_with_clusters['Cluster'] = clusters
df_reduced_with_clusters.to_csv('reduced_data_with_clusters.csv', index=False)