In [1]:
# !pip install gensim

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [3]:
data = pd.read_csv('data.csv')

In [4]:
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
data.describe(include='all')

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
count,11914,11914,11914.0,11911,11845.0,11884.0,11914,11914,11908.0,8172,11914,11914,11914.0,11914.0,11914.0,11914.0
unique,48,915,,10,,,5,4,,71,3,16,,,,
top,Chevrolet,Silverado 1500,,regular unleaded,,,AUTOMATIC,front wheel drive,,Crossover,Compact,Sedan,,,,
freq,1123,156,,7172,,,8266,4787,,1110,4764,3048,,,,
mean,,,2010.384338,,249.38607,5.628829,,,3.436093,,,,26.637485,19.733255,1554.911197,40594.74
std,,,7.57974,,109.19187,1.780559,,,0.881315,,,,8.863001,8.987798,1441.855347,60109.1
min,,,1990.0,,55.0,0.0,,,2.0,,,,12.0,7.0,2.0,2000.0
25%,,,2007.0,,170.0,4.0,,,2.0,,,,22.0,16.0,549.0,21000.0
50%,,,2015.0,,227.0,6.0,,,4.0,,,,26.0,18.0,1385.0,29995.0
75%,,,2016.0,,300.0,6.0,,,4.0,,,,30.0,22.0,2009.0,42231.25


In [6]:
#Removing the NAN Values
data.dropna(inplace=True)
data.reset_index(inplace=True)
data.isnull().sum()

index                0
Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Market Category      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64

### Preprocessing the by data by removing stopwords and tokenizing 

In [7]:

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

data['Market Category'] = data['Market Category'].apply(preprocess)

data['Market Category']


0             [factory, tuner, luxury]
1                [luxury, performance]
2                             [luxury]
3                [luxury, performance]
4                             [luxury]
                     ...              
8079    [crossover, hatchback, luxury]
8080    [crossover, hatchback, luxury]
8081    [crossover, hatchback, luxury]
8082    [crossover, hatchback, luxury]
8083                          [luxury]
Name: Market Category, Length: 8084, dtype: object

###  Perform the bag-of-words approach using CountVectorizer

In [8]:
count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
bow = count_vectorizer.fit_transform(data['Market Category'])
print(bow.toarray()[:5, :10])

[[0 0 0 1 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 0]]




* We can also calculate the normalized count occurrence by dividing the count by the length of each document

In [9]:
normalized_count = bow.copy()
for i, j in zip(*normalized_count.nonzero()): 
    normalized_count[i, j] = normalized_count[i, j] / len(data['Market Category'][i])
print(normalized_count.toarray()[:5, :10]) # Print the first 5 rows and 10 columns of the matrix


[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]]


### TF-IDF to calculate the importance of each word in the document

In [10]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)
tfidf = tfidf_vectorizer.fit_transform(data['Market Category'])
print(tfidf.toarray()[:5, :10]) # Print the first 5 rows and 10 columns of the matrix


[[0.         0.         0.         0.6616931  0.         0.
  0.         0.         0.35259679 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.62784336 0.77833972]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.62784336 0.77833972]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.        ]]


### Creating word embeddings using Word2Vec

In [11]:
model = Word2Vec(data['Market Category'], min_count=1)
embeddings = np.zeros((len(data), 100))
for i, tokens in enumerate(data['Market Category']):
    for token in tokens:
        embeddings[i] += model.wv[token]
    embeddings[i] /= len(tokens)


  embeddings[i] /= len(tokens)


In [12]:
embeddings

array([[-0.01397511,  0.0086739 ,  0.01344167, ..., -0.03780531,
         0.00934406,  0.00226696],
       [-0.02213832,  0.01241343,  0.01302692, ..., -0.0335934 ,
         0.00320444,  0.00510586],
       [-0.02166208,  0.01290463,  0.01443911, ..., -0.04168265,
         0.01012834,  0.00581397],
       ...,
       [-0.01449299,  0.00865786,  0.00388291, ..., -0.02331629,
         0.0088915 , -0.00283403],
       [-0.01449299,  0.00865786,  0.00388291, ..., -0.02331629,
         0.0088915 , -0.00283403],
       [-0.02166208,  0.01290463,  0.01443911, ..., -0.04168265,
         0.01012834,  0.00581397]])

* We have successfully created bag-of-words vectors, normalized count occurrence vectors, TF-IDF vectors, and word embeddings for the dataset.