## Logistic Regression

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
df = pd.read_csv('New_Data_FullyCLeaned.csv')
# Concatenate the text columns into a single column
df['text_combined'] = df['model'] + ' ' + df['location'] + ' ' + df['engine_type'] + ' ' + df['transmission']

X = df[['text_combined']]
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:

# Define the column transformer
column_trans = make_column_transformer(
    (CountVectorizer(), 'text_combined'),  # BoW feature extraction on the combined text
    remainder='passthrough'  # Pass through any non-text columns as they are
)

# Create a pipeline with the column transformer and linear regression
bow_lr_pipe = make_pipeline(
    column_trans,
    LinearRegression()
)

# Fit the pipeline on the training data
bow_lr_pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred_bow = bow_lr_pipe.predict(X_test)

# Evaluate the model performance
r2_bow = r2_score(y_test, y_pred_bow)
print(f'R-squared score with BoW on the test set: {r2_bow:.2f}')

R-squared score with BoW on the test set: 0.74


## Naive bayes Algorithm

In [22]:
# Define the column transformer
column_trans = make_column_transformer(
    (CountVectorizer(), 'text_combined'),  # BoW feature extraction on the combined text
    remainder='passthrough'  # Pass through any non-text columns as they are
)

# Create a pipeline with the column transformer and Naive Bayes
bow_nb_pipe = Pipeline([
    ('bow_nb', column_trans),
    ('nb', MultinomialNB())
])

# Fit the pipeline on the training data
bow_nb_pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred_bow_nb = bow_nb_pipe.predict(X_test)

# Evaluate the model performance
r2_bow_nb = r2_score(y_test, y_pred_bow_nb)
print(f'R-squared score with BoW and Naive Bayes on the test set: {r2_bow_nb:.2f}')

R-squared score with BoW and Naive Bayes on the test set: 0.55


## Random Forest Algorithm

In [23]:
column_trans = make_column_transformer(
    (CountVectorizer(), 'text_combined'),
    remainder='passthrough'
)

# Create a pipeline with BoW and RandomForestRegressor
pipeline = make_pipeline(column_trans, RandomForestRegressor())

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)
print(f'R-squared score on the test set: {r2:.2f}')

R-squared score on the test set: 0.73


## SVM Algorithm

In [25]:
column_trans = make_column_transformer(
    (CountVectorizer(), 'text_combined'),
    remainder='passthrough'
)

# Create a pipeline with BoW, standard scaler, and SVM regressor
pipeline = make_pipeline(
    column_trans,
    StandardScaler(with_mean=False),  # Standardize features by removing the mean and scaling to unit variance
    SVC()  # Support Vector Regression
)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)
print(f'R-squared score on the test set: {r2:.2f}')

R-squared score on the test set: 0.38
