## Linear Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import FunctionTransformer


In [2]:
# Load the cleaned dataset (path is relative to the notebook's working directory).
df = pd.read_csv('New_Data_FullyCLeaned.csv')

# Build one free-text feature from the four descriptive columns.
# Missing values are replaced with '' and every value is cast to str first:
# plain `+` concatenation propagates NaN (and raises on non-string columns),
# and a NaN "document" is rejected by HashingVectorizer downstream.
text_columns = ['model', 'location', 'engine_type', 'transmission']
text_parts = df[text_columns].fillna('').astype(str)
df['text_combined'] = text_parts[text_columns[0]].str.cat(
    text_parts[text_columns[1:]], sep=' '
)

# Keep X two-dimensional (a one-column DataFrame) so a column transformer
# can select 'text_combined' by name.
X = df[['text_combined']]
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Text featurisation: hash the tokens of 'text_combined' into a sparse matrix.
# Any other columns present in X would be forwarded unchanged.
column_trans = make_column_transformer(
    (HashingVectorizer(), 'text_combined'),
    remainder='passthrough',
)

# Chain the featuriser with an ordinary least-squares regressor.
hash_lr_pipe = make_pipeline(column_trans, LinearRegression())

# Train on the training split, then predict prices for the held-out rows.
hash_lr_pipe.fit(X_train, y_train)
y_pred_hash = hash_lr_pipe.predict(X_test)

# Report the coefficient of determination on the test split.
r2_hash = r2_score(y_test, y_pred_hash)
print(f'R-squared score with Hashing on the test set: {r2_hash:.2f}')

R-squared score with Hashing on the test set: 0.74


## Naive Bayes Algorithm

In [4]:
def make_non_negative(X):
    """Return the element-wise absolute value of ``X``.

    No longer used by the pipeline in this cell (the vectorizer now emits
    non-negative features itself); kept so existing references still resolve.
    """
    return np.abs(X)


# MultinomialNB rejects negative feature values. HashingVectorizer produces
# signed features by default, so the sign alternation is switched off at the
# source (alternate_sign=False) instead of taking abs() of the hashed matrix
# in an extra pipeline step.
column_trans = make_column_transformer(
    (HashingVectorizer(alternate_sign=False), 'text_combined'),
    remainder='passthrough',  # Pass through any non-text columns as they are
)

# NOTE(review): MultinomialNB is a classifier, so every distinct price value
# is treated as a separate class label and the "predictions" are class labels.
# The R-squared below is therefore computed on predicted labels, not on a
# true regression output - interpret it with care.
hash_nb_pipe = make_pipeline(column_trans, MultinomialNB())

# Fit the pipeline on the training data
hash_nb_pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred_hash_nb = hash_nb_pipe.predict(X_test)

# Evaluate the model performance
r2_hash_nb = r2_score(y_test, y_pred_hash_nb)
print(f'R-squared score with Hashing and Naive Bayes on the test set: {r2_hash_nb:.2f}')


R-squared score with Hashing and Naive Bayes on the test set: 0.05


## Random Forest Algorithm

In [None]:
# Define the column transformer with HashingVectorizer
column_trans = make_column_transformer(
    (HashingVectorizer(), 'text_combined'),  # Hashing feature extraction on the combined text
    remainder='passthrough'  # Pass through any non-text columns as they are
)

# Create a pipeline with the column transformer and RandomForestRegressor.
# The forest is seeded (same seed as the train/test split) so that the
# bootstrap samples and feature sub-sampling - and hence the reported score -
# are reproducible across kernel restarts.
hash_rf_pipe = make_pipeline(column_trans, RandomForestRegressor(random_state=42))

# Fit the pipeline on the training data
hash_rf_pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred_hash_rf = hash_rf_pipe.predict(X_test)

# Evaluate the model performance
r2_hash_rf = r2_score(y_test, y_pred_hash_rf)
print(f'R-squared score with Hashing and RandomForestRegressor on the test set: {r2_hash_rf:.2f}')

## SVM Algorithm

In [None]:
# Define the column transformer with HashingVectorizer
column_trans = make_column_transformer(
    (HashingVectorizer(), 'text_combined'),  # Hashing feature extraction on the combined text
    remainder='passthrough'  # Pass through any non-text columns as they are
)

# Create a pipeline with the column transformer, standard scaler, and SVM regressor.
# The target is a price and the metric is R-squared, so the regressor (SVR) is
# used rather than the classifier (SVC), which would treat every distinct
# price as a class label. The variable names keep their original "svc"
# spelling so existing references still resolve.
hash_svc_pipe = make_pipeline(
    column_trans,
    # with_mean=False: scale each feature to unit variance WITHOUT centering,
    # because centering is not supported on the sparse hashed matrix.
    StandardScaler(with_mean=False),
    SVR()  # Support Vector Regression
)

# Fit the pipeline on the training data
hash_svc_pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred_hash_svc = hash_svc_pipe.predict(X_test)

# Evaluate the model performance
r2_hash_svc = r2_score(y_test, y_pred_hash_svc)
print(f'R-squared score with Hashing and SVR on the test set: {r2_hash_svc:.2f}')